Skip to content

Commit

Permalink
[fix] engine: duckduckgo - CAPTCHA detection
Browse files Browse the repository at this point in the history
The previous implementation could not distinguish a CAPTCHA response from an
ordinary result list: a CAPTCHA page was simply treated as a result list that
contains no items.

DDG does not block IPs.  Instead, it places a CAPTCHA wall in front of any
request it considers dubious.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
  • Loading branch information
return42 committed Oct 19, 2024
1 parent 88caa1d commit 0504513
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 5 deletions.
11 changes: 11 additions & 0 deletions searx/engines/duckduckgo.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from searx import redisdb
from searx.enginelib.traits import EngineTraits
from searx.utils import extr
from searx.exceptions import SearxEngineCaptchaException

if TYPE_CHECKING:
import logging
Expand Down Expand Up @@ -292,13 +293,23 @@ def request(query, params):
return params


def detect_ddg_captcha(dom):
    """Raise a CAPTCHA exception if ``dom`` contains DDG's challenge form.

    In case of a CAPTCHA, DDG opens its own *not a Robot* dialog instead of
    redirecting to a dedicated CAPTCHA page, so the challenge has to be
    detected by inspecting the returned document.

    :param dom: parsed HTML document of the DDG response
    :raises SearxEngineCaptchaException: when the XPath query for the
        ``challenge-form`` form yields a truthy result
    """
    challenge_form = eval_xpath(dom, "//form[@id='challenge-form']")
    if not challenge_form:
        # no challenge form found --> nothing to report
        return

    # a suspend time of zero is OK --> DDG does not block the IP
    raise SearxEngineCaptchaException(suspended_time=0)


def response(resp):

if resp.status_code == 303:
return []

results = []
doc = lxml.html.fromstring(resp.text)
detect_ddg_captcha(doc)

result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')

Expand Down
12 changes: 7 additions & 5 deletions searx/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Exception types raised by SearXNG modules.
"""
from __future__ import annotations

from typing import Optional, Union

Expand Down Expand Up @@ -61,7 +62,7 @@ class SearxEngineAccessDeniedException(SearxEngineResponseException):
"""This settings contains the default suspended time (default 86400 sec / 1
day)."""

def __init__(self, suspended_time: int = None, message: str = 'Access denied'):
def __init__(self, suspended_time: int | None = None, message: str = 'Access denied'):
"""Generic exception to raise when an engine denies access to the results.
:param suspended_time: How long the engine is going to be suspended in
Expand All @@ -70,12 +71,13 @@ def __init__(self, suspended_time: int = None, message: str = 'Access denied'):
:param message: Internal message. Defaults to ``Access denied``
:type message: str
"""
suspended_time = suspended_time or self._get_default_suspended_time()
if suspended_time is None:
suspended_time = self._get_default_suspended_time()
super().__init__(message + ', suspended_time=' + str(suspended_time))
self.suspended_time = suspended_time
self.message = message

def _get_default_suspended_time(self):
def _get_default_suspended_time(self) -> int:
from searx import get_setting # pylint: disable=C0415

return get_setting(self.SUSPEND_TIME_SETTING)
Expand All @@ -88,7 +90,7 @@ class SearxEngineCaptchaException(SearxEngineAccessDeniedException):
"""This settings contains the default suspended time (default 86400 sec / 1
day)."""

def __init__(self, suspended_time=None, message='CAPTCHA'):
def __init__(self, suspended_time: int | None = None, message='CAPTCHA'):
super().__init__(message=message, suspended_time=suspended_time)


Expand All @@ -102,7 +104,7 @@ class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException):
"""This settings contains the default suspended time (default 3660 sec / 1
hour)."""

def __init__(self, suspended_time=None, message='Too many request'):
def __init__(self, suspended_time: int | None = None, message='Too many request'):
super().__init__(message=message, suspended_time=suspended_time)


Expand Down

0 comments on commit 0504513

Please sign in to comment.