Skip to content

Commit

Permalink
V4.1.0 (#160)
Browse files Browse the repository at this point in the history
1. random browser in each class instance (headers are set automatically),
2. CLI: file download debugging (removed chunks, added timeout),
3. removed sleep between requests,
4. removed VQDExtractionException,
5. BUGFIX: AsyncDDGS - used AsyncSession instead of Session,
6. BUGFIX: AsyncDDGS on Windows - changed asyncio.set_event_loop_policy,
7. logging: improved debug messages.
  • Loading branch information
deedy5 authored Dec 13, 2023
1 parent 1d4d73a commit 256c0fd
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 79 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ with DDGS(proxies="socks5://user:password@geo.iproyal.com:32325", timeout=20) as

Exceptions:
- `DuckDuckGoSearchException`: Raised when there is a generic exception during the API request.
- `VQDExtractionException`: Raised when there is an error extracting the VQD value for a search query.

[Go To TOP](#TOP)

Expand Down
15 changes: 6 additions & 9 deletions duckduckgo_search/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
import os
from concurrent.futures import as_completed, ThreadPoolExecutor
from datetime import datetime
from random import choice
from urllib.parse import unquote

import click
from curl_cffi import requests

from .duckduckgo_search import DDGS, USERAGENTS
from .duckduckgo_search import DDGS
from .utils import _random_browser
from .version import __version__

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -79,14 +79,11 @@ def sanitize_keywords(keywords):


def download_file(url, dir_path, filename, proxy):
headers = {"User-Agent": choice(USERAGENTS)}
try:
with requests.Session(headers=headers, proxies=proxy, impersonate="chrome110") as session:
resp = session.get(url, stream=True)
resp.raise_for_status()
with open(os.path.join(dir_path, filename[:200]), "wb") as file:
for chunk in resp.iter_content():
file.write(chunk)
resp = requests.get(url, proxies=proxy, impersonate=_random_browser(), timeout=10)
resp.raise_for_status()
with open(os.path.join(dir_path, filename[:200]), "wb") as file:
file.write(resp.content)
except Exception as ex:
logger.debug(f"download_file url={url} {type(ex).__name__} {ex}")

Expand Down
30 changes: 8 additions & 22 deletions duckduckgo_search/duckduckgo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,16 @@
from datetime import datetime, timezone
from decimal import Decimal
from itertools import cycle
from random import choice
from time import sleep
from typing import Deque, Dict, Iterator, Optional, Set, Tuple

from lxml import html
from curl_cffi import requests

from .exceptions import DuckDuckGoSearchException
from .models import MapsResult
from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json
from .utils import _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _random_browser, _text_extract_json

logger = logging.getLogger(__name__)
logger = logging.getLogger("duckduckgo_search.DDGS")


class DDGS:
Expand All @@ -27,13 +25,11 @@ class DDGS:
"""

def __init__(self, headers=None, proxies=None, timeout=10) -> None:
if headers is None:
headers = HEADERS
headers["User-Agent"] = choice(USERAGENTS)
self.proxies = proxies if proxies and isinstance(proxies, dict) else {"http": proxies, "https": proxies}
self._session = requests.Session(
headers=headers, proxies=self.proxies, timeout=timeout, http_version=2, impersonate="chrome110"
headers=headers, proxies=self.proxies, timeout=timeout, impersonate=_random_browser()
)
self._session.headers["Referer"] = "https://duckduckgo.com/"

def __enter__(self) -> "DDGS":
return self
Expand All @@ -44,6 +40,7 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
def _get_url(self, method: str, url: str, **kwargs) -> Optional[requests.Response]:
try:
resp = self._session.request(method, url, **kwargs)
logger.debug(f"_get_url() {url} {resp.status_code} {resp.http_version} {resp.elapsed} {len(resp.content)}")
resp.raise_for_status()
if _is_500_in_url(str(resp.url)) or resp.status_code == 202:
raise
Expand All @@ -58,11 +55,6 @@ def _get_vqd(self, keywords: str) -> Optional[str]:
if resp:
return _extract_vqd(resp.content, keywords)

def _sleep(self) -> None:
"""Sleep between API requests if proxies is None."""
if self.proxies is None:
sleep(0.75)

def text(
self,
keywords: str,
Expand Down Expand Up @@ -151,7 +143,7 @@ def _text_api(
if resp is None:
return

page_data = _text_extract_json(resp.content)
page_data = _text_extract_json(resp.content, keywords)
if page_data is None:
return

Expand All @@ -173,7 +165,6 @@ def _text_api(
if max_results is None or result_exists is False or next_page_url is None:
return
payload["s"] = next_page_url.split("s=")[1].split("&")[0]
self._sleep()

def _text_html(
self,
Expand Down Expand Up @@ -248,7 +239,6 @@ def _text_html(
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
payload = {n: v for n, v in zip(names, values)}
self._sleep()

def _text_lite(
self,
Expand All @@ -271,6 +261,7 @@ def _text_lite(
"""
assert keywords, "keywords is mandatory"

self._session.headers["Referer"] = "https://lite.duckduckgo.com/"
payload = {
"q": keywords,
"s": "0",
Expand Down Expand Up @@ -323,7 +314,6 @@ def _text_lite(
return
payload["s"] = next_page_s[0]
payload["vqd"] = _extract_vqd(resp.content, keywords)
self._sleep()

def images(
self,
Expand Down Expand Up @@ -417,7 +407,6 @@ def images(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
self._sleep()

def videos(
self,
Expand Down Expand Up @@ -492,7 +481,6 @@ def videos(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
self._sleep()

def news(
self,
Expand Down Expand Up @@ -566,7 +554,6 @@ def news(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
self._sleep()

def answers(self, keywords: str) -> Iterator[Dict[str, Optional[str]]]:
"""DuckDuckGo instant answers. Query params: https://duckduckgo.com/params
Expand Down Expand Up @@ -753,7 +740,7 @@ def maps(
lat_b -= Decimal(radius) * Decimal(0.008983)
lon_l -= Decimal(radius) * Decimal(0.008983)
lon_r += Decimal(radius) * Decimal(0.008983)
logging.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")
logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")

# create a queue of search squares (bboxes)
work_bboxes: Deque[Tuple[Decimal, Decimal, Decimal, Decimal]] = deque()
Expand Down Expand Up @@ -818,7 +805,6 @@ def maps(
bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])
self._sleep()

def translate(
self, keywords: str, from_: Optional[str] = None, to: str = "en"
Expand Down
37 changes: 14 additions & 23 deletions duckduckgo_search/duckduckgo_search_async.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
import asyncio
import logging
import sys
from collections import deque
from datetime import datetime, timezone
from decimal import Decimal
from itertools import cycle
from random import choice
from typing import AsyncIterator, Deque, Dict, Optional, Set, Tuple

from lxml import html
from curl_cffi import requests

from .exceptions import DuckDuckGoSearchException
from .models import MapsResult
from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json
from .utils import _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _random_browser, _text_extract_json

logger = logging.getLogger(__name__)
logger = logging.getLogger("duckduckgo_search.AsyncDDGS")
# Not working on Windows, NotImplementedError (https://curl-cffi.readthedocs.io/en/latest/faq/)
if sys.platform.lower().startswith("win"):
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())


class AsyncDDGS:
Expand All @@ -27,13 +30,11 @@ class AsyncDDGS:
"""

def __init__(self, headers=None, proxies=None, timeout=10) -> None:
if headers is None:
headers = HEADERS
headers["User-Agent"] = choice(USERAGENTS)
self.proxies = proxies if proxies and isinstance(proxies, dict) else {"http": proxies, "https": proxies}
self._session = requests.Session(
headers=headers, proxies=self.proxies, timeout=timeout, http_version=2, impersonate="chrome110"
self._session = requests.AsyncSession(
headers=headers, proxies=self.proxies, timeout=timeout, impersonate=_random_browser()
)
self._session.headers["Referer"] = "https://duckduckgo.com/"

async def __aenter__(self) -> "AsyncDDGS":
return self
Expand All @@ -43,7 +44,8 @@ async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:

async def _get_url(self, method: str, url: str, **kwargs) -> Optional[requests.Response]:
try:
resp = self._session.request(method, url, **kwargs)
resp = await self._session.request(method, url, **kwargs)
logger.debug(f"_get_url() {url} {resp.status_code} {resp.http_version} {resp.elapsed} {len(resp.content)}")
resp.raise_for_status()
if _is_500_in_url(str(resp.url)) or resp.status_code == 202:
raise
Expand All @@ -58,11 +60,6 @@ async def _get_vqd(self, keywords: str) -> Optional[str]:
if resp:
return _extract_vqd(resp.content, keywords)

async def _sleep(self) -> None:
"""Sleep between API requests if proxies is None."""
if self.proxies is None:
await asyncio.sleep(0.75)

async def text(
self,
keywords: str,
Expand Down Expand Up @@ -152,7 +149,7 @@ async def _text_api(
if resp is None:
return

page_data = _text_extract_json(resp.content)
page_data = _text_extract_json(resp.content, keywords)
if page_data is None:
return

Expand All @@ -174,7 +171,6 @@ async def _text_api(
if max_results is None or result_exists is False or next_page_url is None:
return
payload["s"] = next_page_url.split("s=")[1].split("&")[0]
await self._sleep()

async def _text_html(
self,
Expand Down Expand Up @@ -248,7 +244,6 @@ async def _text_html(
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
payload = {n: v for n, v in zip(names, values)}
await self._sleep()

async def _text_lite(
self,
Expand All @@ -271,6 +266,7 @@ async def _text_lite(
"""
assert keywords, "keywords is mandatory"

self._session.headers["Referer"] = "https://lite.duckduckgo.com/"
payload = {
"q": keywords,
"s": "0",
Expand Down Expand Up @@ -323,7 +319,6 @@ async def _text_lite(
return
payload["s"] = next_page_s[0]
payload["vqd"] = _extract_vqd(resp.content, keywords)
await self._sleep()

async def images(
self,
Expand Down Expand Up @@ -417,7 +412,6 @@ async def images(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
await self._sleep()

async def videos(
self,
Expand Down Expand Up @@ -492,7 +486,6 @@ async def videos(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
await self._sleep()

async def news(
self,
Expand Down Expand Up @@ -566,7 +559,6 @@ async def news(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
await self._sleep()

async def answers(self, keywords: str) -> AsyncIterator[Dict[str, Optional[str]]]:
"""DuckDuckGo instant answers. Query params: https://duckduckgo.com/params
Expand Down Expand Up @@ -753,7 +745,7 @@ async def maps(
lat_b -= Decimal(radius) * Decimal(0.008983)
lon_l -= Decimal(radius) * Decimal(0.008983)
lon_r += Decimal(radius) * Decimal(0.008983)
logging.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")
logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")

# create a queue of search squares (bboxes)
work_bboxes: Deque[Tuple[Decimal, Decimal, Decimal, Decimal]] = deque()
Expand Down Expand Up @@ -818,7 +810,6 @@ async def maps(
bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])
await self._sleep()

async def translate(
self, keywords: str, from_: Optional[str] = None, to: str = "en"
Expand Down
4 changes: 0 additions & 4 deletions duckduckgo_search/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,2 @@
class DuckDuckGoSearchException(Exception):
"""Base exception class for duckduckgo_search."""


class VQDExtractionException(DuckDuckGoSearchException):
"""Exception raised for error in extract vqd."""
Loading

0 comments on commit 256c0fd

Please sign in to comment.