Skip to content

Commit

Permalink
V4.1.0 (#160)
Browse files Browse the repository at this point in the history
1. random browser in each class instance (headers are set automatically),
2. CLI: file download debugging (removed chunks, added timeout),
3. removed sleep between requests,
4. removed VQDExtractionException,
5. BUGFIX: AsyncDDGS - used AsyncSession instead of Session,
6. BUGFIX: AsyncDDGS on Windows - changed asyncio.set_event_loop_policy,
7. logging: improved debug messages.
  • Loading branch information
deedy5 authored Dec 13, 2023
1 parent 1d4d73a commit 256c0fd
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 79 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ with DDGS(proxies="socks5://user:password@geo.iproyal.com:32325", timeout=20) as

Exceptions:
- `DuckDuckGoSearchException`: Raised when there is a generic exception during the API request.
- `VQDExtractionException`: Raised when there is an error extracting the VQD value for a search query.

[Go To TOP](#TOP)

Expand Down
15 changes: 6 additions & 9 deletions duckduckgo_search/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
import os
from concurrent.futures import as_completed, ThreadPoolExecutor
from datetime import datetime
from random import choice
from urllib.parse import unquote

import click
from curl_cffi import requests

from .duckduckgo_search import DDGS, USERAGENTS
from .duckduckgo_search import DDGS
from .utils import _random_browser
from .version import __version__

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -79,14 +79,11 @@ def sanitize_keywords(keywords):


def download_file(url, dir_path, filename, proxy):
headers = {"User-Agent": choice(USERAGENTS)}
try:
with requests.Session(headers=headers, proxies=proxy, impersonate="chrome110") as session:
resp = session.get(url, stream=True)
resp.raise_for_status()
with open(os.path.join(dir_path, filename[:200]), "wb") as file:
for chunk in resp.iter_content():
file.write(chunk)
resp = requests.get(url, proxies=proxy, impersonate=_random_browser(), timeout=10)
resp.raise_for_status()
with open(os.path.join(dir_path, filename[:200]), "wb") as file:
file.write(resp.content)
except Exception as ex:
logger.debug(f"download_file url={url} {type(ex).__name__} {ex}")

Expand Down
30 changes: 8 additions & 22 deletions duckduckgo_search/duckduckgo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,16 @@
from datetime import datetime, timezone
from decimal import Decimal
from itertools import cycle
from random import choice
from time import sleep
from typing import Deque, Dict, Iterator, Optional, Set, Tuple

from lxml import html
from curl_cffi import requests

from .exceptions import DuckDuckGoSearchException
from .models import MapsResult
from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json
from .utils import _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _random_browser, _text_extract_json

logger = logging.getLogger(__name__)
logger = logging.getLogger("duckduckgo_search.DDGS")


class DDGS:
Expand All @@ -27,13 +25,11 @@ class DDGS:
"""

def __init__(self, headers=None, proxies=None, timeout=10) -> None:
if headers is None:
headers = HEADERS
headers["User-Agent"] = choice(USERAGENTS)
self.proxies = proxies if proxies and isinstance(proxies, dict) else {"http": proxies, "https": proxies}
self._session = requests.Session(
headers=headers, proxies=self.proxies, timeout=timeout, http_version=2, impersonate="chrome110"
headers=headers, proxies=self.proxies, timeout=timeout, impersonate=_random_browser()
)
self._session.headers["Referer"] = "https://duckduckgo.com/"

def __enter__(self) -> "DDGS":
return self
Expand All @@ -44,6 +40,7 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
def _get_url(self, method: str, url: str, **kwargs) -> Optional[requests.Response]:
try:
resp = self._session.request(method, url, **kwargs)
logger.debug(f"_get_url() {url} {resp.status_code} {resp.http_version} {resp.elapsed} {len(resp.content)}")
resp.raise_for_status()
if _is_500_in_url(str(resp.url)) or resp.status_code == 202:
raise
Expand All @@ -58,11 +55,6 @@ def _get_vqd(self, keywords: str) -> Optional[str]:
if resp:
return _extract_vqd(resp.content, keywords)

def _sleep(self) -> None:
"""Sleep between API requests if proxies is None."""
if self.proxies is None:
sleep(0.75)

def text(
self,
keywords: str,
Expand Down Expand Up @@ -151,7 +143,7 @@ def _text_api(
if resp is None:
return

page_data = _text_extract_json(resp.content)
page_data = _text_extract_json(resp.content, keywords)
if page_data is None:
return

Expand All @@ -173,7 +165,6 @@ def _text_api(
if max_results is None or result_exists is False or next_page_url is None:
return
payload["s"] = next_page_url.split("s=")[1].split("&")[0]
self._sleep()

def _text_html(
self,
Expand Down Expand Up @@ -248,7 +239,6 @@ def _text_html(
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
payload = {n: v for n, v in zip(names, values)}
self._sleep()

def _text_lite(
self,
Expand All @@ -271,6 +261,7 @@ def _text_lite(
"""
assert keywords, "keywords is mandatory"

self._session.headers["Referer"] = "https://lite.duckduckgo.com/"
payload = {
"q": keywords,
"s": "0",
Expand Down Expand Up @@ -323,7 +314,6 @@ def _text_lite(
return
payload["s"] = next_page_s[0]
payload["vqd"] = _extract_vqd(resp.content, keywords)
self._sleep()

def images(
self,
Expand Down Expand Up @@ -417,7 +407,6 @@ def images(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
self._sleep()

def videos(
self,
Expand Down Expand Up @@ -492,7 +481,6 @@ def videos(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
self._sleep()

def news(
self,
Expand Down Expand Up @@ -566,7 +554,6 @@ def news(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
self._sleep()

def answers(self, keywords: str) -> Iterator[Dict[str, Optional[str]]]:
"""DuckDuckGo instant answers. Query params: https://duckduckgo.com/params
Expand Down Expand Up @@ -753,7 +740,7 @@ def maps(
lat_b -= Decimal(radius) * Decimal(0.008983)
lon_l -= Decimal(radius) * Decimal(0.008983)
lon_r += Decimal(radius) * Decimal(0.008983)
logging.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")
logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")

# create a queue of search squares (bboxes)
work_bboxes: Deque[Tuple[Decimal, Decimal, Decimal, Decimal]] = deque()
Expand Down Expand Up @@ -818,7 +805,6 @@ def maps(
bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])
self._sleep()

def translate(
self, keywords: str, from_: Optional[str] = None, to: str = "en"
Expand Down
37 changes: 14 additions & 23 deletions duckduckgo_search/duckduckgo_search_async.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
import asyncio
import logging
import sys
from collections import deque
from datetime import datetime, timezone
from decimal import Decimal
from itertools import cycle
from random import choice
from typing import AsyncIterator, Deque, Dict, Optional, Set, Tuple

from lxml import html
from curl_cffi import requests

from .exceptions import DuckDuckGoSearchException
from .models import MapsResult
from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json
from .utils import _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _random_browser, _text_extract_json

logger = logging.getLogger(__name__)
logger = logging.getLogger("duckduckgo_search.AsyncDDGS")
# Not working on Windows, NotImplementedError (https://curl-cffi.readthedocs.io/en/latest/faq/)
if sys.platform.lower().startswith("win"):
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())


class AsyncDDGS:
Expand All @@ -27,13 +30,11 @@ class AsyncDDGS:
"""

def __init__(self, headers=None, proxies=None, timeout=10) -> None:
if headers is None:
headers = HEADERS
headers["User-Agent"] = choice(USERAGENTS)
self.proxies = proxies if proxies and isinstance(proxies, dict) else {"http": proxies, "https": proxies}
self._session = requests.Session(
headers=headers, proxies=self.proxies, timeout=timeout, http_version=2, impersonate="chrome110"
self._session = requests.AsyncSession(
headers=headers, proxies=self.proxies, timeout=timeout, impersonate=_random_browser()
)
self._session.headers["Referer"] = "https://duckduckgo.com/"

async def __aenter__(self) -> "AsyncDDGS":
return self
Expand All @@ -43,7 +44,8 @@ async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:

async def _get_url(self, method: str, url: str, **kwargs) -> Optional[requests.Response]:
try:
resp = self._session.request(method, url, **kwargs)
resp = await self._session.request(method, url, **kwargs)
logger.debug(f"_get_url() {url} {resp.status_code} {resp.http_version} {resp.elapsed} {len(resp.content)}")
resp.raise_for_status()
if _is_500_in_url(str(resp.url)) or resp.status_code == 202:
raise
Expand All @@ -58,11 +60,6 @@ async def _get_vqd(self, keywords: str) -> Optional[str]:
if resp:
return _extract_vqd(resp.content, keywords)

async def _sleep(self) -> None:
"""Sleep between API requests if proxies is None."""
if self.proxies is None:
await asyncio.sleep(0.75)

async def text(
self,
keywords: str,
Expand Down Expand Up @@ -152,7 +149,7 @@ async def _text_api(
if resp is None:
return

page_data = _text_extract_json(resp.content)
page_data = _text_extract_json(resp.content, keywords)
if page_data is None:
return

Expand All @@ -174,7 +171,6 @@ async def _text_api(
if max_results is None or result_exists is False or next_page_url is None:
return
payload["s"] = next_page_url.split("s=")[1].split("&")[0]
await self._sleep()

async def _text_html(
self,
Expand Down Expand Up @@ -248,7 +244,6 @@ async def _text_html(
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
payload = {n: v for n, v in zip(names, values)}
await self._sleep()

async def _text_lite(
self,
Expand All @@ -271,6 +266,7 @@ async def _text_lite(
"""
assert keywords, "keywords is mandatory"

self._session.headers["Referer"] = "https://lite.duckduckgo.com/"
payload = {
"q": keywords,
"s": "0",
Expand Down Expand Up @@ -323,7 +319,6 @@ async def _text_lite(
return
payload["s"] = next_page_s[0]
payload["vqd"] = _extract_vqd(resp.content, keywords)
await self._sleep()

async def images(
self,
Expand Down Expand Up @@ -417,7 +412,6 @@ async def images(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
await self._sleep()

async def videos(
self,
Expand Down Expand Up @@ -492,7 +486,6 @@ async def videos(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
await self._sleep()

async def news(
self,
Expand Down Expand Up @@ -566,7 +559,6 @@ async def news(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
await self._sleep()

async def answers(self, keywords: str) -> AsyncIterator[Dict[str, Optional[str]]]:
"""DuckDuckGo instant answers. Query params: https://duckduckgo.com/params
Expand Down Expand Up @@ -753,7 +745,7 @@ async def maps(
lat_b -= Decimal(radius) * Decimal(0.008983)
lon_l -= Decimal(radius) * Decimal(0.008983)
lon_r += Decimal(radius) * Decimal(0.008983)
logging.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")
logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")

# create a queue of search squares (bboxes)
work_bboxes: Deque[Tuple[Decimal, Decimal, Decimal, Decimal]] = deque()
Expand Down Expand Up @@ -818,7 +810,6 @@ async def maps(
bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])
await self._sleep()

async def translate(
self, keywords: str, from_: Optional[str] = None, to: str = "en"
Expand Down
4 changes: 0 additions & 4 deletions duckduckgo_search/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,2 @@
class DuckDuckGoSearchException(Exception):
"""Base exception class for duckduckgo_search."""


class VQDExtractionException(DuckDuckGoSearchException):
"""Exception raised for error in extract vqd."""
Loading

0 comments on commit 256c0fd

Please sign in to comment.