Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Search engine #22070

Merged
merged 2 commits into from
Dec 29, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 9 additions & 21 deletions src/searchengine/nova3/helpers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#VERSION: 1.49
#VERSION: 1.50

# Author:
# Christophe DUMEZ (chris@qbittorrent.org)
Expand Down Expand Up @@ -29,12 +29,13 @@

import datetime
import gzip
import html.entities
import html
import io
import os
import re
import socket
import socks
import ssl
import sys
import tempfile
import urllib.error
Expand Down Expand Up @@ -72,29 +73,16 @@ def getBrowserUserAgent() -> str:
socket.socket = socks.socksocket # type: ignore[misc]


def htmlentitydecode(s: str) -> str:
# First convert alpha entities (such as &eacute;)
# (Inspired from http://mail.python.org/pipermail/python-list/2007-June/443813.html)
def entity2char(m: re.Match[str]) -> str:
entity = m.group(1)
if entity in html.entities.name2codepoint:
return chr(html.entities.name2codepoint[entity])
return " " # Unknown entity: We replace with a space.
t = re.sub('&(%s);' % '|'.join(html.entities.name2codepoint), entity2char, s)
# This is only provided for backward compatibility, new code should not use it
htmlentitydecode = html.unescape

# Then convert numerical entities (such as é)
t = re.sub(r'&#(\d+);', lambda x: chr(int(x.group(1))), t)

# Then convert hexa entities (such as &#xe9;)
return re.sub(r'&#x(\w+);', lambda x: chr(int(x.group(1), 16)), t)


def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None) -> str:
def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None, ssl_context: Optional[ssl.SSLContext] = None) -> str:
""" Return the content of the url page as a string """

request = urllib.request.Request(url, request_data, {**headers, **custom_headers})
try:
response = urllib.request.urlopen(request)
response = urllib.request.urlopen(request, context=ssl_context)
except urllib.error.URLError as errno:
print(f"Connection error: {errno.reason}", file=sys.stderr)
return ""
Expand All @@ -117,14 +105,14 @@ def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data:
return dataStr


def download_file(url: str, referer: Optional[str] = None) -> str:
def download_file(url: str, referer: Optional[str] = None, ssl_context: Optional[ssl.SSLContext] = None) -> str:
""" Download file at url and write it to a file, return the path to the file and the url """

# Download url
request = urllib.request.Request(url, headers=headers)
if referer is not None:
request.add_header('referer', referer)
response = urllib.request.urlopen(request)
response = urllib.request.urlopen(request, context=ssl_context)
data = response.read()

# Check if it is gzipped
Expand Down
Loading