From 90e457a671f161c7a56896b9be0d5ae996e7f0df Mon Sep 17 00:00:00 2001
From: Chocobo1
Date: Fri, 27 Dec 2024 01:56:33 +0800
Subject: [PATCH 1/2] Use built-in method for decoding HTML entities

---
 src/searchengine/nova3/helpers.py | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/src/searchengine/nova3/helpers.py b/src/searchengine/nova3/helpers.py
index ef8376a28d53..dfdfe234cbbc 100644
--- a/src/searchengine/nova3/helpers.py
+++ b/src/searchengine/nova3/helpers.py
@@ -1,4 +1,4 @@
-#VERSION: 1.49
+#VERSION: 1.50
 
 # Author:
 #  Christophe DUMEZ (chris@qbittorrent.org)
@@ -29,7 +29,7 @@
 
 import datetime
 import gzip
-import html.entities
+import html
 import io
 import os
 import re
@@ -72,21 +72,8 @@ def getBrowserUserAgent() -> str:
         socket.socket = socks.socksocket  # type: ignore[misc]
 
 
-def htmlentitydecode(s: str) -> str:
-    # First convert alpha entities (such as &eacute;)
-    # (Inspired from http://mail.python.org/pipermail/python-list/2007-June/443813.html)
-    def entity2char(m: re.Match[str]) -> str:
-        entity = m.group(1)
-        if entity in html.entities.name2codepoint:
-            return chr(html.entities.name2codepoint[entity])
-        return " "  # Unknown entity: We replace with a space.
-    t = re.sub('&(%s);' % '|'.join(html.entities.name2codepoint), entity2char, s)
-
-    # Then convert numerical entities (such as &#233;)
-    t = re.sub(r'&#(\d+);', lambda x: chr(int(x.group(1))), t)
-
-    # Then convert hexa entities (such as &#x00E9;)
-    return re.sub(r'&#x(\w+);', lambda x: chr(int(x.group(1), 16)), t)
+# This is only provided for backward compatibility, new code should not use it
+htmlentitydecode = html.unescape
 
 
 def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None) -> str:

From cc31a909316cce598027a2764a39a595bbf9d077 Mon Sep 17 00:00:00 2001
From: Chocobo1
Date: Fri, 27 Dec 2024 02:15:35 +0800
Subject: [PATCH 2/2] Provide SSL context field

This allows the caller to provide proper SSL parameters and avoid dirty
monkey patching to suppress SSL errors.
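For example, a search plugin could build its own ssl.SSLContext and pass it
through instead of patching the ssl module globally. A minimal usage sketch
(the plugin-side call below is hypothetical and only illustrates the new
parameter; disabling certificate verification is shown purely as the opt-out
case, not as a recommendation):

    import ssl

    from helpers import retrieve_url

    # Default, verified HTTPS behavior using the system trust store
    ctx = ssl.create_default_context()

    # Opt-out for a tracker with a self-signed certificate (illustrative only)
    # ctx.check_hostname = False
    # ctx.verify_mode = ssl.CERT_NONE

    page = retrieve_url("https://example.org/search?q=ubuntu", ssl_context=ctx)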
---
 src/searchengine/nova3/helpers.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/searchengine/nova3/helpers.py b/src/searchengine/nova3/helpers.py
index dfdfe234cbbc..abf201439ab6 100644
--- a/src/searchengine/nova3/helpers.py
+++ b/src/searchengine/nova3/helpers.py
@@ -35,6 +35,7 @@
 import re
 import socket
 import socks
+import ssl
 import sys
 import tempfile
 import urllib.error
@@ -76,12 +77,12 @@ def getBrowserUserAgent() -> str:
 htmlentitydecode = html.unescape
 
 
-def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None) -> str:
+def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None, ssl_context: Optional[ssl.SSLContext] = None) -> str:
     """ Return the content of the url page as a string """
 
     request = urllib.request.Request(url, request_data, {**headers, **custom_headers})
     try:
-        response = urllib.request.urlopen(request)
+        response = urllib.request.urlopen(request, context=ssl_context)
     except urllib.error.URLError as errno:
         print(f"Connection error: {errno.reason}", file=sys.stderr)
         return ""
@@ -104,14 +105,14 @@ def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data:
     return dataStr
 
 
-def download_file(url: str, referer: Optional[str] = None) -> str:
+def download_file(url: str, referer: Optional[str] = None, ssl_context: Optional[ssl.SSLContext] = None) -> str:
     """ Download file at url and write it to a file, return the path to the file and the url """
 
     # Download url
     request = urllib.request.Request(url, headers=headers)
     if referer is not None:
         request.add_header('referer', referer)
-    response = urllib.request.urlopen(request)
+    response = urllib.request.urlopen(request, context=ssl_context)
     data = response.read()
 
     # Check if it is gzipped