Skip to content

Commit

Permalink
Use built-in method for decoding HTML entities
Browse files Browse the repository at this point in the history
  • Loading branch information
Chocobo1 committed Dec 26, 2024
1 parent 7487cd7 commit 90e457a
Showing 1 changed file with 4 additions and 17 deletions.
21 changes: 4 additions & 17 deletions src/searchengine/nova3/helpers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#VERSION: 1.49
#VERSION: 1.50

# Author:
# Christophe DUMEZ (chris@qbittorrent.org)
Expand Down Expand Up @@ -29,7 +29,7 @@

import datetime
import gzip
import html.entities
import html
import io
import os
import re
Expand Down Expand Up @@ -72,21 +72,8 @@ def getBrowserUserAgent() -> str:
socket.socket = socks.socksocket # type: ignore[misc]


def htmlentitydecode(s: str) -> str:
    """Return *s* with its HTML character references decoded.

    Three forms are recognized, each requiring the terminating ``;``:

    * named entities listed in ``html.entities.name2codepoint`` (``&eacute;``)
    * decimal references (``&#233;``)
    * hexadecimal references (``&#xe9;`` or ``&#XE9;``)

    The string is scanned exactly once, so text produced by decoding one
    reference is never decoded again: ``&amp;#233;`` yields ``&#233;``.
    Unknown entity names, and numeric references that do not denote a valid
    code point, are left in the output unchanged.
    """
    def reference2char(m: re.Match[str]) -> str:
        name, decimal, hexadecimal = m.groups()
        if name is not None:
            # The pattern only matches names present in the table
            return chr(html.entities.name2codepoint[name])
        try:
            if decimal is not None:
                return chr(int(decimal))
            return chr(int(hexadecimal, 16))
        except (ValueError, OverflowError):
            # Code point outside the Unicode range: keep the reference as is
            return m.group(0)

    # One alternation for all three forms, so that a single pass suffices
    # (Inspired from http://mail.python.org/pipermail/python-list/2007-June/443813.html)
    pattern = r'&(?:(%s)|#(\d+)|#[xX]([0-9a-fA-F]+));' % '|'.join(html.entities.name2codepoint)
    return re.sub(pattern, reference2char, s)
# Backward-compatibility alias: `htmlentitydecode` is bound directly to the
# standard library's `html.unescape`. It is only provided so that existing
# callers keep working; new code should not use it.
htmlentitydecode = html.unescape


def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None) -> str:
Expand Down

0 comments on commit 90e457a

Please sign in to comment.