From 21bfb4996e3fa4042e7ae018587d50ff2281f583 Mon Sep 17 00:00:00 2001 From: Markus Date: Sun, 1 Sep 2024 18:53:56 +0200 Subject: [PATCH] [fix] engine yahoo: HTML tags are included in result titles - https://github.com/searxng/searxng/issues/3790 Signed-off-by: Markus Heiser --- searx/engines/yahoo.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index 305cf523dcd..8dba443c799 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -16,6 +16,7 @@ eval_xpath_getindex, eval_xpath_list, extract_text, + html_to_text, ) from searx.enginelib.traits import EngineTraits @@ -133,12 +134,20 @@ def response(resp): url = parse_url(url) title = eval_xpath_getindex(result, './/h3//a/@aria-label', 0, default='') - title = extract_text(title) + title: str = extract_text(title) content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='') - content = extract_text(content, allow_none=True) + content: str = extract_text(content, allow_none=True) # append result - results.append({'url': url, 'title': title, 'content': content}) + results.append( + { + 'url': url, + # title sometimes contains HTML tags / see + # https://github.com/searxng/searxng/issues/3790 + 'title': " ".join(html_to_text(title).strip().split()), + 'content': " ".join(html_to_text(content).strip().split()), + } + ) for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a'): # append suggestion