Skip to content

Commit

Permalink
standard-search: Fix parsing of description and divs with the results
Browse files Browse the repository at this point in the history
Google has introduced new formats for its results page (I encountered at
least 4 different page formats in my tests). This commit fixes the parsing
of the description field for those four formats. The parsing of other
attributes, such as cache_link, is still broken.

Closes abenassi#92
Closes abenassi#88
Closes abenassi#84
  • Loading branch information
kulla committed Feb 10, 2021
1 parent 546a59c commit 7095691
Showing 1 changed file with 27 additions and 14 deletions.
41 changes: 27 additions & 14 deletions googleapi/modules/standard_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self):
self.index = None # What index on this page it was on
self.number_of_results = None # The total number of results the query returned
self.is_pdf = None # This boolean is true if google thinks this result leads to a PDF file

def __repr__(self):
name = self._limit_str_size(self.name, 55)
description = self._limit_str_size(self.description, 49)
Expand Down Expand Up @@ -72,7 +72,17 @@ def search(query, pages=1, lang='en', area='com', ncr=False, void=True, time_per

if html:
soup = BeautifulSoup(html, "html.parser")

divs = soup.findAll("div", attrs={"class": "g"})
if len(divs) == 0:
divs = soup.select("#main > div")
if len(divs) == 1:
divs = soup.findAll("div", attrs={"data-hveid": "CAsQAA"})
else:
divs = divs[2:]

if len(divs) == 0:
divs = soup.select("body > div:nth-child(3) > div")[1:]

results_div = soup.find("div", attrs={"id": "resultStats"})
number_of_results = _get_number_of_results(results_div)
Expand All @@ -92,12 +102,13 @@ def search(query, pages=1, lang='en', area='com', ncr=False, void=True, time_per
res.cached = _get_cached(li)
res.number_of_results = number_of_results
res.is_pdf = _get_is_pdf(li)
if void is True:
if res.description is None:
continue

if void is True and (res.description is None or res.link is None):
continue

results.append(res)
j += 1

return results


Expand Down Expand Up @@ -188,15 +199,17 @@ def _get_description(li):
TODO: There are some text encoding problems to resolve."""

sdiv = li.find("div", attrs={"class": "IsZvec"})
if sdiv:
stspan = sdiv.find("span", attrs={"class": "aCOpRe"})
if stspan is not None:
# return stspan.text.encode("utf-8").strip()
return stspan.text.strip()
else:
return None
# "div.IsZvec span.aCOpRe" is an old selector which can be deleted
# once Google no longer uses it
description = li.select_one("div.BNeawe div.BNeawe")
if description == None:
description = li.select_one("div.IsZvec span.aCOpRe")
if description == None:
description = li.find("span", attrs={"class": "qXLe6d"})
if description == None:
description = li.find("div", attrs={"class": "I5aSse"})

return description.text.strip() if description != None else None

def _get_thumb():
"""Return the link to a thumbnail of the website."""
Expand All @@ -216,7 +229,7 @@ def _get_is_pdf(li):
"""Return if the link is marked by google as PDF"""
sdiv = li.find("span", attrs={"class": "ZGwO7 C0kchf NaCKVc"})
return True if sdiv else False

def _get_number_of_results(results_div):
"""Return the total number of results of the google search.
Note that the returned value will be the same for all the GoogleResult
Expand Down

0 comments on commit 7095691

Please sign in to comment.