diff --git a/googleapi/modules/standard_search.py b/googleapi/modules/standard_search.py index 7862940..d100bd3 100644 --- a/googleapi/modules/standard_search.py +++ b/googleapi/modules/standard_search.py @@ -28,7 +28,7 @@ def __init__(self): self.index = None # What index on this page it was on self.number_of_results = None # The total number of results the query returned self.is_pdf = None # This boolean is true if google thinks this result leads to a PDF file - + def __repr__(self): name = self._limit_str_size(self.name, 55) description = self._limit_str_size(self.description, 49) @@ -72,7 +72,17 @@ def search(query, pages=1, lang='en', area='com', ncr=False, void=True, time_per if html: soup = BeautifulSoup(html, "html.parser") + divs = soup.findAll("div", attrs={"class": "g"}) + if len(divs) == 0: + divs = soup.select("#main > div") + if len(divs) == 1: + divs = soup.findAll("div", attrs={"data-hveid": "CAsQAA"}) + else: + divs = divs[2:] + + if len(divs) == 0: + divs = soup.select("body > div:nth-child(3) > div")[1:] results_div = soup.find("div", attrs={"id": "resultStats"}) number_of_results = _get_number_of_results(results_div) @@ -92,12 +102,13 @@ def search(query, pages=1, lang='en', area='com', ncr=False, void=True, time_per res.cached = _get_cached(li) res.number_of_results = number_of_results res.is_pdf = _get_is_pdf(li) - - if void is True: - if res.description is None: - continue + + if void is True and (res.description is None or res.link is None): + continue + results.append(res) j += 1 + return results @@ -188,15 +199,17 @@ def _get_description(li): TODO: There are some text encoding problems to resolve.""" - sdiv = li.find("div", attrs={"class": "IsZvec"}) - if sdiv: - stspan = sdiv.find("span", attrs={"class": "aCOpRe"}) - if stspan is not None: - # return stspan.text.encode("utf-8").strip() - return stspan.text.strip() - else: - return None + # div.IsZvec span.aCOpRE is an old selection which can be deleted + # after Google is not using it any more + description = li.select_one("div.BNeawe div.BNeawe") + if description == None: + description = li.select_one("div.IsZvec span.aCOpRe") + if description == None: + description = li.find("span", attrs={"class": "qXLe6d"}) + if description == None: + description = li.find("div", attrs={"class": "I5aSse"}) + return description.text.strip() if description != None else None def _get_thumb(): """Return the link to a thumbnail of the website.""" @@ -216,7 +229,7 @@ def _get_is_pdf(li): """Return if the link is marked by google as PDF""" sdiv = li.find("span", attrs={"class": "ZGwO7 C0kchf NaCKVc"}) return True if sdiv else False - + def _get_number_of_results(results_div): """Return the total number of results of the google search. Note that the returned value will be the same for all the GoogleResult