standard-search: Fix parsing of description and divs with the results

Google has introduced new formats for their result page (I got at least 4 different page formats in my tests). This commit fixes the parsing for those four different formats for the description field. The parsing of other attributes like cache_link is still broken. Closes abenassi#92 Closes abenassi#88 Closes abenassi#84
kulla · Feb 10, 2021 · 7095691 · 7095691
1 parent 546a59c
commit 7095691
Showing 1 changed file with 27 additions and 14 deletions.
diff --git a/googleapi/modules/standard_search.py b/googleapi/modules/standard_search.py
@@ -28,7 +28,7 @@ def __init__(self):
         self.index = None  # What index on this page it was on
         self.number_of_results = None # The total number of results the query returned
         self.is_pdf = None # This boolean is true if google thinks this result leads to a PDF file
-        
+
     def __repr__(self):
         name = self._limit_str_size(self.name, 55)
         description = self._limit_str_size(self.description, 49)
@@ -72,7 +72,17 @@ def search(query, pages=1, lang='en', area='com', ncr=False, void=True, time_per
 
         if html:
             soup = BeautifulSoup(html, "html.parser")
+
             divs = soup.findAll("div", attrs={"class": "g"})
+            if len(divs) == 0:
+                divs = soup.select("#main > div")
+                if len(divs) == 1:
+                    divs = soup.findAll("div", attrs={"data-hveid": "CAsQAA"})
+                else:
+                    divs = divs[2:]
+
+            if len(divs) == 0:
+                divs = soup.select("body > div:nth-child(3) > div")[1:]
 
             results_div = soup.find("div", attrs={"id": "resultStats"})
             number_of_results = _get_number_of_results(results_div)
@@ -92,12 +102,13 @@ def search(query, pages=1, lang='en', area='com', ncr=False, void=True, time_per
                 res.cached = _get_cached(li)
                 res.number_of_results = number_of_results
                 res.is_pdf = _get_is_pdf(li)
-                
-                if void is True:
-                    if res.description is None:
-                        continue
+
+                if void is True and (res.description is None or res.link is None):
+                    continue
+
                 results.append(res)
                 j += 1
+
     return results
 
 
@@ -188,15 +199,17 @@ def _get_description(li):
 
     TODO: There are some text encoding problems to resolve."""
 
-    sdiv = li.find("div", attrs={"class": "IsZvec"})
-    if sdiv:
-        stspan = sdiv.find("span", attrs={"class": "aCOpRe"})
-        if stspan is not None:
-            # return stspan.text.encode("utf-8").strip()
-            return stspan.text.strip()
-    else:
-        return None
+    # div.IsZvec span.aCOpRE is an old selection which can be deleted
+    # after Google is not using it any more
+    description = li.select_one("div.BNeawe div.BNeawe")
+    if description == None:
+        description = li.select_one("div.IsZvec span.aCOpRe")
+    if description == None:
+        description = li.find("span", attrs={"class": "qXLe6d"})
+    if description == None:
+        description = li.find("div", attrs={"class": "I5aSse"})
 
+    return description.text.strip() if description != None else None
 
 def _get_thumb():
     """Return the link to a thumbnail of the website."""
@@ -216,7 +229,7 @@ def _get_is_pdf(li):
     """Return if the link is marked by google as PDF"""
     sdiv = li.find("span", attrs={"class": "ZGwO7 C0kchf NaCKVc"})
     return True if sdiv else False
-    
+
 def _get_number_of_results(results_div):
     """Return the total number of results of the google search.
     Note that the returned value will be the same for all the GoogleResult