Skip to content

Commit

Permalink
Merge pull request #16 from shaikhsajid1111/fixes
Browse files Browse the repository at this point in the history
Fixes for scrolling issue and Timeout
  • Loading branch information
shaikhsajid1111 authored Feb 5, 2022
2 parents fa23d56 + 66694bd commit 6f836f6
Show file tree
Hide file tree
Showing 7 changed files with 98 additions and 53 deletions.
17 changes: 15 additions & 2 deletions README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,12 @@ page_name = "facebookai"
posts_count = 10
browser = "firefox"
proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy)
timeout = 600 #600 seconds
meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy,timeout=timeout)

```

<h3> Parameters for <code>Facebook_scraper(page_name,posts_count,browser,proxy) </code> class </h3>
<h3> Parameters for <code>Facebook_scraper(page_name,posts_count,browser,proxy,timeout) </code> class </h3>
<table>
<th>
<tr>
Expand Down Expand Up @@ -109,6 +110,18 @@ string
optional argument, if user wants to set proxy, if proxy requires authentication then the format will be <code> user:password@IP:PORT </code>
</td>
</tr>
<tr>
<td>
timeout
</td>
<td>
integer
</td>
<td>
The maximum amount of time, in seconds, that the bot should run for. If not passed, the default timeout is set to 600 seconds (10 minutes)
</td>
</tr>

</table>
<br>
Expand Down
9 changes: 9 additions & 0 deletions changelog.MD
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
<h1> Changelog </h1>
<section>
<h2> 2.0.0 </h2>
<h3>Added</h3>
<li>Timeout argument to set the maximum amount of time the bot should run in case no posts are found.</li>
<h3>Changes</h3>
<li>Updated selenium from version <code>3.141.0</code> to <code>4.1.0</code> </li>
<h3>Fixed</h3>
<li>Fixed issue where the browser kept scrolling back up despite the scroll-down method being called, which was caused by multiple different function calls.</li>
<br>
<section>
<h2> 0.1.10 </h2>
<h3>Added</h3>
<li>Support for new Facebook Layout</li>
Expand Down
34 changes: 27 additions & 7 deletions facebook_page_scraper/driver_utilities.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from fileinput import close


try:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Expand Down Expand Up @@ -28,7 +31,7 @@ def __close_error_popup(driver):
than click on close button to skip that popup.'''
try:
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,'a.layerCancel'))) #wait for popup to show
button = driver.find_element_by_css_selector("a.layerCancel") #grab that popup's close button
button = driver.find_element(By.CSS_SELECTOR,"a.layerCancel") #grab that popup's close button
button.click() #click "close" button
except WebDriverException:
#it is possible that even after waiting for given amount of time,modal may not appear
Expand All @@ -49,6 +52,19 @@ def __scroll_down_half(driver):
Utilities.__close_driver(driver)
print("error at scroll_down_half method : {}".format(ex))

@staticmethod
def __close_modern_layout_signup_modal(driver):
    """Dismiss the sign-up modal that Facebook's modern layout overlays on the page.

    Scrolls to the bottom of the document (which is what triggers the modal to
    appear), then clicks its close ("X") button. If the modal never appeared,
    the missing-element case is silently ignored; any other failure is logged.
    """
    try:
        # Scrolling to the bottom provokes the sign-up modal into showing.
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        dismiss_button = driver.find_element(By.CSS_SELECTOR, '[aria-label="Close"]')
        dismiss_button.click()
    except NoSuchElementException:
        # Modal was not present — nothing to dismiss.
        pass
    except Exception as ex:
        print("error at close_modern_layout_signup_modal: {}".format(ex))


@staticmethod
def __scroll_down(driver,layout):
"""expects driver's instance as a argument, and it scrolls down page to the most bottom till the height"""
Expand All @@ -57,9 +73,13 @@ def __scroll_down(driver,layout):
driver.execute_script(
"window.scrollTo(0, document.body.scrollHeight);")
elif layout == "new":
body = driver.find_element_by_css_selector("body")
for _ in range(randint(2, 3)):
body = driver.find_element(By.CSS_SELECTOR,"body")
for _ in range(randint(5,6)):
body.send_keys(Keys.PAGE_UP)
for _ in range(randint(5, 8)):
body.send_keys(Keys.PAGE_DOWN)
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#Utilities.__close_modern_layout_signup_modal(driver)
except Exception as ex:
#if any error occured than close the driver and exit
Utilities.__close_driver(driver)
Expand All @@ -69,11 +89,11 @@ def __scroll_down(driver,layout):
def __close_popup(driver):
"""expects driver's instance and closes modal that ask for login, by clicking "Not Now" button """
try:
Utilities.__scroll_down_half(driver) #try to scroll
#Utilities.__scroll_down_half(driver) #try to scroll
#wait for popup to show
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,'expanding_cta_close_button')))
#grab "Not Now" button
popup_close_button = driver.find_element_by_id('expanding_cta_close_button')
popup_close_button = driver.find_element(By.ID,'expanding_cta_close_button')
popup_close_button.click() #click the button
except WebDriverException:
#modal may not popup, so no need to raise exception in case it is not found
Expand All @@ -91,7 +111,7 @@ def __wait_for_element_to_appear(driver,layout):
try:
if layout == "old":
#wait for page to load so posts are visible
body = driver.find_element_by_css_selector("body")
body = driver.find_element(By.CSS_SELECTOR,"body")
for _ in range(randint(3, 5)):
body.send_keys(Keys.PAGE_DOWN)
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR,'.userContentWrapper')))
Expand All @@ -115,7 +135,7 @@ def __click_see_more(driver,content):
"""expects driver's instance and selenium element, click on "see more" link to open hidden content"""
try:
#find element and click 'see more' button
element = content.find_element_by_css_selector('span.see_more_link_inner')
element = content.find_element(By.CSS_SELECTOR,'span.see_more_link_inner')
driver.execute_script("arguments[0].click();", element) #click button using js

except NoSuchElementException:
Expand Down
48 changes: 23 additions & 25 deletions facebook_page_scraper/element_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from dateutil.parser import parse
import dateutil
import datetime
from selenium.webdriver.common.by import By
except Exception as ex:
print(ex)

Expand Down Expand Up @@ -59,14 +60,13 @@ def __find_status(post,layout):
if layout == "old":
#aim is to find element that looks like <a href="URL" class="_5pcq"></a>
#after finding that element, get it's href value and pass it to different method that extracts post_id from that href
status_link = post.find_element_by_class_name("_5pcq").get_attribute("href")
status_link = post.find_element(By.CLASS_NAME,"_5pcq").get_attribute("href")
#extract out post id from post's url
status = Scraping_utilities._Scraping_utilities__extract_id_from_link(status_link)
elif layout == "new":
links = post.find_elements_by_css_selector("a[role='link']")
link = Finder.__get_status_link(links)
#links = post.find_elements(By.CSS_SELECTOR,"a[role='link']")
link = post.find_element(By.CSS_SELECTOR,'.gpro0wi8.b1v8xokw')
status_link = link.get_attribute('href')
print("Status Link: ",status_link)
status = Scraping_utilities._Scraping_utilities__extract_id_from_link(
status_link)
except NoSuchElementException:
Expand All @@ -85,10 +85,10 @@ def __find_share(post,layout):
try:
if layout == "old":
#aim is to find element that have datatest-id attribute as UFI2SharesCount/root
shares = post.find_element_by_css_selector("[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
shares = post.find_element(By.CSS_SELECTOR,"[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
shares = Scraping_utilities._Scraping_utilities__extract_numbers(shares)
elif layout == "new":
elements = post.find_elements_by_css_selector("div.gtad4xkn")
elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
shares = "0"
for element in elements:
text = element.text
Expand All @@ -112,8 +112,7 @@ def __find_reactions(post):
"""finds all reaction of the facebook post using selenium's webdriver's method"""
try:
#find element that have attribute aria-label as 'See who reacted to this
reactions_all = post.find_element_by_css_selector(
'[aria-label="See who reacted to this"]')
reactions_all = post.find_element(By.CSS_SELECTOR,'[aria-label="See who reacted to this"]')
except NoSuchElementException:
reactions_all = ""
except Exception as ex:
Expand All @@ -126,11 +125,11 @@ def __find_comments(post,layout):
try:
comments = ""
if layout == "old":
comments = post.find_element_by_css_selector("a._3hg-").get_attribute('textContent')
comments = post.find_element(By.CSS_SELECTOR,"a._3hg-").get_attribute('textContent')
#extract numbers from text
comments = Scraping_utilities._Scraping_utilities__extract_numbers(comments)
elif layout == "new":
elements = post.find_elements_by_css_selector("div.gtad4xkn")
elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
comments = "0"
for element in elements:
text = element.text
Expand Down Expand Up @@ -164,7 +163,7 @@ def __fetch_post_passage(href):
@staticmethod
def __element_exists(element,css_selector):
try:
found = element.find_element_by_css_selector(css_selector)
found = element.find_element(By.CSS_SELECTOR,css_selector)
return True
except NoSuchElementException:
return False
Expand All @@ -174,12 +173,12 @@ def __find_content(post,driver,layout):
"""finds content of the facebook post using selenium's webdriver's method and returns string containing text of the posts"""
try:
if layout == "old":
post_content = post.find_element_by_class_name('userContent')
post_content = post.find_element(By.CLASS_NAME,'userContent')
elif layout == "new":
post_content = post.find_element_by_css_selector('[data-ad-preview="message"]')
post_content = post.find_element(By.CSS_SELECTOR,'[data-ad-preview="message"]')
#if 'See more' or 'Continue reading' is present in post
if Finder._Finder__element_exists(post_content,"span.text_exposed_link > a"):
element = post_content.find_element_by_css_selector("span.text_exposed_link > a") #grab that element
element = post_content.find_element(By.CSS_SELECTOR,"span.text_exposed_link > a") #grab that element
#if element have already the onclick function, that means it is expandable paragraph
if element.get_attribute("onclick"):
Utilities._Utilities__click_see_more(driver,post_content) #click 'see more' button to get hidden text as well
Expand Down Expand Up @@ -209,7 +208,7 @@ def __find_posted_time(post,layout,link_element):
#extract element that looks like <abbr class='_5ptz' data-utime="some unix timestamp"> </abbr>
#posted_time = post.find_element_by_css_selector("abbr._5ptz").get_attribute("data-utime")
if layout == "old":
posted_time = post.find_element_by_tag_name("abbr").get_attribute('data-utime')
posted_time = post.find_element(By.TAG_NAME,"abbr").get_attribute('data-utime')
return datetime.datetime.fromtimestamp(float(posted_time)).isoformat()
elif layout == "new":
aria_label_value = link_element.get_attribute("aria-label")
Expand All @@ -233,7 +232,7 @@ def __find_video_url(post,page_name,status):
"""finds video of the facebook post using selenium's webdriver's method"""
try:
#if video is found in the post, than create a video URL by concatenating post's id with page_name
video_element = post.find_element_by_tag_name("video")
video_element = post.find_element(By.TAG_NAME,"video")
video = "https://www.facebook.com/{}/videos/{}".format(page_name,status)

except NoSuchElementException:
Expand All @@ -250,7 +249,7 @@ def __find_image_url(post):
"""finds all image of the facebook post using selenium's webdriver's method"""
try:
#find all img tag that looks like <img class="scaledImageFitWidth img" src="">
images = post.find_elements_by_css_selector("img.scaledImageFitWidth.img")
images = post.find_elements(By.CSS_SELECTOR,"img.scaledImageFitWidth.img")
#extract src attribute from all the img tag,store it in list
sources = [image.get_attribute("src") for image in images] if len(images) > 0 else []
except NoSuchElementException:
Expand All @@ -268,10 +267,9 @@ def __find_all_posts(driver,layout):
try:
#find all posts that looks like <div class="userContentWrapper"> </div>
if layout == "old":
all_posts = driver.find_elements_by_css_selector("div.userContentWrapper")
all_posts = driver.find_elements(By.CSS_SELECTOR,"div.userContentWrapper")
elif layout == "new":
all_posts = driver.find_elements_by_css_selector(
'[aria-posinset]')
all_posts = driver.find_elements(By.CSS_SELECTOR,'[aria-posinset]')
return all_posts
except NoSuchElementException:
print("Cannot find any posts! Exiting!")
Expand All @@ -288,17 +286,17 @@ def __find_name(driver,layout):
"""finds name of the facebook page using selenium's webdriver's method"""
try:
if layout == "old":
name = driver.find_element_by_css_selector('a._64-f').get_attribute('textContent')
name = driver.find_element(By.CSS_SELECTOR,'a._64-f').get_attribute('textContent')
elif layout == "new":
name = driver.find_element_by_tag_name("strong").get_attribute("textContent")
name = driver.find_element(By.TAG_NAME,"strong").get_attribute("textContent")
return name
except Exception as ex:
print("error at __find_name method : {}".format(ex))

@staticmethod
def __detect_ui(driver):
try:
driver.find_element_by_id("pagelet_bluebar")
driver.find_element(By.ID,"pagelet_bluebar")
return "old"
except NoSuchElementException:
return "new"
Expand All @@ -311,10 +309,10 @@ def __detect_ui(driver):
def __find_reaction(layout, reactions_all):
try:
if layout == "old":
return reactions_all.find_elements_by_tag_name(
return reactions_all.find_elements(By.TAG_NAME,
"a")
elif layout == "new":
return reactions_all.find_elements_by_tag_name(
return reactions_all.find_elements(By.TAG_NAME,
"div")

except Exception as ex:
Expand Down
39 changes: 22 additions & 17 deletions facebook_page_scraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import json
import csv
import os
import time

except Exception as ex:
print(ex)
Expand Down Expand Up @@ -38,9 +39,8 @@ class Facebook_scraper:
#on each iteration __close_after_retry is called to check if retry have turned to 0
# if it returns true,it will break the loop. After coming out of loop,driver will be closed and it will return post whatever was found

retry = 10

def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None):
def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None,timeout=600):
self.page_name = page_name
self.posts_count = int(posts_count)
#self.URL = "https://en-gb.facebook.com/pg/{}/posts".format(self.page_name)
Expand All @@ -49,21 +49,30 @@ def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None):
self.__driver = ''
self.proxy = proxy
self.__layout = ''
self.timeout = timeout

def __start_driver(self):
    """Create a fresh webdriver instance and bind it to self.__driver.

    Delegates browser/proxy configuration to the project's Initializer helper.
    """
    initializer = Initializer(self.browser, self.proxy)
    self.__driver = initializer.init()
def __handle_popup_old_layout(self,layout):
def __handle_popup(self,layout):
#while scrolling, wait for login popup to show, it can be skipped by clicking "Not Now" button
try:
Utilities._Utilities__close_popup(self.__driver)
except:
pass
if layout == "old":
#if during scrolling any of error or signup popup shows
Utilities._Utilities__close_error_popup(self.__driver)
Utilities._Utilities__close_popup(self.__driver)
elif layout == "new":
Utilities._Utilities__close_modern_layout_signup_modal(self.__driver)
except Exception as ex:
print(ex)

def __check_timeout(self,start_time,current_time):
return (current_time-start_time) > self.timeout

def scrap_to_json(self):
#call the __start_driver and override class member __driver to webdriver's instance
self.__start_driver()

starting_time = time.time()
#navigate to URL
self.__driver.get(self.URL)

Expand All @@ -75,21 +84,18 @@ def scrap_to_json(self):
Utilities._Utilities__wait_for_element_to_appear(self.__driver,self.__layout)
#scroll down to bottom most
Utilities._Utilities__scroll_down(self.__driver,self.__layout)
self.__handle_popup_old_layout(self.__layout)
self.__handle_popup(self.__layout)


name = Finder._Finder__find_name(self.__driver,self.__layout) #find name element

while len(self.__data_dict) <= self.posts_count:

#if during scrolling any of error or signup popup shows
Utilities._Utilities__close_error_popup(self.__driver)
self.__handle_popup_old_layout(self.__layout)
self.__handle_popup(self.__layout)
self.__find_elements(name)

if self.__close_after_retry() is True:
#keep a check if posts are available, if retry is 0, than it breaks loop
break
current_time = time.time()
if self.__check_timeout(starting_time,current_time) is True:
print("Timeout...")
break
Utilities._Utilities__scroll_down(self.__driver, self.__layout) #scroll down
#print(len(self.__data_dict))
#close the browser window after job is done.
Expand Down Expand Up @@ -163,7 +169,6 @@ def __find_elements(self,name):
all_posts = Finder._Finder__find_all_posts(self.__driver,self.__layout) #find all posts
all_posts = self.__remove_duplicates(all_posts) #remove duplicates from the list

self.__no_post_found(all_posts) #after removing duplicates if length is 0, retry will decrease by 1
#iterate over all the posts and find details from the same
for post in all_posts:
try:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
selenium==3.141.0
selenium==4.1.0
webdriver-manager==3.2.2
selenium-wire==4.3.1
python-dateutil==2.8.2
Loading

0 comments on commit 6f836f6

Please sign in to comment.