diff --git a/README.MD b/README.MD
index 266c4c3..69eb7fa 100644
--- a/README.MD
+++ b/README.MD
@@ -48,11 +48,12 @@
 page_name = "facebookai"
 posts_count = 10
 browser = "firefox"
 proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
-meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy)
+timeout = 600 #600 seconds
+meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy,timeout=timeout)
 ```
 
-Parameters for Facebook_scraper(page_name,posts_count,browser,proxy) class
+Parameters for Facebook_scraper(page_name,posts_count,browser,proxy,timeout) class
 
@@ -109,6 +110,18 @@
 string
 
 optional argument, if user wants to set proxy, if proxy requires authentication
 then the format will be user:password@IP:PORT
 
+
+
+
+
+
+timeout
+
+integer
+
+The maximum amount of time the bot should run for. If not passed, the default timeout is set to 10 minutes
+
+
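For readers skimming the diff, this is how the new argument fits together end to end. A minimal usage sketch, assuming the package is installed and exposes `Facebook_scraper` at the top level (as the README's own snippet does); `scrap_to_json` is the existing entry point shown further down in `scraper.py`:

```python
from facebook_page_scraper import Facebook_scraper

page_name = "facebookai"
posts_count = 10
browser = "firefox"
timeout = 600  # stop scrolling for new posts after 600 seconds

meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout)
json_data = meta_ai.scrap_to_json()  # scraped posts, serialized as JSON
print(json_data)
```

If fewer than `posts_count` posts are found, the scraper now returns whatever it collected once the timeout expires instead of looping indefinitely.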

diff --git a/changelog.MD b/changelog.MD
index a5b735a..405eead 100644
--- a/changelog.MD
+++ b/changelog.MD
@@ -1,5 +1,14 @@
 Changelog
 
+2.0.0
+
+Added
+- Timeout argument to set the maximum amount of time the bot should run in case no posts are found.
+Changes
+- Updated selenium from version 3.141.0 to 4.1.0
+Fixed
+- Fixed an issue where the browser kept scrolling back up despite the scroll-down method being called, caused by multiple different function calls
+
 0.1.10
 Added
 - Support for new Facebook Layout
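The selenium 3.141.0 to 4.1.0 bump noted above drives most of the mechanical churn in the two file diffs that follow: Selenium 4 deprecates the `find_element_by_*` / `find_elements_by_*` helpers (they are removed entirely in later 4.x releases), so every lookup is rewritten against the `By` API. A before/after sketch of the pattern, reusing a selector from the diff; the Firefox setup is an assumption, any driver works:

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Firefox()  # assumes geckodriver is on PATH
driver.get("https://www.facebook.com")
try:
    # Selenium 3 spelling, deprecated in 4.x:
    #   button = driver.find_element_by_css_selector("a.layerCancel")
    # Selenium 4 spelling used throughout this changeset:
    button = driver.find_element(By.CSS_SELECTOR, "a.layerCancel")
    button.click()
except NoSuchElementException:
    pass  # the popup may simply not be present
finally:
    driver.quit()
```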
diff --git a/facebook_page_scraper/driver_utilities.py b/facebook_page_scraper/driver_utilities.py
index e55a386..1091288 100644
--- a/facebook_page_scraper/driver_utilities.py
+++ b/facebook_page_scraper/driver_utilities.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+from fileinput import close
+
+
 try:
     from selenium.webdriver.support.ui import WebDriverWait
     from selenium.webdriver.support import expected_conditions as EC
@@ -28,7 +31,7 @@ def __close_error_popup(driver):
         than click on close button to skip that popup.'''
         try:
             WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,'a.layerCancel')))  #wait for popup to show
-            button = driver.find_element_by_css_selector("a.layerCancel")  #grab that popup's close button
+            button = driver.find_element(By.CSS_SELECTOR,"a.layerCancel")  #grab that popup's close button
             button.click()  #click "close" button
         except WebDriverException:
             #it is possible that even after waiting for given amount of time,modal may not appear
@@ -49,6 +52,19 @@ def __scroll_down_half(driver):
             Utilities.__close_driver(driver)
             print("error at scroll_down_half method : {}".format(ex))
 
+    @staticmethod
+    def __close_modern_layout_signup_modal(driver):
+        try:
+            driver.execute_script(
+                "window.scrollTo(0, document.body.scrollHeight);")
+            close_button = driver.find_element(By.CSS_SELECTOR,'[aria-label="Close"]')
+            close_button.click()
+        except NoSuchElementException:
+            pass
+        except Exception as ex:
+            print("error at close_modern_layout_signup_modal: {}".format(ex))
+
+
     @staticmethod
     def __scroll_down(driver,layout):
         """expects driver's instance as a argument, and it scrolls down page to the most bottom till the height"""
@@ -57,9 +73,13 @@ def __scroll_down(driver,layout):
             driver.execute_script(
                 "window.scrollTo(0, document.body.scrollHeight);")
         elif layout == "new":
-            body = driver.find_element_by_css_selector("body")
-            for _ in range(randint(2, 3)):
+            body = driver.find_element(By.CSS_SELECTOR,"body")
+            for _ in range(randint(5,6)):
+                body.send_keys(Keys.PAGE_UP)
+            for _ in range(randint(5, 8)):
                 body.send_keys(Keys.PAGE_DOWN)
+            #driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            #Utilities.__close_modern_layout_signup_modal(driver)
         except Exception as ex:
             #if any error occured than close the driver and exit
             Utilities.__close_driver(driver)
@@ -69,11 +89,11 @@ def __close_popup(driver):
         """expects driver's instance and closes modal that ask for login, by clicking "Not Now" button """
         try:
-            Utilities.__scroll_down_half(driver)  #try to scroll
+            #Utilities.__scroll_down_half(driver)  #try to scroll
             #wait for popup to show
             WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,'expanding_cta_close_button')))
             #grab "Not Now" button
-            popup_close_button = driver.find_element_by_id('expanding_cta_close_button')
+            popup_close_button = driver.find_element(By.ID,'expanding_cta_close_button')
             popup_close_button.click()  #click the button
         except WebDriverException:
             #modal may not popup, so no need to raise exception in case it is not found
@@ -91,7 +111,7 @@ def __wait_for_element_to_appear(driver,layout):
         try:
             if layout == "old":
                 #wait for page to load so posts are visible
-                body = driver.find_element_by_css_selector("body")
+                body = driver.find_element(By.CSS_SELECTOR,"body")
                 for _ in range(randint(3, 5)):
                     body.send_keys(Keys.PAGE_DOWN)
                 WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR,'.userContentWrapper')))
@@ -115,7 +135,7 @@ def __click_see_more(driver,content):
         """expects driver's instance and selenium element, click on "see more" link to open hidden content"""
         try:
             #find element and click 'see more' button
-            element = content.find_element_by_css_selector('span.see_more_link_inner')
+            element = content.find_element(By.CSS_SELECTOR,'span.see_more_link_inner')
             driver.execute_script("arguments[0].click();", element)  #click button using js
         except NoSuchElementException:
diff --git a/facebook_page_scraper/element_finder.py b/facebook_page_scraper/element_finder.py
index 1544d79..746c256 100644
--- a/facebook_page_scraper/element_finder.py
+++ b/facebook_page_scraper/element_finder.py
@@ -9,6 +9,7 @@
     from dateutil.parser import parse
     import dateutil
     import datetime
+    from selenium.webdriver.common.by import By
 except Exception as ex:
     print(ex)
@@ -59,14 +60,13 @@ def __find_status(post,layout):
         if layout == "old":
             #aim is to find element that looks like
             #after finding that element, get it's href value and pass it to different method that extracts post_id from that href
-            status_link = post.find_element_by_class_name("_5pcq").get_attribute("href")
+            status_link = post.find_element(By.CLASS_NAME,"_5pcq").get_attribute("href")
             #extract out post id from post's url
             status = Scraping_utilities._Scraping_utilities__extract_id_from_link(status_link)
         elif layout == "new":
-            links = post.find_elements_by_css_selector("a[role='link']")
-            link = Finder.__get_status_link(links)
+            #links = post.find_elements(By.CSS_SELECTOR,"a[role='link']")
+            link = post.find_element(By.CSS_SELECTOR,'.gpro0wi8.b1v8xokw')
             status_link = link.get_attribute('href')
-            print("Status Link: ",status_link)
             status = Scraping_utilities._Scraping_utilities__extract_id_from_link(
                 status_link)
     except NoSuchElementException:
@@ -85,10 +85,10 @@ def __find_share(post,layout):
     try:
         if layout == "old":
             #aim is to find element that have datatest-id attribute as UFI2SharesCount/root
-            shares = post.find_element_by_css_selector("[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
+            shares = post.find_element(By.CSS_SELECTOR,"[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
             shares = Scraping_utilities._Scraping_utilities__extract_numbers(shares)
         elif layout == "new":
-            elements = post.find_elements_by_css_selector("div.gtad4xkn")
+            elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
             shares = "0"
             for element in elements:
                 text = element.text
@@ -112,8 +112,7 @@ def __find_reactions(post):
     """finds all reaction of the facebook post using selenium's webdriver's method"""
     try:
         #find element that have attribute aria-label as 'See who reacted to this
-        reactions_all = post.find_element_by_css_selector(
-            '[aria-label="See who reacted to this"]')
+        reactions_all = post.find_element(By.CSS_SELECTOR,'[aria-label="See who reacted to this"]')
     except NoSuchElementException:
         reactions_all = ""
     except Exception as ex:
@@ -126,11 +125,11 @@ def __find_comments(post,layout):
     try:
         comments = ""
         if layout == "old":
-            comments = post.find_element_by_css_selector("a._3hg-").get_attribute('textContent')
+            comments = post.find_element(By.CSS_SELECTOR,"a._3hg-").get_attribute('textContent')
             #extract numbers from text
             comments = Scraping_utilities._Scraping_utilities__extract_numbers(comments)
         elif layout == "new":
-            elements = post.find_elements_by_css_selector("div.gtad4xkn")
+            elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
             comments = "0"
             for element in elements:
                 text = element.text
@@ -164,7 +163,7 @@ def __fetch_post_passage(href):
 
     @staticmethod
     def __element_exists(element,css_selector):
         try:
-            found = element.find_element_by_css_selector(css_selector)
+            found = element.find_element(By.CSS_SELECTOR,css_selector)
             return True
         except NoSuchElementException:
             return False
@@ -174,12 +173,12 @@ def __find_content(post,driver,layout):
     """finds content of the facebook post using selenium's webdriver's method and returns string containing text of the posts"""
     try:
         if layout == "old":
-            post_content = post.find_element_by_class_name('userContent')
+            post_content = post.find_element(By.CLASS_NAME,'userContent')
         elif layout == "new":
-            post_content = post.find_element_by_css_selector('[data-ad-preview="message"]')
+            post_content = post.find_element(By.CSS_SELECTOR,'[data-ad-preview="message"]')
         #if 'See more' or 'Continue reading' is present in post
         if Finder._Finder__element_exists(post_content,"span.text_exposed_link > a"):
-            element = post_content.find_element_by_css_selector("span.text_exposed_link > a")  #grab that element
+            element = post_content.find_element(By.CSS_SELECTOR,"span.text_exposed_link > a")  #grab that element
             #if element have already the onclick function, that means it is expandable paragraph
             if element.get_attribute("onclick"):
                 Utilities._Utilities__click_see_more(driver,post_content)  #click 'see more' button to get hidden text as well
@@ -209,7 +208,7 @@ def __find_posted_time(post,layout,link_element):
         #extract element that looks like
         #posted_time = post.find_element_by_css_selector("abbr._5ptz").get_attribute("data-utime")
         if layout == "old":
-            posted_time = post.find_element_by_tag_name("abbr").get_attribute('data-utime')
+            posted_time = post.find_element(By.TAG_NAME,"abbr").get_attribute('data-utime')
             return datetime.datetime.fromtimestamp(float(posted_time)).isoformat()
         elif layout == "new":
             aria_label_value = link_element.get_attribute("aria-label")
@@ -233,7 +232,7 @@ def __find_video_url(post,page_name,status):
     """finds video of the facebook post using selenium's webdriver's method"""
     try:
         #if video is found in the post, than create a video URL by concatenating post's id with page_name
-        video_element = post.find_element_by_tag_name("video")
+        video_element = post.find_element(By.TAG_NAME,"video")
         video = "https://www.facebook.com/{}/videos/{}".format(page_name,status)
     except NoSuchElementException:
@@ -250,7 +249,7 @@ def __find_image_url(post):
     """finds all image of the facebook post using selenium's webdriver's method"""
     try:
         #find all img tag that looks like
-        images = post.find_elements_by_css_selector("img.scaledImageFitWidth.img")
+        images = post.find_elements(By.CSS_SELECTOR,"img.scaledImageFitWidth.img")
         #extract src attribute from all the img tag,store it in list
         sources = [image.get_attribute("src") for image in images] if len(images) > 0 else []
     except NoSuchElementException:
@@ -268,10 +267,9 @@ def __find_all_posts(driver,layout):
     try:
         #find all posts that looks like
         if layout == "old":
-            all_posts = driver.find_elements_by_css_selector("div.userContentWrapper")
+            all_posts = driver.find_elements(By.CSS_SELECTOR,"div.userContentWrapper")
         elif layout == "new":
-            all_posts = driver.find_elements_by_css_selector(
-                '[aria-posinset]')
+            all_posts = driver.find_elements(By.CSS_SELECTOR,'[aria-posinset]')
         return all_posts
     except NoSuchElementException:
         print("Cannot find any posts! Exiting!")
@@ -288,9 +286,9 @@ def __find_name(driver,layout):
     """finds name of the facebook page using selenium's webdriver's method"""
     try:
         if layout == "old":
-            name = driver.find_element_by_css_selector('a._64-f').get_attribute('textContent')
+            name = driver.find_element(By.CSS_SELECTOR,'a._64-f').get_attribute('textContent')
         elif layout == "new":
-            name = driver.find_element_by_tag_name("strong").get_attribute("textContent")
+            name = driver.find_element(By.TAG_NAME,"strong").get_attribute("textContent")
         return name
     except Exception as ex:
         print("error at __find_name method : {}".format(ex))
@@ -298,7 +296,7 @@ def __find_name(driver,layout):
     @staticmethod
     def __detect_ui(driver):
         try:
-            driver.find_element_by_id("pagelet_bluebar")
+            driver.find_element(By.ID,"pagelet_bluebar")
             return "old"
         except NoSuchElementException:
             return "new"
@@ -311,10 +309,10 @@ def __detect_ui(driver):
     def __find_reaction(layout, reactions_all):
         try:
             if layout == "old":
-                return reactions_all.find_elements_by_tag_name(
+                return reactions_all.find_elements(By.TAG_NAME,
                     "a")
             elif layout == "new":
-                return reactions_all.find_elements_by_tag_name(
+                return reactions_all.find_elements(By.TAG_NAME,
                     "div")
         except Exception as ex:
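One idiom worth calling out, since it appears in almost every hunk above: cross-class calls like `Scraping_utilities._Scraping_utilities__extract_id_from_link(...)` are Python name mangling at work. Double-underscore attributes are rewritten to `_ClassName__attribute`, so these "private" static helpers are only reachable from outside their class under the mangled name. A self-contained illustration; the class and helper here are toy stand-ins, not the library's code:

```python
class Scraping_utilities:
    @staticmethod
    def __extract_id_from_link(link):
        # toy stand-in: treat the last path segment as the post id
        return link.rstrip("/").rsplit("/", 1)[-1]

# Inside the class the helper is reachable as __extract_id_from_link;
# outside, only the mangled name works:
link = "https://www.facebook.com/facebookai/posts/123456789"
print(Scraping_utilities._Scraping_utilities__extract_id_from_link(link))  # 123456789
```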
diff --git a/facebook_page_scraper/scraper.py b/facebook_page_scraper/scraper.py
index 4ec2e1a..a06bc18 100644
--- a/facebook_page_scraper/scraper.py
+++ b/facebook_page_scraper/scraper.py
@@ -8,6 +8,7 @@
     import json
     import csv
     import os
+    import time
 except Exception as ex:
     print(ex)
@@ -38,9 +39,8 @@ class Facebook_scraper:
     #on each iteration __close_after_retry is called to check if retry have turned to 0
     #if it returns true,it will break the loop. After coming out of loop,driver will be closed and it will return post whatever was found
-    retry = 10
 
-    def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None):
+    def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None,timeout=600):
         self.page_name = page_name
         self.posts_count = int(posts_count)
         #self.URL = "https://en-gb.facebook.com/pg/{}/posts".format(self.page_name)
@@ -49,21 +49,30 @@ def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None):
         self.__driver = ''
         self.proxy = proxy
         self.__layout = ''
+        self.timeout = timeout
 
     def __start_driver(self):
         """changes the class member __driver value to driver on call"""
         self.__driver = Initializer(self.browser,self.proxy).init()
 
-    def __handle_popup_old_layout(self,layout):
+    def __handle_popup(self,layout):
         #while scrolling, wait for login popup to show, it can be skipped by clicking "Not Now" button
         try:
-            Utilities._Utilities__close_popup(self.__driver)
-        except:
-            pass
+            if layout == "old":
+                #if during scrolling any of error or signup popup shows
+                Utilities._Utilities__close_error_popup(self.__driver)
+                Utilities._Utilities__close_popup(self.__driver)
+            elif layout == "new":
+                Utilities._Utilities__close_modern_layout_signup_modal(self.__driver)
+        except Exception as ex:
+            print(ex)
+
+    def __check_timeout(self,start_time,current_time):
+        return (current_time-start_time) > self.timeout
 
     def scrap_to_json(self):
         #call the __start_driver and override class member __driver to webdriver's instance
         self.__start_driver()
-
+        starting_time = time.time()
         #navigate to URL
         self.__driver.get(self.URL)
@@ -75,21 +84,18 @@ def scrap_to_json(self):
         Utilities._Utilities__wait_for_element_to_appear(self.__driver,self.__layout)
         #scroll down to bottom most
         Utilities._Utilities__scroll_down(self.__driver,self.__layout)
-        self.__handle_popup_old_layout(self.__layout)
+        self.__handle_popup(self.__layout)
 
         name = Finder._Finder__find_name(self.__driver,self.__layout)  #find name element
 
         while len(self.__data_dict) <= self.posts_count:
-
-            #if during scrolling any of error or signup popup shows
-            Utilities._Utilities__close_error_popup(self.__driver)
-            self.__handle_popup_old_layout(self.__layout)
+            self.__handle_popup(self.__layout)
             self.__find_elements(name)
-
-            if self.__close_after_retry() is True:
-                #keep a check if posts are available, if retry is 0, than it breaks loop
-                break
+            current_time = time.time()
+            if self.__check_timeout(starting_time,current_time) is True:
+                print("Timeout...")
+                break
             Utilities._Utilities__scroll_down(self.__driver, self.__layout)  #scroll down
         #print(len(self.__data_dict))
         #close the browser window after job is done.
@@ -163,7 +169,6 @@ def __find_elements(self,name):
         all_posts = Finder._Finder__find_all_posts(self.__driver,self.__layout)  #find all posts
         all_posts = self.__remove_duplicates(all_posts)  #remove duplicates from the list
-        self.__no_post_found(all_posts)  #after removing duplicates if length is 0, retry will decrease by 1
         #iterate over all the posts and find details from the same
         for post in all_posts:
             try:
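The behavioural change in this file: the scroll loop no longer gives up after a fixed retry count, it breaks once elapsed wall-clock time exceeds the budget. A condensed, runnable sketch of the check, mirroring `__check_timeout` above; the one-second budget is only for demonstration:

```python
import time

def check_timeout(start_time, current_time, timeout):
    # same comparison as Facebook_scraper.__check_timeout in the diff above
    return (current_time - start_time) > timeout

start = time.time()
time.sleep(1.5)  # stand-in for the scraping/scrolling work
print(check_timeout(start, time.time(), timeout=1))  # True: budget exceeded, loop would break
```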
diff --git a/requirements.txt b/requirements.txt
index 937252e..2c84015 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-selenium==3.141.0
+selenium==4.1.0
 webdriver-manager==3.2.2
 selenium-wire==4.3.1
 python-dateutil==2.8.2
\ No newline at end of file
diff --git a/setup.py b/setup.py
index c077f04..f3481c8 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 setuptools.setup(
     name = "facebook_page_scraper",
-    version = "0.1.10",
+    version = "2.0.0",
     author = "Sajid Shaikh",
     author_email = "shaikhsajid3732@gmail.com",
     description = "Python package to scrap facebook's pages front end with no limitations",