diff --git a/README.MD b/README.MD
index 266c4c3..69eb7fa 100644
--- a/README.MD
+++ b/README.MD
@@ -48,11 +48,12 @@ page_name = "facebookai"
posts_count = 10
browser = "firefox"
proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
-meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy)
+timeout = 600 #600 seconds
+meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy,timeout=timeout)
```
-Parameters for Facebook_scraper(page_name,posts_count,browser,proxy) class
+Parameters for Facebook_scraper(page_name,posts_count,browser,proxy,timeout) class
|
@@ -109,6 +110,18 @@ string
optional argument; if the user wants to set a proxy and the proxy requires authentication, the format is user:password@IP:PORT
+
+
+timeout | integer | The maximum amount of time (in seconds) the bot should run. If not passed, the default timeout is set to 600 seconds (10 minutes).
+
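
A minimal usage sketch of the new `timeout` parameter (page name and values are illustrative; the import path follows the package name). If the timeout elapses mid-scrape, the scraper stops scrolling and returns whatever posts it has collected so far:

```python
from facebook_page_scraper import Facebook_scraper

# illustrative values; timeout is given in seconds
scraper = Facebook_scraper("facebookai", posts_count=10,
                           browser="firefox", timeout=300)
json_data = scraper.scrap_to_json()  # returns whatever was collected if the timeout fires first
print(json_data)
```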
diff --git a/changelog.MD b/changelog.MD
index a5b735a..405eead 100644
--- a/changelog.MD
+++ b/changelog.MD
@@ -1,5 +1,14 @@
Changelog
+ 2.0.0
+Added
+Timeout argument to set the maximum amount of time the bot should run, in case no posts are found.
+Changes
+Updated selenium from version 3.141.0 to 4.1.0
+Fixed
+Fixed issue where the browser kept scrolling back up despite the scroll-down method being called, caused by multiple different function calls
+
+
0.1.10
Added
Support for new Facebook Layout
diff --git a/facebook_page_scraper/driver_utilities.py b/facebook_page_scraper/driver_utilities.py
index e55a386..1091288 100644
--- a/facebook_page_scraper/driver_utilities.py
+++ b/facebook_page_scraper/driver_utilities.py
@@ -28,7 +31,7 @@ def __close_error_popup(driver):
then click on close button to skip that popup.'''
try:
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,'a.layerCancel'))) #wait for popup to show
- button = driver.find_element_by_css_selector("a.layerCancel") #grab that popup's close button
+ button = driver.find_element(By.CSS_SELECTOR,"a.layerCancel") #grab that popup's close button
button.click() #click "close" button
except WebDriverException:
#it is possible that even after waiting for given amount of time,modal may not appear
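
The locator changes that run through the rest of this patch follow Selenium 4's unified API: the `find_element_by_*` helpers are deprecated in 4.x (and removed in later 4.x releases) in favour of `find_element(By..., ...)`. A before/after sketch, assuming an existing WebDriver instance named `driver`:

```python
from selenium.webdriver.common.by import By

# Selenium 3 style, deprecated in 4.x:
#   button = driver.find_element_by_css_selector("a.layerCancel")
# Selenium 4 style, as used throughout this patch:
button = driver.find_element(By.CSS_SELECTOR, "a.layerCancel")
button.click()
```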
@@ -49,6 +52,19 @@ def __scroll_down_half(driver):
Utilities.__close_driver(driver)
print("error at scroll_down_half method : {}".format(ex))
+ @staticmethod
+ def __close_modern_layout_signup_modal(driver):
+ try:
+ driver.execute_script(
+ "window.scrollTo(0, document.body.scrollHeight);")
+ close_button = driver.find_element(By.CSS_SELECTOR,'[aria-label="Close"]')
+ close_button.click()
+ except NoSuchElementException:
+ pass
+ except Exception as ex:
+ print("error at close_modern_layout_signup_modal: {}".format(ex))
+
+
@staticmethod
def __scroll_down(driver,layout):
"""expects driver's instance as a argument, and it scrolls down page to the most bottom till the height"""
@@ -57,9 +73,13 @@ def __scroll_down(driver,layout):
driver.execute_script(
"window.scrollTo(0, document.body.scrollHeight);")
elif layout == "new":
- body = driver.find_element_by_css_selector("body")
- for _ in range(randint(2, 3)):
+ body = driver.find_element(By.CSS_SELECTOR,"body")
+ for _ in range(randint(5,6)):
+ body.send_keys(Keys.PAGE_UP)
+ for _ in range(randint(5, 8)):
body.send_keys(Keys.PAGE_DOWN)
+ #driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+ #Utilities.__close_modern_layout_signup_modal(driver)
except Exception as ex:
#if any error occurred, then close the driver and exit
Utilities.__close_driver(driver)
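
These added PAGE_UP presses are the scrolling fix the changelog describes: on the new layout, sending only PAGE_DOWN could leave the feed stuck near the top. A standalone sketch of the keyboard-scroll pattern, assuming an existing `driver` sitting on a page feed:

```python
from random import randint
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

body = driver.find_element(By.CSS_SELECTOR, "body")
for _ in range(randint(5, 6)):       # page up first so the feed settles...
    body.send_keys(Keys.PAGE_UP)
for _ in range(randint(5, 8)):       # ...then page down to load older posts
    body.send_keys(Keys.PAGE_DOWN)
```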
@@ -69,11 +89,11 @@ def __scroll_down(driver,layout):
def __close_popup(driver):
"""expects driver's instance and closes modal that ask for login, by clicking "Not Now" button """
try:
- Utilities.__scroll_down_half(driver) #try to scroll
+ #Utilities.__scroll_down_half(driver) #try to scroll
#wait for popup to show
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,'expanding_cta_close_button')))
#grab "Not Now" button
- popup_close_button = driver.find_element_by_id('expanding_cta_close_button')
+ popup_close_button = driver.find_element(By.ID,'expanding_cta_close_button')
popup_close_button.click() #click the button
except WebDriverException:
#modal may not popup, so no need to raise exception in case it is not found
@@ -91,7 +111,7 @@ def __wait_for_element_to_appear(driver,layout):
try:
if layout == "old":
#wait for page to load so posts are visible
- body = driver.find_element_by_css_selector("body")
+ body = driver.find_element(By.CSS_SELECTOR,"body")
for _ in range(randint(3, 5)):
body.send_keys(Keys.PAGE_DOWN)
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR,'.userContentWrapper')))
@@ -115,7 +135,7 @@ def __click_see_more(driver,content):
"""expects driver's instance and selenium element, click on "see more" link to open hidden content"""
try:
#find element and click 'see more' button
- element = content.find_element_by_css_selector('span.see_more_link_inner')
+ element = content.find_element(By.CSS_SELECTOR,'span.see_more_link_inner')
driver.execute_script("arguments[0].click();", element) #click button using js
except NoSuchElementException:
diff --git a/facebook_page_scraper/element_finder.py b/facebook_page_scraper/element_finder.py
index 1544d79..746c256 100644
--- a/facebook_page_scraper/element_finder.py
+++ b/facebook_page_scraper/element_finder.py
@@ -9,6 +9,7 @@
from dateutil.parser import parse
import dateutil
import datetime
+ from selenium.webdriver.common.by import By
except Exception as ex:
print(ex)
@@ -59,14 +60,13 @@ def __find_status(post,layout):
if layout == "old":
#aim is to find element that looks like
#after finding that element, get its href value and pass it to a different method that extracts post_id from that href
- status_link = post.find_element_by_class_name("_5pcq").get_attribute("href")
+ status_link = post.find_element(By.CLASS_NAME,"_5pcq").get_attribute("href")
#extract out post id from post's url
status = Scraping_utilities._Scraping_utilities__extract_id_from_link(status_link)
elif layout == "new":
- links = post.find_elements_by_css_selector("a[role='link']")
- link = Finder.__get_status_link(links)
+ #links = post.find_elements(By.CSS_SELECTOR,"a[role='link']")
+ link = post.find_element(By.CSS_SELECTOR,'.gpro0wi8.b1v8xokw')
status_link = link.get_attribute('href')
- print("Status Link: ",status_link)
status = Scraping_utilities._Scraping_utilities__extract_id_from_link(
status_link)
except NoSuchElementException:
@@ -85,10 +85,10 @@ def __find_share(post,layout):
try:
if layout == "old":
#aim is to find the element that has the data-testid attribute UFI2SharesCount/root
- shares = post.find_element_by_css_selector("[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
+ shares = post.find_element(By.CSS_SELECTOR,"[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
shares = Scraping_utilities._Scraping_utilities__extract_numbers(shares)
elif layout == "new":
- elements = post.find_elements_by_css_selector("div.gtad4xkn")
+ elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
shares = "0"
for element in elements:
text = element.text
@@ -112,8 +112,7 @@ def __find_reactions(post):
"""finds all reaction of the facebook post using selenium's webdriver's method"""
try:
#find element that has attribute aria-label as 'See who reacted to this'
- reactions_all = post.find_element_by_css_selector(
- '[aria-label="See who reacted to this"]')
+ reactions_all = post.find_element(By.CSS_SELECTOR,'[aria-label="See who reacted to this"]')
except NoSuchElementException:
reactions_all = ""
except Exception as ex:
@@ -126,11 +125,11 @@ def __find_comments(post,layout):
try:
comments = ""
if layout == "old":
- comments = post.find_element_by_css_selector("a._3hg-").get_attribute('textContent')
+ comments = post.find_element(By.CSS_SELECTOR,"a._3hg-").get_attribute('textContent')
#extract numbers from text
comments = Scraping_utilities._Scraping_utilities__extract_numbers(comments)
elif layout == "new":
- elements = post.find_elements_by_css_selector("div.gtad4xkn")
+ elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
comments = "0"
for element in elements:
text = element.text
@@ -164,7 +163,7 @@ def __fetch_post_passage(href):
@staticmethod
def __element_exists(element,css_selector):
try:
- found = element.find_element_by_css_selector(css_selector)
+ found = element.find_element(By.CSS_SELECTOR,css_selector)
return True
except NoSuchElementException:
return False
@@ -174,12 +173,12 @@ def __find_content(post,driver,layout):
"""finds content of the facebook post using selenium's webdriver's method and returns string containing text of the posts"""
try:
if layout == "old":
- post_content = post.find_element_by_class_name('userContent')
+ post_content = post.find_element(By.CLASS_NAME,'userContent')
elif layout == "new":
- post_content = post.find_element_by_css_selector('[data-ad-preview="message"]')
+ post_content = post.find_element(By.CSS_SELECTOR,'[data-ad-preview="message"]')
#if 'See more' or 'Continue reading' is present in post
if Finder._Finder__element_exists(post_content,"span.text_exposed_link > a"):
- element = post_content.find_element_by_css_selector("span.text_exposed_link > a") #grab that element
+ element = post_content.find_element(By.CSS_SELECTOR,"span.text_exposed_link > a") #grab that element
#if element have already the onclick function, that means it is expandable paragraph
if element.get_attribute("onclick"):
Utilities._Utilities__click_see_more(driver,post_content) #click 'see more' button to get hidden text as well
@@ -209,7 +208,7 @@ def __find_posted_time(post,layout,link_element):
#extract element that looks like
#posted_time = post.find_element_by_css_selector("abbr._5ptz").get_attribute("data-utime")
if layout == "old":
- posted_time = post.find_element_by_tag_name("abbr").get_attribute('data-utime')
+ posted_time = post.find_element(By.TAG_NAME,"abbr").get_attribute('data-utime')
return datetime.datetime.fromtimestamp(float(posted_time)).isoformat()
elif layout == "new":
aria_label_value = link_element.get_attribute("aria-label")
@@ -233,7 +232,7 @@ def __find_video_url(post,page_name,status):
"""finds video of the facebook post using selenium's webdriver's method"""
try:
#if video is found in the post, then create a video URL by concatenating post's id with page_name
- video_element = post.find_element_by_tag_name("video")
+ video_element = post.find_element(By.TAG_NAME,"video")
video = "https://www.facebook.com/{}/videos/{}".format(page_name,status)
except NoSuchElementException:
@@ -250,7 +249,7 @@ def __find_image_url(post):
"""finds all image of the facebook post using selenium's webdriver's method"""
try:
#find all img tag that looks like
- images = post.find_elements_by_css_selector("img.scaledImageFitWidth.img")
+ images = post.find_elements(By.CSS_SELECTOR,"img.scaledImageFitWidth.img")
#extract src attribute from all the img tag,store it in list
sources = [image.get_attribute("src") for image in images] if len(images) > 0 else []
except NoSuchElementException:
@@ -268,10 +267,9 @@ def __find_all_posts(driver,layout):
try:
#find all posts that looks like
if layout == "old":
- all_posts = driver.find_elements_by_css_selector("div.userContentWrapper")
+ all_posts = driver.find_elements(By.CSS_SELECTOR,"div.userContentWrapper")
elif layout == "new":
- all_posts = driver.find_elements_by_css_selector(
- '[aria-posinset]')
+ all_posts = driver.find_elements(By.CSS_SELECTOR,'[aria-posinset]')
return all_posts
except NoSuchElementException:
print("Cannot find any posts! Exiting!")
@@ -288,9 +286,9 @@ def __find_name(driver,layout):
"""finds name of the facebook page using selenium's webdriver's method"""
try:
if layout == "old":
- name = driver.find_element_by_css_selector('a._64-f').get_attribute('textContent')
+ name = driver.find_element(By.CSS_SELECTOR,'a._64-f').get_attribute('textContent')
elif layout == "new":
- name = driver.find_element_by_tag_name("strong").get_attribute("textContent")
+ name = driver.find_element(By.TAG_NAME,"strong").get_attribute("textContent")
return name
except Exception as ex:
print("error at __find_name method : {}".format(ex))
@@ -298,7 +296,7 @@ def __find_name(driver,layout):
@staticmethod
def __detect_ui(driver):
try:
- driver.find_element_by_id("pagelet_bluebar")
+ driver.find_element(By.ID,"pagelet_bluebar")
return "old"
except NoSuchElementException:
return "new"
@@ -311,10 +309,10 @@ def __detect_ui(driver):
def __find_reaction(layout, reactions_all):
try:
if layout == "old":
- return reactions_all.find_elements_by_tag_name(
+ return reactions_all.find_elements(By.TAG_NAME,
"a")
elif layout == "new":
- return reactions_all.find_elements_by_tag_name(
+ return reactions_all.find_elements(By.TAG_NAME,
"div")
except Exception as ex:
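
A side note on the `Scraping_utilities._Scraping_utilities__extract_numbers`-style calls in these files: that is Python's name mangling for double-underscore methods, not a library API. A self-contained illustration (the method body here is a simplified stand-in, not the package's actual implementation):

```python
class Scraping_utilities:
    @staticmethod
    def __extract_numbers(text):
        # the double leading underscore mangles the name to
        # _Scraping_utilities__extract_numbers at class creation time
        return "".join(ch for ch in text if ch.isdigit())

# outside the class, only the mangled name works:
print(Scraping_utilities._Scraping_utilities__extract_numbers("12 comments"))  # -> 12
```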
diff --git a/facebook_page_scraper/scraper.py b/facebook_page_scraper/scraper.py
index 4ec2e1a..a06bc18 100644
--- a/facebook_page_scraper/scraper.py
+++ b/facebook_page_scraper/scraper.py
@@ -8,6 +8,7 @@
import json
import csv
import os
+ import time
except Exception as ex:
print(ex)
@@ -38,9 +39,8 @@ class Facebook_scraper:
-#on each iteration __close_after_retry is called to check if retry have turned to 0
-# if it returns true,it will break the loop. After coming out of loop,driver will be closed and it will return post whatever was found
+#on each iteration the elapsed time is checked against the timeout
+# if it is exceeded, the loop breaks; the driver is then closed and whatever posts were found so far are returned
- retry = 10
- def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None):
+ def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None,timeout=600):
self.page_name = page_name
self.posts_count = int(posts_count)
#self.URL = "https://en-gb.facebook.com/pg/{}/posts".format(self.page_name)
@@ -49,21 +49,30 @@ def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None):
self.__driver = ''
self.proxy = proxy
self.__layout = ''
+ self.timeout = timeout
def __start_driver(self):
"""changes the class member __driver value to driver on call"""
self.__driver = Initializer(self.browser,self.proxy).init()
- def __handle_popup_old_layout(self,layout):
+ def __handle_popup(self,layout):
#while scrolling, wait for login popup to show, it can be skipped by clicking "Not Now" button
try:
- Utilities._Utilities__close_popup(self.__driver)
- except:
- pass
+ if layout == "old":
+ #close any error or signup popup that shows during scrolling
+ Utilities._Utilities__close_error_popup(self.__driver)
+ Utilities._Utilities__close_popup(self.__driver)
+ elif layout == "new":
+ Utilities._Utilities__close_modern_layout_signup_modal(self.__driver)
+ except Exception as ex:
+ print(ex)
+
+ def __check_timeout(self,start_time,current_time):
+ return (current_time-start_time) > self.timeout
def scrap_to_json(self):
#call the __start_driver and override class member __driver to webdriver's instance
self.__start_driver()
-
+ starting_time = time.time()
#navigate to URL
self.__driver.get(self.URL)
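
The timeout mechanism that replaces the old retry counter reduces to a wall-clock comparison around the scraping loop. A runnable sketch of the pattern (`scrape_visible_posts` is a hypothetical placeholder, and the timeout is shortened from the 600-second default for demonstration):

```python
import time

def scrape_visible_posts():
    """hypothetical placeholder for one pass of scraping work"""
    time.sleep(1)

timeout = 5                       # seconds; the scraper's default is 600
start_time = time.time()
while True:
    scrape_visible_posts()
    if (time.time() - start_time) > timeout:
        print("Timeout...")
        break
```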
@@ -75,21 +84,18 @@ def scrap_to_json(self):
Utilities._Utilities__wait_for_element_to_appear(self.__driver,self.__layout)
#scroll down to bottom most
Utilities._Utilities__scroll_down(self.__driver,self.__layout)
- self.__handle_popup_old_layout(self.__layout)
+ self.__handle_popup(self.__layout)
name = Finder._Finder__find_name(self.__driver,self.__layout) #find name element
while len(self.__data_dict) <= self.posts_count:
-
- #if during scrolling any of error or signup popup shows
- Utilities._Utilities__close_error_popup(self.__driver)
- self.__handle_popup_old_layout(self.__layout)
+ self.__handle_popup(self.__layout)
self.__find_elements(name)
-
- if self.__close_after_retry() is True:
- #keep a check if posts are available, if retry is 0, than it breaks loop
- break
+ current_time = time.time()
+ if self.__check_timeout(starting_time,current_time) is True:
+ print("Timeout...")
+ break
Utilities._Utilities__scroll_down(self.__driver, self.__layout) #scroll down
#print(len(self.__data_dict))
#close the browser window after job is done.
@@ -163,7 +169,6 @@ def __find_elements(self,name):
all_posts = Finder._Finder__find_all_posts(self.__driver,self.__layout) #find all posts
all_posts = self.__remove_duplicates(all_posts) #remove duplicates from the list
- self.__no_post_found(all_posts) #after removing duplicates if length is 0, retry will decrease by 1
#iterate over all the posts and find details from the same
for post in all_posts:
try:
diff --git a/requirements.txt b/requirements.txt
index 937252e..2c84015 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-selenium==3.141.0
+selenium==4.1.0
webdriver-manager==3.2.2
selenium-wire==4.3.1
python-dateutil==2.8.2
\ No newline at end of file
diff --git a/setup.py b/setup.py
index c077f04..f3481c8 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
setuptools.setup(
name = "facebook_page_scraper",
- version = "0.1.10",
+ version = "2.0.0",
author = "Sajid Shaikh",
author_email = "shaikhsajid3732@gmail.com",
description = "Python package to scrap facebook's pages front end with no limitations",