Skip to content

Commit

Permalink
Merge pull request #16 from shaikhsajid1111/fixes
Browse files Browse the repository at this point in the history
Fixes for scrolling issue and Timeout
  • Loading branch information
shaikhsajid1111 authored Feb 5, 2022
2 parents fa23d56 + 66694bd commit 6f836f6
Show file tree
Hide file tree
Showing 7 changed files with 98 additions and 53 deletions.
17 changes: 15 additions & 2 deletions README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,12 @@ page_name = "facebookai"
posts_count = 10
browser = "firefox"
proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy)
timeout = 600 #600 seconds
meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy,timeout=timeout)

```

<h3> Parameters for <code>Facebook_scraper(page_name,posts_count,browser,proxy) </code> class </h3>
<h3> Parameters for <code>Facebook_scraper(page_name,posts_count,browser,proxy,timeout) </code> class </h3>
<table>
<th>
<tr>
Expand Down Expand Up @@ -109,6 +110,18 @@ string
optional argument, if user wants to set proxy, if proxy requires authentication then the format will be <code> user:password@IP:PORT </code>
</td>
</tr>
<tr>
<td>
timeout
</td>
<td>
integer
</td>
<td>
The maximum amount of time, in seconds, that the bot should run for. If not passed, the default timeout is set to 600 seconds (10 minutes)
</td>
</tr>

</table>
<br>
Expand Down
9 changes: 9 additions & 0 deletions changelog.MD
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
<h1> Changelog </h1>
<section>
<h2> 2.0.0 </h2>
<h3>Added</h3>
<li>Timeout argument to set the maximum amount of time the bot should run in case no posts are found.</li>
<h3>Changes</h3>
<li>Updated selenium from version <code>3.141.0</code> to <code>4.1.0</code> </li>
<h3>Fixed</h3>
<li>Fixed issue where the browser kept scrolling back up despite the scroll-down method being called, which was caused by multiple different function calls.</li>
<br>
<section>
<h2> 0.1.10 </h2>
<h3>Added</h3>
<li>Support for new Facebook Layout</li>
Expand Down
34 changes: 27 additions & 7 deletions facebook_page_scraper/driver_utilities.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from fileinput import close


try:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Expand Down Expand Up @@ -28,7 +31,7 @@ def __close_error_popup(driver):
than click on close button to skip that popup.'''
try:
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,'a.layerCancel'))) #wait for popup to show
button = driver.find_element_by_css_selector("a.layerCancel") #grab that popup's close button
button = driver.find_element(By.CSS_SELECTOR,"a.layerCancel") #grab that popup's close button
button.click() #click "close" button
except WebDriverException:
#it is possible that even after waiting for given amount of time,modal may not appear
Expand All @@ -49,6 +52,19 @@ def __scroll_down_half(driver):
Utilities.__close_driver(driver)
print("error at scroll_down_half method : {}".format(ex))

@staticmethod
def __close_modern_layout_signup_modal(driver):
    """Dismiss the sign-up modal that Facebook's modern layout overlays on the page.

    Scrolls to the bottom of the document (which is what triggers the modal to
    appear), then clicks its close ("X") button. If the modal never appeared,
    the missing-element case is silently ignored; any other failure is logged.
    """
    try:
        # Scrolling to the bottom provokes the sign-up modal into showing.
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        dismiss_button = driver.find_element(By.CSS_SELECTOR, '[aria-label="Close"]')
        dismiss_button.click()
    except NoSuchElementException:
        # Modal was not present — nothing to dismiss.
        pass
    except Exception as ex:
        print("error at close_modern_layout_signup_modal: {}".format(ex))


@staticmethod
def __scroll_down(driver,layout):
"""expects driver's instance as a argument, and it scrolls down page to the most bottom till the height"""
Expand All @@ -57,9 +73,13 @@ def __scroll_down(driver,layout):
driver.execute_script(
"window.scrollTo(0, document.body.scrollHeight);")
elif layout == "new":
body = driver.find_element_by_css_selector("body")
for _ in range(randint(2, 3)):
body = driver.find_element(By.CSS_SELECTOR,"body")
for _ in range(randint(5,6)):
body.send_keys(Keys.PAGE_UP)
for _ in range(randint(5, 8)):
body.send_keys(Keys.PAGE_DOWN)
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#Utilities.__close_modern_layout_signup_modal(driver)
except Exception as ex:
#if any error occured than close the driver and exit
Utilities.__close_driver(driver)
Expand All @@ -69,11 +89,11 @@ def __scroll_down(driver,layout):
def __close_popup(driver):
"""expects driver's instance and closes modal that ask for login, by clicking "Not Now" button """
try:
Utilities.__scroll_down_half(driver) #try to scroll
#Utilities.__scroll_down_half(driver) #try to scroll
#wait for popup to show
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,'expanding_cta_close_button')))
#grab "Not Now" button
popup_close_button = driver.find_element_by_id('expanding_cta_close_button')
popup_close_button = driver.find_element(By.ID,'expanding_cta_close_button')
popup_close_button.click() #click the button
except WebDriverException:
#modal may not popup, so no need to raise exception in case it is not found
Expand All @@ -91,7 +111,7 @@ def __wait_for_element_to_appear(driver,layout):
try:
if layout == "old":
#wait for page to load so posts are visible
body = driver.find_element_by_css_selector("body")
body = driver.find_element(By.CSS_SELECTOR,"body")
for _ in range(randint(3, 5)):
body.send_keys(Keys.PAGE_DOWN)
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR,'.userContentWrapper')))
Expand All @@ -115,7 +135,7 @@ def __click_see_more(driver,content):
"""expects driver's instance and selenium element, click on "see more" link to open hidden content"""
try:
#find element and click 'see more' button
element = content.find_element_by_css_selector('span.see_more_link_inner')
element = content.find_element(By.CSS_SELECTOR,'span.see_more_link_inner')
driver.execute_script("arguments[0].click();", element) #click button using js

except NoSuchElementException:
Expand Down
48 changes: 23 additions & 25 deletions facebook_page_scraper/element_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from dateutil.parser import parse
import dateutil
import datetime
from selenium.webdriver.common.by import By
except Exception as ex:
print(ex)

Expand Down Expand Up @@ -59,14 +60,13 @@ def __find_status(post,layout):
if layout == "old":
#aim is to find element that looks like <a href="URL" class="_5pcq"></a>
#after finding that element, get it's href value and pass it to different method that extracts post_id from that href
status_link = post.find_element_by_class_name("_5pcq").get_attribute("href")
status_link = post.find_element(By.CLASS_NAME,"_5pcq").get_attribute("href")
#extract out post id from post's url
status = Scraping_utilities._Scraping_utilities__extract_id_from_link(status_link)
elif layout == "new":
links = post.find_elements_by_css_selector("a[role='link']")
link = Finder.__get_status_link(links)
#links = post.find_elements(By.CSS_SELECTOR,"a[role='link']")
link = post.find_element(By.CSS_SELECTOR,'.gpro0wi8.b1v8xokw')
status_link = link.get_attribute('href')
print("Status Link: ",status_link)
status = Scraping_utilities._Scraping_utilities__extract_id_from_link(
status_link)
except NoSuchElementException:
Expand All @@ -85,10 +85,10 @@ def __find_share(post,layout):
try:
if layout == "old":
#aim is to find element that have datatest-id attribute as UFI2SharesCount/root
shares = post.find_element_by_css_selector("[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
shares = post.find_element(By.CSS_SELECTOR,"[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
shares = Scraping_utilities._Scraping_utilities__extract_numbers(shares)
elif layout == "new":
elements = post.find_elements_by_css_selector("div.gtad4xkn")
elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
shares = "0"
for element in elements:
text = element.text
Expand All @@ -112,8 +112,7 @@ def __find_reactions(post):
"""finds all reaction of the facebook post using selenium's webdriver's method"""
try:
#find element that have attribute aria-label as 'See who reacted to this
reactions_all = post.find_element_by_css_selector(
'[aria-label="See who reacted to this"]')
reactions_all = post.find_element(By.CSS_SELECTOR,'[aria-label="See who reacted to this"]')
except NoSuchElementException:
reactions_all = ""
except Exception as ex:
Expand All @@ -126,11 +125,11 @@ def __find_comments(post,layout):
try:
comments = ""
if layout == "old":
comments = post.find_element_by_css_selector("a._3hg-").get_attribute('textContent')
comments = post.find_element(By.CSS_SELECTOR,"a._3hg-").get_attribute('textContent')
#extract numbers from text
comments = Scraping_utilities._Scraping_utilities__extract_numbers(comments)
elif layout == "new":
elements = post.find_elements_by_css_selector("div.gtad4xkn")
elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
comments = "0"
for element in elements:
text = element.text
Expand Down Expand Up @@ -164,7 +163,7 @@ def __fetch_post_passage(href):
@staticmethod
def __element_exists(element,css_selector):
try:
found = element.find_element_by_css_selector(css_selector)
found = element.find_element(By.CSS_SELECTOR,css_selector)
return True
except NoSuchElementException:
return False
Expand All @@ -174,12 +173,12 @@ def __find_content(post,driver,layout):
"""finds content of the facebook post using selenium's webdriver's method and returns string containing text of the posts"""
try:
if layout == "old":
post_content = post.find_element_by_class_name('userContent')
post_content = post.find_element(By.CLASS_NAME,'userContent')
elif layout == "new":
post_content = post.find_element_by_css_selector('[data-ad-preview="message"]')
post_content = post.find_element(By.CSS_SELECTOR,'[data-ad-preview="message"]')
#if 'See more' or 'Continue reading' is present in post
if Finder._Finder__element_exists(post_content,"span.text_exposed_link > a"):
element = post_content.find_element_by_css_selector("span.text_exposed_link > a") #grab that element
element = post_content.find_element(By.CSS_SELECTOR,"span.text_exposed_link > a") #grab that element
#if element have already the onclick function, that means it is expandable paragraph
if element.get_attribute("onclick"):
Utilities._Utilities__click_see_more(driver,post_content) #click 'see more' button to get hidden text as well
Expand Down Expand Up @@ -209,7 +208,7 @@ def __find_posted_time(post,layout,link_element):
#extract element that looks like <abbr class='_5ptz' data-utime="some unix timestamp"> </abbr>
#posted_time = post.find_element_by_css_selector("abbr._5ptz").get_attribute("data-utime")
if layout == "old":
posted_time = post.find_element_by_tag_name("abbr").get_attribute('data-utime')
posted_time = post.find_element(By.TAG_NAME,"abbr").get_attribute('data-utime')
return datetime.datetime.fromtimestamp(float(posted_time)).isoformat()
elif layout == "new":
aria_label_value = link_element.get_attribute("aria-label")
Expand All @@ -233,7 +232,7 @@ def __find_video_url(post,page_name,status):
"""finds video of the facebook post using selenium's webdriver's method"""
try:
#if video is found in the post, than create a video URL by concatenating post's id with page_name
video_element = post.find_element_by_tag_name("video")
video_element = post.find_element(By.TAG_NAME,"video")
video = "https://www.facebook.com/{}/videos/{}".format(page_name,status)

except NoSuchElementException:
Expand All @@ -250,7 +249,7 @@ def __find_image_url(post):
"""finds all image of the facebook post using selenium's webdriver's method"""
try:
#find all img tag that looks like <img class="scaledImageFitWidth img" src="">
images = post.find_elements_by_css_selector("img.scaledImageFitWidth.img")
images = post.find_elements(By.CSS_SELECTOR,"img.scaledImageFitWidth.img")
#extract src attribute from all the img tag,store it in list
sources = [image.get_attribute("src") for image in images] if len(images) > 0 else []
except NoSuchElementException:
Expand All @@ -268,10 +267,9 @@ def __find_all_posts(driver,layout):
try:
#find all posts that looks like <div class="userContentWrapper"> </div>
if layout == "old":
all_posts = driver.find_elements_by_css_selector("div.userContentWrapper")
all_posts = driver.find_elements(By.CSS_SELECTOR,"div.userContentWrapper")
elif layout == "new":
all_posts = driver.find_elements_by_css_selector(
'[aria-posinset]')
all_posts = driver.find_elements(By.CSS_SELECTOR,'[aria-posinset]')
return all_posts
except NoSuchElementException:
print("Cannot find any posts! Exiting!")
Expand All @@ -288,17 +286,17 @@ def __find_name(driver,layout):
"""finds name of the facebook page using selenium's webdriver's method"""
try:
if layout == "old":
name = driver.find_element_by_css_selector('a._64-f').get_attribute('textContent')
name = driver.find_element(By.CSS_SELECTOR,'a._64-f').get_attribute('textContent')
elif layout == "new":
name = driver.find_element_by_tag_name("strong").get_attribute("textContent")
name = driver.find_element(By.TAG_NAME,"strong").get_attribute("textContent")
return name
except Exception as ex:
print("error at __find_name method : {}".format(ex))

@staticmethod
def __detect_ui(driver):
try:
driver.find_element_by_id("pagelet_bluebar")
driver.find_element(By.ID,"pagelet_bluebar")
return "old"
except NoSuchElementException:
return "new"
Expand All @@ -311,10 +309,10 @@ def __detect_ui(driver):
def __find_reaction(layout, reactions_all):
try:
if layout == "old":
return reactions_all.find_elements_by_tag_name(
return reactions_all.find_elements(By.TAG_NAME,
"a")
elif layout == "new":
return reactions_all.find_elements_by_tag_name(
return reactions_all.find_elements(By.TAG_NAME,
"div")

except Exception as ex:
Expand Down
39 changes: 22 additions & 17 deletions facebook_page_scraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import json
import csv
import os
import time

except Exception as ex:
print(ex)
Expand Down Expand Up @@ -38,9 +39,8 @@ class Facebook_scraper:
#on each iteration __close_after_retry is called to check if retry have turned to 0
# if it returns true,it will break the loop. After coming out of loop,driver will be closed and it will return post whatever was found

retry = 10

def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None):
def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None,timeout=600):
self.page_name = page_name
self.posts_count = int(posts_count)
#self.URL = "https://en-gb.facebook.com/pg/{}/posts".format(self.page_name)
Expand All @@ -49,21 +49,30 @@ def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None):
self.__driver = ''
self.proxy = proxy
self.__layout = ''
self.timeout = timeout

def __start_driver(self):
    """Create a fresh webdriver instance and bind it to self.__driver.

    Delegates browser/proxy configuration to the project's Initializer helper.
    """
    initializer = Initializer(self.browser, self.proxy)
    self.__driver = initializer.init()
def __handle_popup_old_layout(self,layout):
def __handle_popup(self,layout):
#while scrolling, wait for login popup to show, it can be skipped by clicking "Not Now" button
try:
Utilities._Utilities__close_popup(self.__driver)
except:
pass
if layout == "old":
#if during scrolling any of error or signup popup shows
Utilities._Utilities__close_error_popup(self.__driver)
Utilities._Utilities__close_popup(self.__driver)
elif layout == "new":
Utilities._Utilities__close_modern_layout_signup_modal(self.__driver)
except Exception as ex:
print(ex)

def __check_timeout(self,start_time,current_time):
return (current_time-start_time) > self.timeout

def scrap_to_json(self):
#call the __start_driver and override class member __driver to webdriver's instance
self.__start_driver()

starting_time = time.time()
#navigate to URL
self.__driver.get(self.URL)

Expand All @@ -75,21 +84,18 @@ def scrap_to_json(self):
Utilities._Utilities__wait_for_element_to_appear(self.__driver,self.__layout)
#scroll down to bottom most
Utilities._Utilities__scroll_down(self.__driver,self.__layout)
self.__handle_popup_old_layout(self.__layout)
self.__handle_popup(self.__layout)


name = Finder._Finder__find_name(self.__driver,self.__layout) #find name element

while len(self.__data_dict) <= self.posts_count:

#if during scrolling any of error or signup popup shows
Utilities._Utilities__close_error_popup(self.__driver)
self.__handle_popup_old_layout(self.__layout)
self.__handle_popup(self.__layout)
self.__find_elements(name)

if self.__close_after_retry() is True:
#keep a check if posts are available, if retry is 0, than it breaks loop
break
current_time = time.time()
if self.__check_timeout(starting_time,current_time) is True:
print("Timeout...")
break
Utilities._Utilities__scroll_down(self.__driver, self.__layout) #scroll down
#print(len(self.__data_dict))
#close the browser window after job is done.
Expand Down Expand Up @@ -163,7 +169,6 @@ def __find_elements(self,name):
all_posts = Finder._Finder__find_all_posts(self.__driver,self.__layout) #find all posts
all_posts = self.__remove_duplicates(all_posts) #remove duplicates from the list

self.__no_post_found(all_posts) #after removing duplicates if length is 0, retry will decrease by 1
#iterate over all the posts and find details from the same
for post in all_posts:
try:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
selenium==3.141.0
selenium==4.1.0
webdriver-manager==3.2.2
selenium-wire==4.3.1
python-dateutil==2.8.2
Loading

0 comments on commit 6f836f6

Please sign in to comment.