forked from eracle/linkedin
sequential_run.py
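"""
Run the LinkedIn CompaniesSpider sequentially over a list of company URLs.

This script builds a logged-in Selenium driver, reads one company URL per line
from data/companies.txt, crawls each URL with CompaniesSpider one after the
other, and exports the scraped items as CSV to data/companies/data.csv.
If the Selenium session dies between crawls, the driver is rebuilt and the user
is given time to complete any LinkedIn security check. It assumes it is run
from the Scrapy project root, so that get_project_settings() can locate the
project settings.
"""
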
import logging
import time
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import defer, reactor
from linkedin.integrations.selenium import build_driver, is_security_check
from linkedin.spiders.companies import CompaniesSpider

# Define the number of seconds for the security check
SECURITY_CHECK_DURATION = 30

file_name = "data/companies/data.csv"

logging.basicConfig(level=logging.DEBUG)

@defer.inlineCallbacks
def run_spiders_sequentially(runner, urls, driver):
    # Crawl one company URL at a time; yielding the Deferred returned by
    # runner.crawl() waits for the current spider to finish before starting the next
    for url in urls:
        yield runner.crawl(CompaniesSpider, start_url=url, driver=driver)
        try:
            # Quick health check: make sure the Selenium session is still usable
            driver.get("https://www.google.com")
            assert "Google" in driver.title
        except Exception:
            # The session died (or was logged out): rebuild the driver and let
            # the user resolve a possible security check before the next crawl
            driver = build_driver(login=True)
            perform_security_check(driver)
    driver.close()


def perform_security_check(driver):
    if is_security_check(driver):
        # Print instructions with fancy characters for user attention
        logging.info("***** SECURITY CHECK IN PROGRESS *****")
        logging.info(
            f"Please complete the security check in the Selenium browser window, you have {SECURITY_CHECK_DURATION} seconds..."
        )
        for _ in range(SECURITY_CHECK_DURATION):
            time.sleep(1)
        logging.info("***** SECURITY CHECK COMPLETED *****")
    else:
        logging.debug("Security check not requested, continuing")


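# Entry point: data/companies.txt is expected to contain one LinkedIn company
# URL per line (leading/trailing whitespace is stripped before crawling).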
if __name__ == "__main__":
    driver = build_driver(login=True)
    perform_security_check(driver)

    # Erase any previous content of the output file
    open(file_name, "w").close()

    settings = get_project_settings()
    settings.set("LOG_LEVEL", "DEBUG")
    settings.set("LOG_ENABLED", True)
    settings.set("LOG_STDOUT", True)
    settings.set("FEEDS", {file_name: {"format": "csv"}})

    runner = CrawlerRunner(settings)
    urls = [url.strip() for url in open("data/companies.txt", "r")]

    # Run the spiders sequentially
    sequential_spiders = run_spiders_sequentially(runner, urls, driver)
    sequential_spiders.addBoth(
        lambda _: reactor.stop()
    )  # Stop the reactor when all spiders are done
    reactor.run()  # Start the reactor