-
Notifications
You must be signed in to change notification settings - Fork 25
/
ARGUS_noGUI.py
42 lines (33 loc) · 903 Bytes
/
ARGUS_noGUI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# -*- coding: utf-8 -*-
"""
Code to steer ARGUS
Created on Mon Jun 8 16:41:04 2020
@author: JDO
"""
# Modules
import os
import sys
import time
# Absolute path of the directory that contains this script.
# abspath() is applied first because __file__ may be a bare relative file name
# (e.g. when the script is launched from its own directory on older Python
# versions); dirname() alone would then return "" and break os.chdir() and
# every path built from script_dir.
script_dir = os.path.dirname(os.path.abspath(__file__))
class argus_settings:
    """Settings for an ARGUS scraping run, exposed as class attributes.

    Evaluating this class body has two side effects: the working directory
    is changed to the project folder, and the path of the URL list is read
    from the first command-line argument.

    Raises:
        IndexError: if no command-line argument was supplied.
    """

    # Change working directory to the project folder. An empty script_dir
    # means the script lives in the current directory; os.chdir("") would
    # raise FileNotFoundError, so fall back to the current directory.
    os.chdir(script_dir or os.curdir)

    # File path for the list containing URLs (first command-line argument).
    # Fail with an explanatory message instead of a bare "list index out of
    # range"; the exception type is kept as IndexError.
    if len(sys.argv) < 2:
        raise IndexError(
            "missing argument: path to the file containing the URL list "
            "(usage: python ARGUS_noGUI.py <url_list_file>)"
        )
    filepath = sys.argv[1]

    # Settings for the ARGUS spider.
    # NOTE(review): delimiter and encoding presumably describe the URL list
    # file -- confirm against the code that reads it.
    delimiter = "\t"
    encoding = "utf-8"
    index_col = "id"  # column with IDs
    url_col = "url"  # column with URLs
    lang = "German"  # language
    n_cores = 1  # number of cores
    limit = 10  # scraping limit
    log_level = "INFO"
    # The two switches below are the strings "on"/"off", not booleans.
    prefer_short_urls = "on"
    pdfscrape = "off"
# Execute scraping
if __name__ == "__main__":
    # Start the scrapyd server through its batch file. os.startfile is only
    # available on Windows. os.path.join keeps the path correct even when
    # script_dir is empty, where plain string concatenation would produce the
    # root-relative path "\bin\start_server.bat".
    os.startfile(os.path.join(script_dir, "bin", "start_server.bat"))

    # Fixed pause before crawling.
    # NOTE(review): presumably gives the server time to come up -- confirm
    # that two seconds is sufficient on slow machines.
    time.sleep(2)

    # Start crawling. The import is deliberately done here, after the server
    # has been launched, rather than at the top of the file.
    from bin import start_crawl_steering

    start_crawl_steering.start_crawl()