-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
101 lines (71 loc) · 3.31 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
from datetime import datetime
from src.scidirect import ScienceDirect, Paper as SDP
from src.acm import ACM, Paper as ACMP
from src.ieee import IEEE, Paper as IXP
from src.utils import *
# Supported scrapers, in the fixed order they are run. Iterating a tuple
# (instead of a set intersection) keeps the run order identical between runs.
SCRAPER_ORDER = ('IEEE', 'ACM', 'SCIDIR')

# ACM and ScienceDirect are searched from this many years ago up to the
# current year.
SEARCH_WINDOW_YEARS = 5


def configured_scrapers(config):
    """Return the supported scraper names that appear as keys in *config*.

    Args:
        config: Mapping loaded from the configuration file; only its
            top-level keys are inspected.

    Returns:
        A list of names taken from ``SCRAPER_ORDER``, in that order. Keys
        that are not supported scraper names are ignored.
    """
    return [name for name in SCRAPER_ORDER if name in config]


def build_link_scraper(name, search_term):
    """Create the object that collects links to individual search results.

    Args:
        name: One of the names in ``SCRAPER_ORDER``.
        search_term: Search term passed through to the scraper constructor.

    Raises:
        ConfigurationError: If *name* is not a supported scraper.
    """
    if name == 'IEEE':
        return IEEE(search_term)

    current_year = datetime.now().year
    first_year = current_year - SEARCH_WINDOW_YEARS
    if name == 'ACM':
        return ACM(first_year, current_year, search_term)
    if name == 'SCIDIR':
        return ScienceDirect(first_year, current_year, search_term)
    raise ConfigurationError(f"wrong scrapper {name}.")


def build_paper_scraper(name, link_file):
    """Create the object that fetches paper details for the links in *link_file*.

    Args:
        name: One of the names in ``SCRAPER_ORDER``.
        link_file: Path of the link file passed to the paper class.

    Raises:
        ConfigurationError: If *name* is not a supported scraper.
    """
    if name == 'IEEE':
        return IXP(link_file)
    if name == 'ACM':
        return ACMP(link_file)
    if name == 'SCIDIR':
        return SDP(link_file)
    raise ConfigurationError(f"wrong scrapper {name}.")


def run_scraper(name, settings):
    """Run the full link -> abstract pipeline for one scraper.

    Args:
        name: One of the names in ``SCRAPER_ORDER``.
        settings: That scraper's section of the configuration. The keys read
            are ``search_term``, ``link_file_save_to``, ``keep_link_file``,
            ``use_batches``, ``batch_size`` (only when ``use_batches`` is
            truthy) and ``abs_file_save_to``.

    Raises:
        ConfigurationError: If *name* is not a supported scraper.
    """
    link_file = settings['link_file_save_to']
    # Only the ScienceDirect objects had their browser cookies cleared in the
    # original script, so the other scrapers' ``driver`` is never touched.
    clears_cookies = name == 'SCIDIR'

    # Get links to the individual search results.
    link_scraper = build_link_scraper(name, settings['search_term'])
    if clears_cookies and link_scraper.driver is not None:
        link_scraper.driver.delete_all_cookies()
    link_scraper.get_links_to_papers()

    # Always dump the links: the paper scraper below is constructed from this
    # file, so it has to exist even when the user does not want to keep it.
    # (Previously it was written only when ``keep_link_file`` was set, while
    # the later steps read and deleted it regardless.)
    link_scraper.to_json(link_file)

    # Get the abstract of each search result.
    paper_scraper = build_paper_scraper(name, link_file)
    if clears_cookies and paper_scraper.driver is not None:
        paper_scraper.driver.delete_all_cookies()
    if settings['use_batches']:
        paper_scraper.batch_update_details(settings['batch_size'])
    else:
        paper_scraper.update_paper_details()
    paper_scraper.to_json(settings['abs_file_save_to'])

    # The link file was only an intermediate artifact; drop it unless the
    # configuration asks to keep it.
    if not settings['keep_link_file']:
        os.remove(link_file)


def main():
    """Load ``./config.json`` and run every scraper configured in it.

    Raises:
        ConfigurationError: If ``validate`` rejects the configuration.
    """
    config = read_json("./config.json")
    # An explicit raise instead of ``assert``: assertions are removed when
    # Python runs with -O, which would silently skip validation.
    if not validate(config):
        raise ConfigurationError("invalid configuration in ./config.json")

    # Working directories created up front, as in the original script.
    for directory in ('temp', 'abs'):
        os.makedirs(directory, exist_ok=True)

    for name in configured_scrapers(config):
        run_scraper(name, config[name])


if __name__ == "__main__":
    main()