-
Notifications
You must be signed in to change notification settings - Fork 0
/
scan_s2orc_pdf.py
33 lines (31 loc) · 1.26 KB
/
scan_s2orc_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from module.query_pdf import GettingPDFs
import argparse
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the S2ORC pdf_parse scan script.

    Returns:
        An ``argparse.ArgumentParser`` with three required options:
        ``-i/--metadata_input``, ``-a/--pdf_archives`` and
        ``-o/--output_file``.
    """
    parser = argparse.ArgumentParser(
        description="Get pdf entries of interest\n"
                    "from the S2ORC pdf_parse archives")
    parser.add_argument('-i', '--metadata_input',
                        metavar='',
                        required=True,
                        help='Provide metadata jsonl FILE')
    parser.add_argument('-a', '--pdf_archives',
                        metavar='',
                        required=True,
                        help='Provide PATH to S2ORC pdf_parse archives')
    parser.add_argument('-o', '--output_file',
                        required=True,
                        help="Provide PATH for jsonl file with\n"
                             "the extracted S2ORC pdf_parse entries")
    return parser


def main(argv=None) -> None:
    """Parse the command line and run the pdf_parse extraction pipeline.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets
            argparse read them from ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # Init GettingPDFs class
    record = GettingPDFs(args.metadata_input, args.pdf_archives,
                         args.output_file)

    # Get paper_ids from metadata jsonl file
    # NOTE(review): ``open_input`` is read as an attribute, not called. That
    # is only right if GettingPDFs exposes it as a property -- confirm
    # against module.query_pdf.
    ids = record.open_input

    # Extract pdf_parse entries
    articles_interest = record.get_articles(ids)
    record.parallel_process(articles_interest)


if __name__ == "__main__":
    main()