# config.template.ini
[general]
# Dumps stats on the papers downloaded for the report year range below
# (report_start_year to report_end_year). It can be time consuming (~5 minutes)
# and is purely informative.
report_on_start = False
logging_level = INFO
report_start_year = 2020
report_end_year = 2020
exit_after_report = False
# Single-DOI download mode; generally used for testing.
#------------------------------------------------------
download_single_doi_mode = False
download_single_doi = 10.3390/plants10102053
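# Example - test the pipeline against a single paper (the DOI shown is just the
# template default, not a required value):
#   download_single_doi_mode = True
#   download_single_doi = 10.3390/plants10102053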
#------------------------------------------------------
# Iterates only over items that failed to download but have populated
# unpaywall_downloader url fields. Recommended with firefox_downloader enabled.
# When enabled, it skips all other steps, ONLY does the retry, and exits
# once all failed downloads have been retried.
retry_failed_unpaywall_links = False
#------------------------------------------------------
# Used to generate a new output file - if there are DOIs for the
# journal, then the journal makes it into this file.
# The workflow idea is that we add all possible journals (which picks up web
# ISSNs, paper, electronic, etc.). Most of these have no actual paper DOIs associated
# with them, so they clutter up the journals.tsv file. Run a full DOI query against
# crossref for a single year, then enable this function to limit journals.tsv to only
# interesting ISSNs, and copy the generated .tsv over journals.tsv.
# That will speed up subsequent year queries against crossref.
write_used_journals = True
used_journals_only_file = used_journals_only_file.tsv
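# Example workflow (a sketch; assumes the default file names above):
#   1. run a full-year crossref DOI query with write_used_journals = True
#   2. copy used_journals_only_file.tsv over journals.tsv
#   3. run later year queries against the trimmed journals.tsv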
[journal population]
# Scans GBIF for journal ISSNs to populate journals.tsv.
# Uses GBIF's pattern matching for institutions (CAS), which tends to cast too wide a
# net. If using this method, manually editing journals.tsv to remove non-germane
# publications is advised.
populate_journals = False
gbif_api_collection_links = ["https://api.gbif.org/v1/literature/search?contentType=literature&year=2021,2022&literatureType=journal&gbifDatasetKey=f934f8e2-32ca-46a7-b2f8-b032a4740454&limit=1000",
"https://api.gbif.org/v1/literature/search?contentType=literature&year=2021,2022&literatureType=journal&gbifDatasetKey=5d6c10bd-ea31-4363-8b79-58c96d859f5b&limit=1000",
"https://api.gbif.org/v1/literature/search?contentType=literature&year=2021,2022&literatureType=journal&gbifDatasetKey=5e848873-567f-42dd-8a29-88056b9b33a4&limit=1000"
]
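# Each entry above is a GBIF literature-search API query: year filters publication
# years, gbifDatasetKey ties results to a collection's dataset, and limit caps the
# result count. To track another collection, append a URL with its own dataset key
# (placeholder shown, not a real key):
#   "https://api.gbif.org/v1/literature/search?contentType=literature&year=2021,2022&literatureType=journal&gbifDatasetKey=<dataset-key>&limit=1000"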
# Source for the DOIs
[crossref]
# Skip this step entirely - do not update dois from crossref.
skip_crossref = True
# The precheck is a reporting step that is for information only. It can take a while
# to run, but it will let you know what will be downloaded from crossref.
# Set skip_crossref_precheck = False to get a full report on what needs to be
# downloaded before the download starts.
skip_crossref_precheck = True
# Force update is useful if re-scanning an incomplete year - e.g.:
# the last run was in March 2022 and it is now 2023. The standard
# "scan_for_dois_after_year" and "scan_for_dois_before_year" settings would skip
# 2022 in this case, because entries for that year already exist.
# DOI entries are self-checking and will never be duplicated, so this is safe.
# Note: force update will override "skip_crossref"!
force_update = False
force_update_year = 2020
scan_for_dois_after_year = 2020
scan_for_dois_before_year = 2021
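# Example - re-scan an incomplete 2022 after an earlier partial run (values illustrative):
#   force_update = True
#   force_update_year = 2022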
# The 'download' step comes after the DOIs have been downloaded.
# For each DOI, the system checks whether it is marked in the database as downloaded.
# If not, it checks whether the PDF exists on disk. If not, it
# attempts to download it using the enabled downloader(s).
# If both download_single_journal and download_all_journals are False, this step is skipped.
[download]
# If disabled, skips all paper downloading (update_pdf_file_link will still run if enabled)
enable_paper_download = False
download_start_year = 2021
download_end_year = 2021
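# Example - to actually download papers for the year range above (illustrative):
#   enable_paper_download = True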
#------------------------------------------------------
# This is used to rebuild database links, if need be.
# Slow! Iterates over every DOI and checks the file database in
# ./{pdf_directory}. If a PDF is available, link it to the DOI record. If not,
# purge what's there and mark it as missing.
# Note: if enabled, this runs regardless of whether enable_paper_download is set.
# Runs before paper download and only covers the years from download_start_year
# to download_end_year.
update_pdf_file_link = False
#------------------------------------------------------
# it is possible to enable single journal AND all journals mode, in which case
# single journal will run first.
download_single_journal = False
download_single_journal_issn = 0373-6687
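# Example - fetch a single journal only (the ISSN shown is the template default):
#   download_single_journal = True
#   download_single_journal_issn = 0373-6687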
# note, many options under "download" apply here.
[downloaders]
# Use full path
pdf_directory = /home/jrussack/citations_finder/pdf
header_email = curator@museum.org
modules = ["unpaywall_downloader"]
# Requires an absolute path. Will get a .{pid} suffix. The directory and its contents will be deleted.
tmp_firefox_pdf_directory = /Users/joe/citations_finder/temp_firefox_pdf
# 0 is an acceptable value
firefox_page_load_timeout = 10
# TODO: Archive.org downloader
# Desirable if there are multiple downloaders running concurrently;
# minimizes the possibility of conflict.
randomize_download_order = False
# Suppress journal report header (nice to see, but it's slow)
suppress_journal_report_header = True
[unpaywall_downloader]
# Do not retry:
# if there's any entry at all in unpaywall_downloader for this DOI (i.e., we attempted it in the
# past at ANY time), don't try again.
do_not_retry = True
# If true, checks the database field "most_recent_attempt" in the
# relevant download table (usually "unpaywall_downloader"). If
# a "recent" (i.e.: after "retry after datetime" field below) attempt
# was made, don't retry. The intent is that nothing is likely to have
# changed if we tried recently, so don't keep hammering on closed doors.
use_datetime_restriction = False
# %m/%d/%Y %H:%M:%S
retry_after_datetime = 8/1/2023 12:00:00
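# Example (an illustrative reading of the two settings above): with
#   use_datetime_restriction = True
#   retry_after_datetime = 8/1/2023 12:00:00
# a DOI whose most_recent_attempt falls after 8/1/2023 12:00:00 is skipped, while
# older attempts are retried.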
#------------------------------------------------------
# When we scrape unpaywall, we sometimes don't get a hit on a DOI.
# We still create a database entry, but leave the "open_url" column blank.
# If we retry an unpaywall download, we can re-scrape unpaywall for this
# DOI in case there's now a link when previously there was not.
# Most of the time, it doesn't make sense to requery unpaywall
# - no reason to think we have a link now when we didn't previously.
# However, if it's been a while, it might be worth requerying unpaywall
# in case a link to this paper has been added.
# Note: ignores retry_after_datetime restrictions
retry_only_failures_with_link = False
# Normally we only attempt to fetch the download link from
# unpaywall when the link isn't already in the database.
force_open_url_update = False
# If an entry is marked as having been attempted and there's no unpaywall link,
# do not attempt to re-fetch it under any circumstances.
do_not_refetch_links = False
#------------------------------------------------------
# Usually used just to get a sense of what percentage of papers have unpaywall links.
# Does not attempt a paper download. This is an override - rarely do
# we want this to be true.
force_update_link_only = False
# populate_not_available_only - no effect if force_update_link_only is False
# Skips all steps if the unpaywall_downloader -> 'not_available' column is not null
# note, this also has the side effect of scanning files for the "already downloaded" cases
# that haven't been marked.
populate_not_available_only = False
#------------------------------------------------------
# Often, open source journals will provide a direct link, but this usually goes
# to an HTML version. Picking the actual URL out of that isn't implemented because it's
# easier to just go straight to unpaywall.
attempt_direct_link = False
# Use selenium to download via firefox - attempted if we get an html response, which
# likely indicates an interception by cloudflare. Try it with a full user browser
# controlled through selenium.
# Requires geckodriver. Install on mac with "brew install geckodriver".
# Enable terminal and/or pycharm to control the keyboard via the system security settings.
#------------------------------------------------------
firefox_downloader = True
retry_firefox_failure = True
re_used_direct_url_sleep_time = 30
# (re)Scans the existing PDFs for regex matches.
[scan]
enabled = False
scan_start_year = 2016
scan_end_year = 2016
# "reset" causes a the whole scan database to be rebuilt.
# required most of the time- it won't pick up new PDFs without it
# (this comment taken from the code- needs verification)
reset_scan_database = False
# re-run scoring algorithm (typically after code changes)
rescore = False
# (re)scans papers for specimen IDs to map specimens back to papers published.
disable_txt_generation = False
# location for the pdf->txt file conversion
# Rescans do not regenerate the conversion.
# Use full path
scan_text_directory = /home/jrussack/citations_finder/pdf/txt
#
# Use numbers > 1 to parallelize conversion
#
max_pdf_conversion_threads = 20
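# Example (illustrative): max_pdf_conversion_threads = 1 converts PDFs to text
# serially; the value of 20 above converts up to 20 PDFs at a time.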
[scan_for_specimen_ids]
enabled = True
reset_scan_database = False
# interactive validate step
[validate]
enabled = False
regular_prompts = False
# doesn't work until the regular validation sequence is completed.
digital_prompts = False
validate_start_year = 2015
validate_end_year = 2021
# copy identified PDFs to unique directory
# export TSV summary of scan results
[copyout]
enabled = False
target_dir = ./publish
copyout_start_year = 2016
copyout_end_year = 2016
copyout_pdfs = True
copyout_txt = True
export_tsv = True
[scan_search_keys]
collections_regex_match = (?i)(([ \(\[])+|^)cas(ent)*(c)*(iz)*[: ]+\d+(?![\d\-])
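# Illustrative matches for the regex above (not exhaustive): "CASENT 123456" and
# "(CASIZ: 98765)" match; "CAS 123-45" does not, because the trailing lookahead
# rejects a number followed by another digit or a hyphen.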
collection_manager_names = [
("jens vindum", 1000),
("christopher grinter", 1000),
("christina piotrowski", 1000),
("johanna loacker", 1000),
("maricela abarca", 200),
("Christine Garcia",200),
("David Catania", 200),
("lauren scheinberg", 1000),
("shevock", 200),
("james shevock", 300),
("jim shevock", 300),
("seth cotterell", 300),
("jon fong", 200),
("wojciech pulawski", 100),
("rebekah kim",200),
("michele esposito",200)
]
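# Each entry above is a (name, score) pair; to track another collection manager,
# append a tuple in the same format (hypothetical example):
#   ("jane doe", 200)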
scored_strings = [
('california academy of science[s]?', 200),
('southern california academy of science[s]?', -200),
("CASC", 60),
("CASIZ", 200),
("izcas", -200),
("CAS-SUA", 200),
("CAS-SUR", 200),
("CAS-SU", 200),
("CAS:SU", 200),
("CASG", 200),
("CASG-FOSSIL", 1000),
("CAS:ICH", 200),
("CAS-ICH", 200),
("CASTYPE", 200),
("CASENT", 200),
("antweb", 400),
("antcat", 400),
("catalog of fishes", 400),
("CAS", 20),
("center for comparative genomics", 100),
("CAS HERP", 100),
("CAS MAM", 100),
("CAS ORN", 100),
("chinese", -20),
("Chinese Academy of Sciences", -100),
("institute of botany cas", -200),
("biology centre cas", -200),
("crispr", -100),
("inaturalist", -100),
("università di catania",-200)
]
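# The pairs above appear to be (pattern, weight) scores: positive weights favor
# California Academy of Sciences matches, while negative weights down-rank
# look-alikes such as the Chinese Academy of Sciences or other names that merely
# contain "cas". This reading is inferred from the values, not verified against
# the scoring code.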