import_arxiv_2021.py
# Download the arXiv author search pages for a list of names and collect the
# BibTeX entries of the papers found into a single file.
import codecs
import requests


def download_list(page, filename):
    search = ['guido de croon', 'christophe de wagter', 'ewoud smeur',
              'sjoerd tijmons', 'julien dupeyroux', 'salua hamaza',
              'marija popovic']
    # On the first call, (re)create the output file with a BOM and a header.
    if page == 0:
        bibf = codecs.open(filename, 'w', 'utf-8')
        bibf.write(u'\ufeff')
        bibf.write('# AUTOGENERATED\n')
        bibf.close()
    listofcodes = []
    # Collect the arXiv identifiers of all papers listed for each author.
    for person in search:
        u = ('https://arxiv.org/search/?searchtype=author&size=200&query='
             + person.replace(' ', '+'))
        bibf = codecs.open(filename, 'a', 'utf-8')
        bibf.write('# Import from: ' + u + '\n')
        print('- Page', u)
        p = requests.get(u)
        # Pick the lines of the result page that contain an "arXiv:<id>" link
        # and cut the identifier out of them (str.strip() removes a set of
        # characters, not a suffix, so this relies on the identifiers not
        # ending in any of the stripped characters).
        interest = [s.strip("\r\n").strip().split('arXiv:')[1].strip('</a>').strip('</span')
                    for s in p.text.splitlines(True) if 'arXiv:' in s]
        for i in interest:
            bibf.write('# - ' + i + '\n')
            if i not in listofcodes:
                listofcodes.append(i)
                print(i)
        bibf.close()
    # Fetch a BibTeX entry for every identifier via arxiv2bibtex.org and
    # append it to the output file.
    papernr = 0
    for code in listofcodes:
        print(code)
        uu = 'https://arxiv2bibtex.org/?format=bibtex&q=' + code
        pa = requests.get(uu)
        # The BibTeX entry is embedded in a <textarea> element; copy the
        # lines between the opening and the (first) closing tag.
        start = False
        bib = []
        for s in pa.text.splitlines(True):
            if '</textarea>' in s:
                break  # only the first textarea is of interest
            if start:
                bib.append(s.strip('\r\n'))
            if '<textarea' in s:
                start = True
        # Add url and pdf fields and close the entry.
        bib.append('\turl = {https://arxiv.org/abs/' + code + '},')
        bib.append('\tpdf = {https://arxiv.org/pdf/' + code + '.pdf},')
        bib.append('}')
        bib = '\n'.join(bib).replace('</textarea>', '')
        # Normalize the field names produced by arxiv2bibtex.
        bib = bib.replace('Title = {', '\ttitle = {')
        bib = bib.replace('Year = {', '\tyear = {')
        bib = bib.replace('Author = {', '\tauthor = {')
        bib = bib.replace('Eprint = {', '\teprint = {')
        bib = bib.replace('Howpublished = {', '\thowpublished = {')
        bib = bib.replace('Doi = {', '\tdoi = {')
        # bib = bib.replace('@article', '@misc')
        # Open and append per paper, so that the run can be continued after
        # an error without losing what was already written.
        bibf = codecs.open(filename, 'a', 'utf-8')
        # bibf.write('# ' + str(papernr) + '\n')
        bibf.write('# ' + code + ': ' + uu + '\n\n')
        bibf.write(bib)
        bibf.write('\n\n\n')
        bibf.close()
        papernr += 1
    # continue if at least 1 paper was found
    # done = False
    # debug: stop after 1 page
    # if pageno >= 1:
    #     done = True
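

# A more robust way of extracting the identifiers than the strip() chain in
# download_list() would be a regular expression. The sketch below is only an
# illustration and is not used by the script; it assumes the search pages list
# papers as "arXiv:NNNN.NNNNN" (the post-2007 identifier scheme, optionally
# with a version suffix).
import re


def extract_arxiv_ids(html_text):
    # Return the unique new-style arXiv identifiers found in html_text,
    # in order of first appearance.
    ids = re.findall(r'arXiv:(\d{4}\.\d{4,5}(?:v\d+)?)', html_text)
    return list(dict.fromkeys(ids))

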
download_list(0, 'arxiv.bib')
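
# A minimal sanity check of the generated file, assuming the call above wrote
# 'arxiv.bib' next to this script. It only counts entry headers and does not
# validate the BibTeX itself.
with open('arxiv.bib', encoding='utf-8') as f:
    n_entries = sum(1 for line in f if line.lstrip().startswith('@'))
print('arxiv.bib contains', n_entries, 'BibTeX entries')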