# import_pure_2021.py
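"""Import the publication list of the TU Delft Control & Simulation group.

Walks the Pure RSS feed page by page, opens each publication's page,
extracts the embedded BibTeX citation, and appends it to a .bib file.
"""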
# Download HTML
import sys
import requests
from lxml import html
# Parse RSS-XML
import xml.etree.ElementTree as ET
# Strip HTML
from bs4 import BeautifulSoup
# Store FILE
import codecs
sys.stdout.reconfigure(encoding='utf-8')
#root = ET.parse('https://research.tudelft.nl/en/organisations/control-operations/publications/?format=rss&page=5').getroot()
def download_list(page, filename):
    if page == 0:
        bibf = codecs.open(filename, 'w', 'utf-8')
        bibf.write(u'\ufeff')
        bibf.write('# AUTOGENERATED\n# Import from: https://research.tudelft.nl/en/organisations/control-simulation/publications/\n\n\n')
        bibf.close()
    papernr = 1
    pageno = page
    done = False
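    # Walk the RSS feed page by page; the loop stops at the first page
    # that yields no publications.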
    while not done:
        print('- Page', pageno)
        #p = requests.get('https://research.tudelft.nl/en/organisations/control-operations/publications/?format=rss&page=%d' % pageno)
        p = requests.get('https://research.tudelft.nl/en/organisations/control-simulation/publications/?format=rss&page=%d' % pageno)
        done = True
        # Parse the raw bytes: ET.fromstring rejects an already-decoded
        # string when the feed carries an XML encoding declaration.
        root = ET.fromstring(p.content)
        for pub in root.findall('channel/item'):
            title = pub.findall('title')[0].text
            link = pub.findall('link')[0].text
            #description = pub.findall('description')[0].text
            #dom = html.fromstring(description)
            #journal = dom.body.find_class('journal')
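            # Fetch the publication's own page; the placeholder HTML keeps
            # the parsing below from crashing if every attempt fails.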
            htmltxt = '<html><body>txt</body></html>'
            # Try 5 times
            for i in range(0, 5):
                try:
                    p = requests.get(link)
                    htmltxt = p.text
                    break
                except Exception:
                    print('Error attempt', i, link)
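            # The page embeds its citation in an element with id
            # 'cite-BIBTEX'; its first child holds the BibTeX text.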
            dom = html.fromstring(htmltxt)
            bib = dom.body.get_element_by_id('cite-BIBTEX').getchildren()[0]
            print(str(papernr) + ' ', title)
            # Open in append mode, so a run can be resumed after an error
            bibf = codecs.open(filename, 'a', 'utf-8')
            #bibf.write('# '+str(pageno)+', '+str(papernr)+'\n')
            bibf.write('# ' + title + '\n# ' + link + '\n\n')
            # Dump the BibTeX into the file, line by line
            for b in bib.getchildren():
                # Try 5 times to extract the text
                txt = '<html><body>txt</body></html>'
                for i in range(0, 5):
                    try:
                        soup = BeautifulSoup(html.tostring(b), features="lxml")
                        txt = soup.get_text()
                        break
                    except Exception:
                        print('Error attempt', i, link)
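                # A line containing '}' is taken as the closing line of the
                # entry: inject the Pure link as a url field just before it.
                # An existing url field is renamed to url2 to avoid a
                # duplicate key, and the abstract is dropped.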
                if '}' in txt:
                    bibf.write('\turl = "' + link + '",\n')
                if ' url ' in txt:
                    bibf.write(txt.replace(' url ', ' url2 ') + '\n')
                elif ' abstract ' not in txt:
                    #print(txt)
                    bibf.write(txt + '\n')
            #print('')
            bibf.write('\n')
            bibf.close()
            papernr += 1
            # continue if at least 1 paper was found
            done = False
        pageno += 1
        # debug: stop after 1 page
        #if pageno >= 1:
        #    done = True
# To continue a partial download, call download_list with a non-zero page.
# page=0 resets the output file.
download_list(0, './pure/cs.bib')
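# For example, to resume from page 6 without clearing the file:
#download_list(6, './pure/cs.bib')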