-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake-anthology-json.py
47 lines (38 loc) · 1.37 KB
/
make-anthology-json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import json
import tqdm
from pybtex.database.input import bibtex
import re
import gzip
# Matches maximal runs of ASCII letters and digits; everything else acts as a
# separator when a title is tokenized.
simplify_re = re.compile(r'[A-Za-z0-9]+')

# Characters deleted from a title before tokenizing (presumably BibTeX/LaTeX
# markup; they are removed outright rather than treated as separators).
remove_chars = ['\\', '{', '}', '$']

# Name parts requested from each person via ``get_part`` in ``main``.
name_types = ['first', 'middle', 'last', 'prelast', 'lineage']


def simplify_title(title):
    """Return a normalized, lowercase lookup key for *title*.

    Every string in ``remove_chars`` is deleted from the title, the result
    is lowercased, and the remaining runs of ASCII letters/digits (as matched
    by ``simplify_re``) are joined with single spaces. Punctuation,
    whitespace and non-ASCII characters therefore only separate tokens and
    never appear in the key.

    Args:
        title: The raw title string.

    Returns:
        The space-separated alphanumeric tokens of the cleaned title; an
        empty string when the title contains no ASCII letters or digits.
    """
    cleaned = title
    for unwanted in remove_chars:
        cleaned = cleaned.replace(unwanted, '')
    tokens = simplify_re.findall(cleaned.lower())
    return ' '.join(tokens)
def main():
    """Convert ``anthology.bib`` into a gzipped JSON lookup table.

    Parses ``anthology.bib`` from the current working directory with pybtex
    and, for every entry, stores a plain-dict copy of its fields under the
    key produced by ``simplify_title`` on the entry's ``title`` field. Two
    extra fields are attached to each entry before it is copied:

    * ``bibType`` -- the entry's type.
    * ``people`` -- a mapping from each person role in ``entry.persons`` to
      a list of dicts, one per person, holding only the non-empty name parts
      listed in ``name_types``.

    Entries whose titles simplify to the same key overwrite one another, so
    the last such entry wins. The resulting mapping is serialized as JSON
    and written, UTF-8 encoded, to ``anthology_data.json.gz``.

    NOTE(review): every entry is indexed by ``fields['title']`` directly;
    an entry without a title is not handled here.
    """
    source_path = 'anthology.bib'
    bib_parser = bibtex.Parser()
    print('Parsing bib file')
    database = bib_parser.parse_file(source_path)

    by_title = {}
    for record in tqdm.tqdm(database.entries.values()):
        title_key = simplify_title(record.fields['title'])
        record.fields['bibType'] = record.type
        record.fields['people'] = {}
        for role, members in record.persons.items():
            # One dict per person, keeping only the name parts that are
            # actually present.
            record.fields['people'][role] = [
                {
                    part: person.get_part(part)
                    for part in name_types
                    if len(person.get_part(part))
                }
                for person in members
            ]
        # Copy into a plain dict so the json module can serialize it.
        by_title[title_key] = dict(record.fields)

    with gzip.open('anthology_data.json.gz', 'wb') as out:
        serialized = json.dumps(by_title)
        out.write(serialized.encode('utf-8'))
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()