forked from 26hzhang/DBLPParser
-
Notifications
You must be signed in to change notification settings - Fork 2
/
dblp_parser.py
79 lines (64 loc) · 2.5 KB
/
dblp_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
from collections import defaultdict
from lxml import etree
# Tag names of record fields that are meant to be dropped during parsing.
ignore_field = [
    'volume',
    'pages',
    'series',
    'note',
    'number',
]

# Tag names of fields that may legitimately occur several times in a single
# record. Note that 'pages' is also listed in ``ignore_field``.
mv_field = [
    'author',
    'address',
    'cdrom',
    'pages',
    'cite',
    'crossref',
    'editor',
    'school',
    'ee',
    'isbn',
    'url',
    'publisher',
]

# Tag names of fields that must occur at most once in a record.
sv_field = [
    'booktitle',
    'chapter',
    'journal',
    'month',
    'title',
    'year',
]

# Known record types (the tag name of a top-level record element).
genre = {
    'article',
    'inproceedings',
    'proceedings',
    'book',
    'incollection',
    'phdthesis',
    'mastersthesis',
    'www',
}
def iterate_dblp(dblp):
    """Stream the top-level records of a DBLP XML file one at a time.

    Adapted from stackoverflow: https://stackoverflow.com/a/42193997.

    A record is any direct child of the document's root element. The
    nesting depth is tracked so that a record is yielded only when its own
    closing tag is reached, even if one of its descendants carries the same
    tag name as the record itself.

    Parameters
    ----------
    dblp : str
        Path to dblp data file. The file is parsed with DTD loading and
        DTD validation enabled.

    Yields
    ------
    tuple
        ``(tag, element)`` where ``tag`` is the tag name of the record and
        ``element`` is the completely parsed record element. When the
        generator is resumed the root element is cleared, so processed
        records do not accumulate under the root; use the element before
        asking for the next record.
    """
    events = etree.iterparse(dblp, events=('start', 'end'),
                             dtd_validation=True, load_dtd=True)
    # The very first event is the 'start' of the root element.
    _, root = next(events)
    depth = 0  # nesting level below the root; 0 means "between records"
    record_tag = None
    for event, elem in events:
        if event == 'start':
            if depth == 0:  # a direct child of the root: a new record
                record_tag = elem.tag
            depth += 1
        elif depth == 0:
            # 'end' of the root element itself; there is nothing to yield.
            continue
        else:
            depth -= 1
            if depth == 0:  # the record's own closing tag
                yield record_tag, elem
                # Drop the finished record (and anything before it) from
                # the root so the tree does not grow with the file.
                root.clear()
def parse_record(dblp, output):
    """Convert each record of a dblp dataset into one JSON object per line.

    For every record produced by ``iterate_dblp`` the text of its child
    elements is collected by tag name, skipping the tags listed in
    ``ignore_field``. The record's tag is stored under the key ``'genre'``.
    Tags listed in ``sv_field`` are collapsed from a one-element list to the
    plain value; every other collected tag keeps a list of values. A record
    that has more than one value for a tag in ``sv_field`` is reported on
    standard output and is not written.

    Parameters
    ----------
    dblp : str
        Path to the dblp XML file to read.
    output : str
        Path to the file to write; it is created or truncated, and receives
        one JSON document per line.
    """
    single_valued = set(sv_field)  # loop-invariant, build it once
    with open(output, 'w') as ofp:
        # ``record_genre`` rather than ``genre`` so the module-level
        # ``genre`` set is not shadowed.
        for record_genre, record in iterate_dblp(dblp):
            attrs = defaultdict(list)
            for attr in record:
                # Compare the tag *name*: the element object itself never
                # equals a string, so testing ``attr`` directly would let
                # every ignored field through.
                if attr.tag in ignore_field:  # field to ignore
                    continue
                # NOTE(review): ``attr.text`` is only the text before the
                # first child element, so a field containing inline markup
                # is stored truncated (or as None) - confirm whether the
                # full text is wanted.
                attrs[attr.tag].append(attr.text)
            attrs['genre'] = record_genre
            try:  # make sure the record has valid format
                for key in set(attrs) & single_valued:
                    if not attrs[key]:  # empty is okay
                        continue
                    if len(attrs[key]) > 1:  # should have only one value
                        raise ValueError('Record {} has multi-value in {}. '
                                         'Ignored.'.format(attrs['title'],
                                                           key))
                    attrs[key] = attrs[key][0]
            except ValueError as err:
                print(err)
                continue
            # Serialise completely before writing so a line is never left
            # half-written.
            ofp.write(json.dumps(dict(attrs)))
            ofp.write('\n')