forked from eregs/regulations-parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc2xml.py
185 lines (146 loc) · 5.72 KB
/
doc2xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
""" doc2xml.py
Converts docx files representing a proposed rule into the type of XML we'd
expect from the Federal Register.
Executing: python doc2xml.py file.docx
Writes XML to stdout
Installation:
* Install libxml2 via a package manager
* pip install -e git+https://github.com/savoirfairelinux/python-docx.git#egg=docx
Known limitations:
* Ignores images, tables, equations, similar
* Isn't aware of some bullets and other paragraph markers
* Uses bold and italics (along with string matching) to determine what
headers exist. If the docx uses custom style sheets instead, it won't
work
* Only processes the preamble data, not the CFR changes
""" # noqa
from __future__ import print_function
import re
import sys
from itertools import tee
from lxml import etree
import docx
# Markers that is_heading() looks for at the start of a paragraph's stripped
# text: a capital letter plus period ("A.") for level 2, and one or two digits
# plus period ("1.", "12.") for level 3. Raw strings keep the backslashes
# from being read as (invalid) string escapes.
h2_re = re.compile(r'[A-Z]\.')
h3_re = re.compile(r'\d\d?\.')
def text_subel(root, tag, text, **attrs):
    """Append a `tag` child (with XML attributes `attrs`) to `root`, set its
    text to `text`, and return the new child -- all in one call."""
    child = etree.SubElement(root, tag, **attrs)
    child.text = text
    return child
def has_inline_label(par):
    """Tell whether `par` begins with a bold label followed by more runs.

    Returns False when the paragraph has fewer than two runs; otherwise
    returns the first run's `bold` value unchanged."""
    runs = par.runs
    if len(runs) <= 1:
        return False
    return runs[0].bold
def is_heading(par, level):
    """Decide whether paragraph `par` looks like a heading of depth `level`.

    * level 1: every run with non-blank text is bold
    * level 2: every run with non-blank text is italic and the stripped
      paragraph text starts with the `h2_re` marker
    * level 3: the stripped paragraph text starts with the `h3_re` marker
    * any other level: False

    Note that a paragraph without any non-blank runs satisfies the bold and
    italic checks, because all() over an empty sequence is True."""
    text_runs = [run for run in par.runs if run.text.strip()]
    stripped = par.text.strip()

    if level == 1:
        return all(run.bold for run in text_runs)
    if level == 2:
        all_italic = all(run.italic for run in text_runs)
        return all_italic and bool(h2_re.match(stripped))
    if level == 3:
        return bool(h3_re.match(stripped))
    return False
class Builder(object):
    """Consumes docx paragraphs front-to-back and appends the matching
    Federal Register style elements to `xml_root`.

    The paragraphs are held as a single forward iterator; `head_p` peeks at
    the next one without consuming it, while `takewhile`, `dropwhile` and
    `consume_text` advance the iterator."""

    # Sentinel returned by _peek() once every paragraph has been consumed
    _END = object()

    def __init__(self, paragraphs, xml_root):
        """`paragraphs` is any iterable of paragraph objects; `xml_root` is
        the element new children get attached to."""
        self._paragraphs = iter(paragraphs)     # always iterable
        self.xml_root = xml_root

    def _peek(self):
        """Return the next paragraph without consuming it, or `_END` when
        none remain."""
        # tee() splits the stream: one copy is read for the lookahead, the
        # untouched copy becomes the new stream
        lookahead, self._paragraphs = tee(self._paragraphs)
        return next(lookahead, self._END)

    def takewhile(self, fn):
        """Lazily yield (and consume) paragraphs for as long as `fn` is
        truthy for the upcoming one. The first paragraph failing `fn` is
        left in place. Stops quietly when the paragraphs run out."""
        while True:
            par = self._peek()
            # Checking the sentinel here (rather than letting StopIteration
            # escape) keeps the generator valid under PEP 479, where a
            # StopIteration raised inside a generator becomes a RuntimeError
            if par is self._END or not fn(par):
                return
            yield next(self._paragraphs)

    def dropwhile(self, fn):
        """Discard paragraphs for as long as `fn` is truthy for the upcoming
        one (or until none remain). Returns self to allow chaining."""
        for _ in self.takewhile(fn):
            pass
        return self

    def skip_header(self):
        """Discard paragraphs until one that has text, whose first run is
        bold, and whose text contains no digits. Returns self."""
        def not_header(par):
            return not (par.text.strip() and par.runs[0].bold and
                        not any(c.isdigit() for c in par.text))
        self.dropwhile(not_header)
        return self

    def skip_whitespace(self):
        """Discard paragraphs whose text is empty or blank. Returns self."""
        self.dropwhile(lambda p: not p.text.strip())
        return self

    @property
    def head_p(self):   # peek; non-destructive
        """The next paragraph, left unconsumed.

        Raises StopIteration when no paragraphs remain."""
        par = self._peek()
        if par is self._END:
            raise StopIteration
        return par

    def consume_text(self):
        """Consume the next paragraph and return its stripped text."""
        return next(self._paragraphs).text.strip()

    def intro_header(self, parent, start_p):
        """Create, under `parent`, the section element for the labelled
        paragraph `start_p` and return it.

        When the paragraph text starts with a known label, the mapped tag is
        used and the heading reads "<label>:". Otherwise an UNKNOWN element
        is created, headed by the text of the paragraph's first run."""
        label_to_tag = {
            'AGENCY': 'AGY',
            'ACTION': 'ACT',
            'SUMMARY': 'SUM',
            'DATES': 'DATES',
            'ADDRESSES': 'ADD',
            'FOR FURTHER INFORMATION CONTACT': 'FURINF',
        }
        label = next((known for known in label_to_tag
                      if start_p.text.startswith(known)),
                     None)
        if label:
            sub_el = etree.SubElement(parent, label_to_tag[label])
            text_subel(sub_el, 'HD', label + ':', SOURCE='HED')
        else:
            sub_el = etree.SubElement(parent, "UNKNOWN")
            text_subel(sub_el, 'HD', start_p.runs[0].text, SOURCE='HED')
        return sub_el

    def intro_sections(self, preamb):
        """Consume paragraphs up to (not including) the one starting with
        'SUPPLEMENTARY', adding them to `preamb`.

        A paragraph with an inline label opens a new section; the text of
        its remaining runs becomes that section's first P. Unlabelled
        paragraphs are appended to the current section, and are ignored if
        no section has been opened yet."""
        intro = self.takewhile(
            lambda p: not p.text.startswith('SUPPLEMENTARY'))
        current_section = None
        for par in intro:
            if has_inline_label(par):
                current_section = self.intro_header(preamb, par)
                sub_p = etree.SubElement(current_section, 'P')
                text = ''.join(r.text for r in par.runs[1:])
                # strip the beginning colon as it's part of the label
                sub_p.text = text.lstrip(':').strip()
            elif current_section is not None:
                sub_p = etree.SubElement(current_section, 'P')
                sub_p.text = par.text.strip()

    def preamble(self):
        """Build the PREAMB element from the upcoming paragraphs.

        Reads AGENCY, then SUBAGENCY (only when the next paragraph's text
        does not begin with a digit), then CFR, DEPDOC, RIN and SUBJECT,
        skipping blank paragraphs after each, and finally the labelled intro
        sections. Returns self."""
        preamb = etree.SubElement(self.xml_root, 'PREAMB')
        text_subel(preamb, 'AGENCY', self.consume_text())
        self.skip_whitespace()
        if not self.head_p.text[:1].isdigit():
            text_subel(preamb, 'SUBAGENCY', self.consume_text())
            self.skip_whitespace()
        for tag in ('CFR', 'DEPDOC', 'RIN', 'SUBJECT'):
            text_subel(preamb, tag, self.consume_text())
            self.skip_whitespace()
        self.intro_sections(preamb)
        return self

    def suplinf(self):
        """Build the SUPLINF element from the upcoming paragraphs.

        The next paragraph becomes the section heading. Paragraphs are then
        discarded until is_heading(p, 1) holds, and everything up to (not
        including) the paragraph starting with 'List of Subjects' is emitted
        as HD1/HD2/HD3 headings or P paragraphs; blank paragraphs are
        skipped."""
        suplinf = etree.SubElement(self.xml_root, 'SUPLINF')
        text_subel(suplinf, 'HD', self.consume_text(), SOURCE='HED')
        self.dropwhile(lambda p: not is_heading(p, 1))
        non_cfr = self.takewhile(
            lambda p: not p.text.startswith('List of Subjects'))
        for par in non_cfr:
            if not par.text.strip():
                continue
            elif is_heading(par, 1):
                text_subel(suplinf, 'HD', par.text.strip(), SOURCE='HD1')
            elif is_heading(par, 2):
                text_subel(suplinf, 'HD', par.text.strip(), SOURCE='HD2')
            elif is_heading(par, 3):
                text_subel(suplinf, 'HD', par.text.strip(), SOURCE='HD3')
            else:
                text_subel(suplinf, 'P', par.text.strip())
def parse(filename):
    """Read the docx file at `filename` and return a PRORULE element holding
    the preamble and supplementary-information parts of the FR notice."""
    document = docx.Document(filename)
    builder = Builder(document.paragraphs, etree.Element('PRORULE'))
    # Each of these steps hands back the builder, so they can be chained
    builder.skip_header().preamble().skip_whitespace().suplinf()
    return builder.xml_root
if __name__ == '__main__':
    # Exactly one argument (the docx path) is expected; anything else gets
    # the usage line instead
    if len(sys.argv) == 2:
        print(etree.tounicode(parse(sys.argv[1]), pretty_print=True))  # noqa
    else:
        print("Usage: python doc2xml.py file.docx")  # noqa