forked from BigDevil82/docx_tag_parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tag_parser.py
171 lines (135 loc) · 5.65 KB
/
tag_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import enum
import re
import docx
from data_manager import DataContextManager
from util import remove_element, replace_text
@enum.unique
class IfElseSearchStatus(enum.Enum):
    """
    Progress of the scan for block-level IF / ELSE tags.
    """

    # Initial state assigned by the parser before any block tag was matched.
    NONE = 0
    # An opening IF tag was matched.
    FOUND_IF = 1
    # An ELSE tag was matched.
    FOUND_ELSE = 2
    # A closing IF tag was matched.
    FOUND_END = 3
class IfElseTagParser:
    """
    Parse IF / ELSE tags in a docx template document.

    Content guarded by a tag is kept when the tag's condition is satisfied
    and removed otherwise; the tag markers themselves are always removed.

    Two tag forms are recognised:

    - block tags: ``<IF =cond>``, ``<ELSE>`` (optional) and ``</IF>``, each
      at the start of its own paragraph, guarding the elements in between
    - inline tags: ``<IF =cond>text</IF>`` or
      ``<IF =cond>text<ELSE>other</IF>`` inside one paragraph

    ``cond`` is either ``key`` (truthiness of the value stored under key)
    or ``key=value`` (equality with the literal string ``value``).

    Args:
    - doc (docx.Document): the docx document to parse
    - data_manager (DataContextManager): the data manager the condition
      values are read from (through ``get_json_value``)
    """

    # The annotation is quoted so that defining the class does not require
    # DataContextManager to be resolvable at definition time.
    def __init__(self, doc, data_manager: "DataContextManager") -> None:
        self.doc = doc
        self.data_manager = data_manager
        self.status = IfElseSearchStatus.NONE  # block tag search status
        self.if_satisfied = False  # result of the current block IF condition
        self.element_discard = False  # whether the last element was removed
        self._comiple_regex()

    def _comiple_regex(self):
        """
        Compile the tag patterns once.

        NOTE: the misspelled method name is kept so existing callers and
        subclasses keep working.
        """
        opening = r"<IF *=([\w=]+)>"
        # A branch body stops at the next <ELSE> or </IF>. The tempered dot
        # keeps one match from swallowing a neighbouring inline tag, which
        # a greedy ``(.+)`` does when a paragraph holds several tags.
        branch = r"((?:(?!<ELSE>|</IF>).)+)"

        self.if_start = re.compile(r"^\s*<IF *=([\w=]+)>")
        self.else_tag = re.compile(r"^\s*<ELSE>")  # else tag is optional
        self.if_end = re.compile(r"^\s*</IF>")
        self.inline_if = re.compile(opening + branch + r"</IF>")
        self.inline_if_else = re.compile(
            opening + branch + r"<ELSE>" + branch + r"</IF>"
        )

    def check_if_condition(self, condition: str):
        """
        Check whether a tag condition is satisfied.

        The key that was looked up is also stored in ``self.check_key``.

        Args:
        - condition (str): the condition str in the IF tag, ``key`` or
          ``key=value``

        Returns:
        - for ``key``: the value the data manager returns for key (the
          callers use its truthiness)
        - for ``key=value``: bool, whether that value equals the string
          ``value`` (plain ``==``, so a non-string value such as the
          number 1 does not equal ``"1"``)
        """
        # partition splits on the first "=" only; the tag pattern allows
        # further "=" characters, which made a two-way split() unpack fail.
        key, separator, expected = condition.partition("=")
        self.check_key = key
        real_value = self.data_manager.get_json_value(key)
        if not separator:
            return real_value
        return real_value == expected

    def check_inline_if(self, paragraph):
        """
        Resolve every inline IF / IF-ELSE tag of a paragraph.

        Each tag is replaced by the text of the branch its condition
        selects; a plain IF whose condition fails is replaced by "".

        Args:
        - paragraph: the paragraph to rewrite, or None (nothing is done)
        """
        if paragraph is None:
            return

        # Matches are taken from one snapshot of the text because
        # replace_text edits the paragraph while the matches are walked.
        text = paragraph.text

        # The two patterns match disjoint spans (a plain IF match cannot
        # contain <ELSE>), so both kinds are handled in the same paragraph.
        for match in self.inline_if_else.finditer(text):
            condition, if_val, else_val = match.groups()
            kept = if_val if self.check_if_condition(condition) else else_val
            replace_text(paragraph, match.group(), kept)

        for match in self.inline_if.finditer(text):
            condition, value = match.groups()
            kept = value if self.check_if_condition(condition) else ""
            replace_text(paragraph, match.group(), kept)

    def remove_element(self, element):
        """
        Remove an element via ``util.remove_element`` and record the removal
        in ``self.element_discard``.

        Args:
        - element: the docx element to remove
        """
        remove_element(element)
        self.element_discard = True

    def _check_status(self, docx_element, paragraph=None):
        """
        Handle an element according to the current block tag state.

        Inside the active branch the element is kept (its inline tags are
        resolved); inside the inactive branch it is removed.

        Args:
        - docx_element: the docx element to check
        - paragraph: the paragraph wrapping the element, None when the
          element is not a paragraph

        Returns:
        - bool: True if the element is inside an IF or ELSE branch
        """
        if self.status == IfElseSearchStatus.FOUND_IF:
            keep = bool(self.if_satisfied)
        elif self.status == IfElseSearchStatus.FOUND_ELSE:
            keep = not self.if_satisfied
        else:
            # NONE or FOUND_END: not inside any block
            return False

        if keep:
            self.check_inline_if(paragraph)
        else:
            self.remove_element(docx_element)
        return True

    def process_element(self, docx_element):
        """
        Entry point to process one docx element.

        Elements whose guarding condition is satisfied are kept, the others
        are removed, and paragraphs holding a block tag are removed as well.
        ``self.element_discard`` tells afterwards whether this element was
        removed.

        Args:
        - docx_element: the docx element to process
        """
        self.element_discard = False

        if not isinstance(docx_element, docx.oxml.text.paragraph.CT_P):
            # Non-paragraph elements are not scanned for tags; they are only
            # kept or removed according to the enclosing block.
            self._check_status(docx_element, None)
            return

        paragraph = docx.text.paragraph.Paragraph(docx_element, self.doc)
        text = paragraph.text

        # A paragraph that also contains "</IF>" starts with an inline tag,
        # not a block opener. Treating it as a block would delete the
        # paragraph and leave the parser waiting for a closing tag.
        if (res := self.if_start.search(text)) and "</IF>" not in text:
            self.status = IfElseSearchStatus.FOUND_IF
            self.if_satisfied = self.check_if_condition(res.group(1))
            self.remove_element(docx_element)
            return

        if self.else_tag.search(text):
            self.status = IfElseSearchStatus.FOUND_ELSE
            self.remove_element(docx_element)
            return

        if self.if_end.search(text):
            self.status = IfElseSearchStatus.FOUND_END
            self.remove_element(docx_element)
            return

        if self._check_status(docx_element, paragraph):
            return

        # Outside any block: only inline tags can apply.
        self.check_inline_if(paragraph)
if __name__ == "__main__":
    # Demo run: resolve the IF / ELSE tags of template.docx with the values
    # from data.json and write the result to report_parsed.docx.
    doc = docx.Document("template.docx")
    data_manager = DataContextManager("data.json")  # not provided in this repo
    parser = IfElseTagParser(doc, data_manager)
    # Iterate over a snapshot of the body: process_element may remove
    # elements, and removing nodes from a tree while iterating it live
    # risks skipping siblings.
    for element in list(doc.element.body):
        parser.process_element(element)
    doc.save("report_parsed.docx")