-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxml_parser.py
37 lines (28 loc) · 1.67 KB
/
xml_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import re
import xml.etree.ElementTree as ElementTree
# watch a url
def html_to_xml_parser(text):
# (REMOVE <SCRIPT> to </script> and variations)
pattern = r'<[ ]*script.*?\/[ ]*script[ ]*>' # mach any char zero or more times
text = re.sub(pattern, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
# (REMOVE HTML <STYLE> to </style> and variations)
pattern = r'<[ ]*style.*?\/[ ]*style[ ]*>' # mach any char zero or more times
text = re.sub(pattern, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
# (REMOVE HTML <META> to </meta> and variations)
pattern = r'<[ ]*meta.*?>' # mach any char zero or more times
text = re.sub(pattern, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
# (REMOVE HTML COMMENTS <!-- to --> and variations)
pattern = r'<[ ]*!--.*?--[ ]*>' # mach any char zero or more times
text = re.sub(pattern, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
# (REMOVE HTML DOCTYPE <!DOCTYPE html to > and variations)
pattern = r'<[ ]*\![ ]*DOCTYPE.*?>' # mach any char zero or more times
text = re.sub(pattern, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
# (REMOVE HTML <HEAD> to </HEAD> and variations)
pattern = r'<[ ]*head.*?\/[ ]*head[ ]*>' # mach any char zero or more times
text = re.sub(pattern, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
# for some reason & is not a valid char in XML parse
text = text.replace('&', '&')
# HTML/XML is now "clean" so we can create an binary XML tree
parser = ElementTree.XMLParser(encoding="utf-8")
xml = ElementTree.fromstring(text, parser=parser)
return xml