-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcanada_scrape.py
79 lines (66 loc) · 2.42 KB
/
canada_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import csv
import xml.etree.ElementTree as ET
# Maximum number of characters of section text kept for the last CSV column;
# generate() cuts longer text at this length and appends '...'.
# NOTE(review): presumably chosen to stay under a spreadsheet cell-size
# limit -- confirm before changing.
csv_content_length = 32750
def get_text(section):
    """Return the text of ``section`` and its descendants as one string.

    Fragments are collected in document order, each followed by a single
    space:

    * the element's own leading text (``section.text``);
    * for every child without children, its text -- except ``DefinedTermFr``
      leaves, which are skipped;
    * for every child that has children, the result of calling this function
      on that child;
    * the text that follows each child's closing tag (``child.tail``), unless
      it consists only of whitespace.

    Args:
        section: An ``xml.etree.ElementTree.Element``.

    Returns:
        The concatenated text, or ``""`` when the element holds no text.
    """
    parts = []
    if section.text:
        parts.append(section.text + " ")

    for child in section:
        if len(child):
            # Element with children of its own: descend into it.
            parts.append(get_text(child))
        elif child.tag != 'DefinedTermFr' and child.text:
            parts.append(child.text + " ")

        # In mixed content the words after a child's closing tag live in
        # ``child.tail`` and belong to *this* element; without them sentences
        # lose everything that follows an inline element.  Whitespace-only
        # tails are just indentation between tags and are left out.
        if child.tail and child.tail.strip():
            parts.append(child.tail + " ")

    return "".join(parts)
def _quoted(value):
    """Return ``value`` wrapped in a pair of literal double quotes."""
    return "\"" + value + "\""


def generate():
    """Write the sections of ``canada.xml`` to ``canada_records.csv``.

    Parses ``canada.xml`` (relative to the current working directory), walks
    the direct children of its ``Body`` element and writes one CSV row for
    every child that is not a ``Heading``.  Each row has six columns:

    1-4. the titles of the most recent level 1 to 4 headings (setting a
         heading clears the columns of all deeper levels);
    5.   the text of the most recent ``Label`` element;
    6.   the text gathered with ``get_text`` from the child's other
         sub-elements (``HistoricalNote`` is ignored), cut to
         ``csv_content_length`` characters plus ``'...'`` when longer.

    Heading titles, labels and section text are wrapped in literal double
    quotes before being handed to ``csv.writer``.
    NOTE(review): ``csv.writer`` escapes those quotes again, so readers of
    the file see the quote characters as part of each value -- kept as is
    for compatibility with existing consumers of the file.
    """
    root = ET.parse('canada.xml').getroot()
    body = root.find('Body')

    # Heading titles per level and the last seen label; they persist from
    # one row to the next until overwritten.
    column1 = column2 = column3 = column4 = column5 = ""

    # newline='' is required by the csv module so that it controls line
    # endings itself (otherwise rows get an extra '\r' on some platforms).
    with open("canada_records.csv", 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        for child in body:
            if child.tag == 'Heading':
                heading_level = int(child.get('level'))
                title = child.find('TitleText')
                if title is None:
                    continue
                title_text = title.text or ""

                if heading_level == 1:
                    column1 = _quoted(title_text)
                    column2 = column3 = column4 = ""
                elif heading_level == 2:
                    column2 = _quoted(title_text)
                    column3 = column4 = ""
                elif heading_level == 3:
                    column3 = _quoted(title_text)
                    column4 = ""
                elif heading_level == 4:
                    column4 = _quoted(title_text)
                continue

            # Anything that is not a heading is treated as a section.
            pieces = []
            for sub_section in child:
                if sub_section.tag == 'Label':
                    # A Label without text must not break the concatenation.
                    column5 = _quoted(sub_section.text or "")
                elif sub_section.tag != 'HistoricalNote':
                    pieces.append(get_text(sub_section))
            column6 = "".join(pieces)

            # Truncate first, then quote, so that the value always carries
            # both the opening and the closing quote.
            if len(column6) > csv_content_length:
                column6 = column6[:csv_content_length] + '...'
            column6 = _quoted(column6)

            csv_writer.writerow(
                [column1, column2, column3, column4, column5, column6]
            )
# Run the conversion whenever this module is executed (this also happens on
# import, because the call is not guarded).
generate()