import argparse
import pandas as pd
import re
import sys
import urllib.request
import yaml
from urllib.parse import urljoin, urlencode
from bs4 import BeautifulSoup
from transliterate import translit
"""
Check for version of Python and exit with error when version is older than 3
"""
def check_version():
major_version = int(re.search('^\d+', sys.version).group(0))
if major_version < 3:
sys.exit('Use version 3 at least\n')
"""
Get content from specified URL
"""
def get_content(url, language):
req = urllib.request.Request(
url,
data=None,
headers={
'User-Agent':
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' +
'(KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
'Cookie': 'lang={}'.format(language)
}
)
f = urllib.request.urlopen(req)
return f.read().decode('utf-8')
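
# A minimal usage sketch (the URL here is illustrative, not taken from
# config.yml):
#   html = get_content('https://transphoto.org/', 'en')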
"""
Parse command line arguments or print usage information
"""
def parse_arguments():
parser = argparse.ArgumentParser(description='Gather and process public transportation data from transphoto.org')
parser.add_argument('-c', '--city',
help='number or name (default city is Moscow)',
default='')
parser.add_argument('-t', '--type',
help='transportation type (1-9 or name, default value is tram)',
default='')
parser.add_argument('-l', '--language',
dest='code',
help='ISO 639-1 language code (default is ru for Russian)',
default='')
parser.add_argument('-o', '--output',
dest='file',
help='output frequency table to file')
return parser.parse_args()
"""
Main function
"""
def main():
check_version()
years = []
total = 0
title = ''
with open('config.yml') as file:
config = yaml.full_load(file)
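    # A sketch of the config.yml structure implied by the lookups below;
    # the keys are the ones the code reads, the values are illustrative
    # assumptions:
    #
    #   language: ru
    #   url:
    #     cities: https://transphoto.org/...   # cities listing page
    #     wagons: https://transphoto.org/...   # vehicle listing page
    #   cities:
    #     default: <id>
    #     <city name>: <id>
    #   types:
    #     default: <id>
    #     <type name>: <id>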
"""
Parse arguments
and search for appropriate codes of city and transportation type
"""
args = parse_arguments()
language = args.code
lang = re.search(r'^([a-z]{2})', language)
language = lang.group(0) if lang else config.get('language');
    city = args.city
    if not re.match(r'^\d+$', city):
        # Try to guess the id by name: extend the dictionary of cities
        # from the config with data scraped from the web.
        cities = config.get('cities')
        soup = BeautifulSoup(
            get_content(config.get('url')['cities'], language), 'lxml')
        table = soup.find('div', attrs={'class': 'p20w'})
        for link in table.find_all('a'):
            city_name = link.text.lower()
            id_match = re.search(r'\d+', link.get('href'))
            if id_match:
                city_id = id_match.group(0)
                cities[city_name] = city_id
                # Register transliterated aliases for Cyrillic names,
                # including the common 'j' -> 'y' spelling variant.
                if re.match(r'[А-ЯЁ]', city_name, re.I):
                    transliterated = translit(city_name, reversed=True)
                    cities[transliterated] = city_id
                    if 'j' in transliterated:
                        cities[transliterated.replace('j', 'y')] = city_id
        city = cities.get(city.lower(), cities['default'])
    kind = args.type
    if not re.match(r'^\d+$', kind):
        kinds = config.get('types')
        kind = kinds.get(kind.lower(), kinds['default'])
    output = args.file
    if output and not re.search(r'\.(csv|html?|js(on)?|xlsx?)$', output, re.I):
        sys.exit('Output file {} has an unknown type. '
                 'Only CSV, HTML, JSON, and XLSX are available.\n'.format(output))
    service = 0  # only vehicles in passenger service
    url = urljoin(
        config.get('url')['wagons'],
        '?' + urlencode({'t': kind, 'cid': city, 'serv': service}))
    while url:
        soup = BeautifulSoup(get_content(url, language), 'lxml')
        # Follow the 'NextLink' anchor until the last page is reached.
        next_link = soup.find('a', attrs={'id': 'NextLink'})
        url = urljoin(url, next_link.get('href')) if next_link else None
        if not title:
            title = soup.find('h2').text
        # Get the tables wrapped by div.rtable.
        tables = soup.find_all('div', attrs={'class': 'rtable'})
        for table in tables:
            for row in table.find_all('tr'):
                # Suitable rows have class s1 or s11.
                row_classes = row.get('class') or []
                if not ('s1' in row_classes or 's11' in row_classes):
                    continue
                cells = row.find_all('td')
                if len(cells) < 4:
                    # The header row doesn't contain any <td> cells.
                    continue
                built = cells[3].text  # YYYY, mm.YYYY, or YYYY-mm
                matched = re.search(r'\d{4}', built)
                if matched:
                    years.append(int(matched.group(0)))
    if years:
        series = pd.Series(years)
        counts = pd.DataFrame({'count': series.value_counts()}).sort_index()
        if output:
            if re.search(r'\.csv$', output, re.I):
                counts.to_csv(output)
            elif re.search(r'\.html?$', output, re.I):
                counts.to_html(output)
            elif re.search(r'\.js(on)?$', output, re.I):
                counts.to_json(output)
            elif re.search(r'\.xlsx?$', output, re.I):
                counts.to_excel(output)
        else:
            print(title, '-' * len(title), sep='\n')
            for year, count in counts['count'].items():
                print('{:<4} {:>6} {}'.format(year, count, '#' * count))
            print('-' * 12)  # the combined width of the year and count columns
            print('Total {:>6}'.format(len(years)))
            print(
                'Mean: {:.5}, median: {:.5}, modes: {}'.format(
                    series.mean(),
                    series.quantile(),  # quantile() defaults to the median
                    series.mode().to_list(),
                )
            )
    else:
        print('No data')


if __name__ == '__main__':
    main()
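
# Example invocations (the city and type values are illustrative; any
# name or numeric id known to config.yml or to the site will work):
#
#   python3 compute.py
#   python3 compute.py --city moscow --type tram --language en
#   python3 compute.py -c moscow -t tram -o counts.csv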