"""
Intrare: 5 artere; Iesire: 7 artere. Trafic intens - numar mare de autoturisme prezente la verificarile de frontiera pe sensul de iesire din tara. RECOMANDARE: pentru evitarea aglomerarilor, puteti folosi si alte puncte de trecere a frontierei!
Denumire | Timp | Info | Unelte
- [x] fetch tables/list
- [ ] fetch map
- [x] export CSV
- [ ] Detect fields inside Info:
- [ ] nr artere intrare / ieșire
- [ ] recomandari:
- [ ] trafic intens:
- [ ] tonaj admis
- [ ] orar
- [x] export json
- [ ] export GeoJson - see trafic-frontiere-markers.py
"""
import csv
import re

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.politiadefrontiera.ro/ro/traficonline/'
filename = 'data/politia-de-frontiera/trafic-frontiere'
zicolumns = ['Denumire', 'Timp', 'Info', 'Latitude', 'Longitude', 'Status', 'Tip vehicul', 'Sens']

# Define the base URL
base_url = url

# Define the sources with different url_vars (vehicle type x direction)
sources = [
    {"tip_vehicul": "Autoturisme", "sens": "Intrare", "url_vars": "?vw=2&vt=1&dt=1"},
    {"tip_vehicul": "Autoturisme", "sens": "Ieșire", "url_vars": "?vw=2&vt=1&dt=2"},
    {"tip_vehicul": "Camioane", "sens": "Intrare", "url_vars": "?vw=2&vt=2&dt=1"},
    {"tip_vehicul": "Camioane", "sens": "Ieșire", "url_vars": "?vw=2&vt=2&dt=2"},
]

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
def extract_fields(html, source):
    """Parse one traffic table (one vehicle type / direction) into data rows."""
    rows = []
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('div', {'id': 'maplist'}).find('table')
    for row in table.find('tbody').find_all('tr'):
        columns = row.find_all('td')
        denumire = columns[0].text.strip()
        timp_text = columns[1].text.strip()
        timp_match = re.search(r'(\d+) min\.', timp_text)
        timp = int(timp_match.group(1)) if timp_match else None
        info = columns[2].text.strip()
        # TODO: detect details inside Info (lanes, recommendations etc.) - see
        # the parse_info() sketch after this function. Sample Info text:
        #   "Intrare: 5 artere; Iesire: 6 artere. Trafic intens - numar mare de
        #   autoturisme prezente la verificarile de frontiera pe ambele sensuri.
        #   RECOMANDARE: pentru evitarea aglomerarilor, puteti folosi si alte
        #   puncte de trecere a frontierei!"
        # Idea: split by '.' into sentences, then check for 'recomandare',
        # 'trafic intens -', 'arter'.
        infoz = info.split('.')  # not used yet
        unelte_url = columns[3].find('a')['href'] if columns[3].find('a') else ''
        lat_long_match = re.search(r'(\d+\.\d+),(\d+\.\d+)', unelte_url)
        if lat_long_match:
            latitude = lat_long_match.group(1)
            longitude = lat_long_match.group(2)
        else:
            latitude = ''
            longitude = ''
        # The congestion level is encoded in the colour class of the time span (iw_<level>)
        timp_span = columns[1].find('span', {'class': re.compile(r'iwrow iws iw_\w+')})
        color_match = re.search(r'iw_(\w+)', timp_span['class'][2]) if timp_span else None
        level = color_match.group(1) if color_match else None
        rows.append([denumire, timp, info, latitude, longitude, level,
                     source['tip_vehicul'], source['sens']])
    return rows
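

# Sketch for the "detect fields inside Info" TODO: a possible parse_info() helper.
# It is not wired into extract_fields() yet; the keywords ('artere', 'trafic
# intens', 'RECOMANDARE') come from the sample Info text in the docstring, and
# the returned field names are assumptions, not an established schema.
# 'tonaj admis' and 'orar' are not handled yet (no sample text available).
def parse_info(info):
    details = {'artere_intrare': None, 'artere_iesire': None,
               'trafic_intens': False, 'recomandare': None}
    # "Intrare: 5 artere; Iesire: 7 artere."
    intrare = re.search(r'Intrare:\s*(\d+)\s*arter', info, re.IGNORECASE)
    iesire = re.search(r'Ie[sș]ire:\s*(\d+)\s*arter', info, re.IGNORECASE)
    if intrare:
        details['artere_intrare'] = int(intrare.group(1))
    if iesire:
        details['artere_iesire'] = int(iesire.group(1))
    # "Trafic intens - ..." flag
    details['trafic_intens'] = 'trafic intens' in info.lower()
    # "RECOMANDARE: ..." free-text recommendation, if present
    recomandare = re.search(r'RECOMANDARE:\s*(.+)', info, re.IGNORECASE)
    if recomandare:
        details['recomandare'] = recomandare.group(1).strip()
    return details
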
combined_data = []
allnice = 1
for source in sources:
    url = base_url + source['url_vars']
    response = None
    try:
        response = requests.get(url, headers=headers)
    except requests.exceptions.ConnectionError:
        # TODO: logging
        pass
    if response is not None and response.status_code == 200:
        # Accumulate rows across all sources (vehicle type x direction)
        combined_data += extract_fields(response.text, source)
    else:
        allnice = 0
        status = response.status_code if response is not None else 'connection error'
        print(f'Failed to fetch the URL for source: {source} ({status})')
        # TODO: logging
if allnice:
    with open(filename + '.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(zicolumns)
        writer.writerows(combined_data)
    csv_obj = pd.DataFrame(combined_data, columns=zicolumns)
    # csv_obj.to_json(filename + '.json', orient="records", date_format="epoch", double_precision=10, force_ascii=True, date_unit="ms", default_handler=None)
    csv_obj.to_json(filename + '.json', orient="records", force_ascii=False, indent=2)
    print(f'Scraped & saved to {filename}.csv/.json')
else:
    print('Nothing saved - at least one source could not be fetched')
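

# Sketch for the "export GeoJSON" TODO (the actual export is meant to live in
# trafic-frontiere-markers.py). This standalone helper only illustrates how the
# Latitude/Longitude columns could become a FeatureCollection; it is not called
# anywhere in this script, and the .geojson path below is just an example.
import json

def to_geojson(rows, path):
    features = []
    for denumire, timp, info, lat, lon, status, tip_vehicul, sens in rows:
        if not lat or not lon:
            continue  # skip crossings without coordinates in the Unelte link
        features.append({
            'type': 'Feature',
            'geometry': {'type': 'Point', 'coordinates': [float(lon), float(lat)]},
            'properties': {'Denumire': denumire, 'Timp': timp, 'Info': info,
                           'Status': status, 'Tip vehicul': tip_vehicul, 'Sens': sens},
        })
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({'type': 'FeatureCollection', 'features': features}, f,
                  ensure_ascii=False, indent=2)

# Example usage (assumed): to_geojson(combined_data, filename + '.geojson')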