diff --git a/.gitignore b/.gitignore
index e9691a4..8f40220 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,3 +129,4 @@ dmypy.json
.pyre/
/venv/
/resources/
+/config.yaml
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 9c39d0c..9bade09 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -14,6 +14,13 @@
+
+
+
+
+
+
+
@@ -22,6 +29,7 @@
+
diff --git a/functions.py b/functions.py
index 6bf4562..c384f3c 100644
--- a/functions.py
+++ b/functions.py
@@ -1,10 +1,14 @@
import requests
import datetime as dt
from bs4 import BeautifulSoup
-import json
-import os
+import yaml
+from sqlalchemy import create_engine
+import pandas as pd
-TODAY = dt.datetime.today().strftime('%Y-%m-%d')
+with open('config.yaml') as f:
+ CONFIG = yaml.safe_load(f)
+
+YESTERDAY = (dt.datetime.today() - dt.timedelta(days=1)).strftime('%Y-%m-%d')
HOST = 'https://www.bustime.ru'
HEADERS = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
@@ -13,57 +17,98 @@
'Chrome/87.0.4280.141 Safari/537.36'
}
+pg_engine = create_engine(f'postgresql+psycopg2://postgres:{CONFIG["db_pass"]}@localhost/postgres')
+
-def get_cities(url: str = 'https://www.bustime.ru') -> None:
+def get_cities(url: str = 'https://www.bustime.ru') -> dict:
"""
- gets list of cities' hrefs from bustime.ru
- writes to resources/cities.txt
- returns None
+ gets dict of cities from database (if available)
+ else scrapes from website and writes to db
+ returns dict or None
"""
- soup = BeautifulSoup(requests.get(url).text, features="html.parser")
+ cities = pg_engine.execute("""select * from bustime.cities
+ where id in (7,21,28,46,54,5573,80,101,109,120,123,132,136,143)""").\
+ fetchall()
+
+ if len(cities) != 0:
+ return {x[1]: x[0] for x in cities}
+
+ else:
+
+ soup = BeautifulSoup(requests.get(url).text, features="html.parser")
+
+ cities = [x.get('href').strip('/') for x in soup.find("div", {"aria-label": " Список городов "}). \
+ find_all("a", {"class": 'item'})]
- cities = [x.get('href') for x in soup.find("div", {"aria-label": " Список городов "}). \
- find_all("a", {"class": 'item'})]
+ cities_df = pd.DataFrame(cities, columns=['name']).reset_index().rename(columns={'index': 'id'})
- if not os.path.exists('resources/'):
- os.mkdir('resources/')
+ cities_df.to_sql('cities',
+ pg_engine,
+ schema='bustime',
+ if_exists='append',
+ index=False)
- with open("resources/cities.txt", "w") as file:
- file.write(';'.join(cities))
+ cities = pg_engine.execute("select * from bustime.cities").fetchall()
+ return {x[1]: x[0] for x in cities}
-def get_routes(city: str) -> None:
+
+def get_routes(city: str, cities_dict: dict) -> list:
"""
- gets dict of route ids matched with route names
- writes to resources/*city*/routes.json
- return None
+ gets list of dicts with routes data from database (if available)
+ else scrapes from website and writes to db
+ returns list of dicts
"""
- soup = BeautifulSoup(requests.get(HOST + city + 'transport/' + TODAY).text, features="html.parser")
- routes = {int(x.get('value')): x.text for x in soup.find('select', {'name': 'bus_id'}). \
- find_all('option') if x.get('value') != '0'}
- if not os.path.exists('resources/' + city.strip('/')):
- os.mkdir('resources/' + city.strip('/'))
+ routes = pd.read_sql("select * from bustime.routes where city_id=%(cid)s", pg_engine, params={"cid": cities_dict[city]})
+
+ if len(routes) != 0:
+ routes_dict = routes.to_dict(orient='index')
+ return [routes_dict[i] for i in routes_dict.keys()]
+
+ else:
+ soup = BeautifulSoup(requests.get(HOST + '/' + city + '/' + 'transport/' + YESTERDAY).text, features="html.parser")
+ routes = {int(x.get('value')): x.text for x in soup.find('select', {'name': 'bus_id'}). \
+ find_all('option') if x.get('value') != '0'}
- with open('resources/' + city.strip('/') + '/routes.json', 'w', encoding='utf-8') as file:
- json.dump(routes, file, ensure_ascii=False)
+ routes_df = pd.DataFrame(routes.items(), columns=['id', 'name'])
+ routes_df['city_id'] = cities_dict[city]
+ routes_df.to_sql('routes',
+ pg_engine,
+ schema='bustime',
+ if_exists='append',
+ index=False)
-def post_ajax(city: str, bus_id: str = 0, date: str = TODAY) -> dict:
+ routes = pd.read_sql("select * from bustime.routes where city_id=%(cid)s", pg_engine, params={"cid": cities_dict[city]})
+ routes_dict = routes.to_dict(orient='index')
+ return [routes_dict[i] for i in routes_dict.keys()]
+
+
+def get_telemetry(city: str, bus_id: str = 0, date: str = YESTERDAY) -> None:
"""
- get point data from bustime.ru
+ gets telemetry data from bustime.ru
+ loads to database
:param city: str # city of search
- :param uid: str # vehicle unique id
:param bus_id: str # route id
:param date: str # date of search
- :return: list # with dicts of data
+ :return: None
credit: github.com/az09
"""
- data = {'city_slug': city.strip('/'),
+ data = {'city_slug': city,
'bus_id': bus_id,
'day': date}
- return requests.post(HOST + '/ajax/transport/', data=data).json()
-
+ response_df = pd.DataFrame(requests.post(HOST + '/ajax/transport/', data=data).json())
+ if not response_df.empty:
+ response_df['timestamp'] = date + ' ' + response_df['timestamp']
+ response_df['timestamp'] = pd.to_datetime(response_df['timestamp'])
+ response_df['upload_date'] = dt.datetime.today()
+
+ response_df.to_sql(f'telemetry_{YESTERDAY.replace("-", "_")}',
+ pg_engine,
+ schema='bustime',
+ if_exists='append',
+ index=False)
diff --git a/parser.py b/parser.py
index 6f07330..7f2d84c 100644
--- a/parser.py
+++ b/parser.py
@@ -1,42 +1,45 @@
-import os
-
from functions import *
from tqdm import tqdm
print("Start") # LOG
-print(TODAY)
-
-print("Loading cities.txt")
-if os.path.exists('resources/cities.txt'):
- print('LOG: found local file')
- with open('resources/cities.txt') as f:
- cities = f.read().split(';')
-else:
- print('LOG: fetching from bustime.ru')
- get_cities()
- with open('resources/cities.txt') as f:
- cities = f.read().split(';')
-
-print(f'LOG: fetched {len(cities)} cities')
-
-
-print('Getting route lists')
-for city in tqdm(cities):
- if not os.path.exists('resources'+city+'routes.json'):
- get_routes(city)
-print('Success\n\n')
-
-print('Getting telemetry')
-for city in cities:
- with open('resources'+city+'routes.json', encoding='utf-8') as f:
- routes = json.load(f)
-
- print('\t', city)
-
- for route in routes.keys(): # Move with clause to function?
- directory = 'resources' + city + TODAY
- if not os.path.exists(directory):
- os.mkdir(directory)
- with open(directory+'/'+route+'.json', 'w', encoding='utf-8') as file:
- response = post_ajax(city, route, TODAY)
- json.dump(response, file, ensure_ascii=False)
+print(YESTERDAY)
+
+print("Creating table")
+
+pg_engine.execute(f"""
+-- DROP TABLE bustime.telemetry_{YESTERDAY.replace('-', '_')};
+
+CREATE TABLE bustime.telemetry_{YESTERDAY.replace('-', '_')} (
+ uniqueid varchar(8) NOT NULL,
+ "timestamp" timestamp NOT NULL,
+ bus_id int4 NOT NULL,
+ heading int4 NULL,
+ speed int4 NULL,
+ lon float8 NOT NULL,
+ lat float8 NOT NULL,
+ direction int4 NULL,
+ gosnum varchar(64) NULL,
+ bortnum varchar(64) NULL,
+ probeg int4 NULL,
+ upload_date timestamp NOT NULL
+);
+
+
+-- bustime.telemetry_{YESTERDAY.replace('-', '_')} foreign keys
+
+ALTER TABLE bustime.telemetry_{YESTERDAY.replace('-', '_')} ADD CONSTRAINT telemetry_fk FOREIGN KEY (bus_id) REFERENCES bustime.routes(id);
+""")
+
+print('Loading cities')
+
+cities = get_cities()
+
+print(f'LOG: fetched {len(cities)} cities')
+
+for city in cities.keys():
+ print(city)
+ routes = get_routes(city, cities)
+
+ for route in tqdm(routes):
+ get_telemetry(city, route['id'])
+
diff --git a/requirements.txt b/requirements.txt
index af0774a..55f3152 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,8 +2,11 @@ beautifulsoup4==4.10.0
bs4==0.0.1
certifi==2021.10.8
charset-normalizer==2.0.7
-geojson==2.5.0
idna==3.3
requests==2.26.0
soupsieve==2.2.1
urllib3==1.26.7
+tqdm~=4.62.3
+PyYAML~=6.0
+SQLAlchemy~=1.4.26
+pandas~=1.3.4
\ No newline at end of file