Merge pull request #1 from redcat2885/dev
DB integration
mkovalyshev authored Nov 5, 2021
2 parents ca6aaf9 + c78f09b commit cf47842
Showing 5 changed files with 131 additions and 71 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -129,3 +129,4 @@ dmypy.json
 .pyre/
 /venv/
 /resources/
+/config.yaml
8 changes: 8 additions & 0 deletions .idea/workspace.xml

Some generated files are not rendered by default.

109 changes: 77 additions & 32 deletions functions.py
@@ -1,10 +1,14 @@
 import requests
 import datetime as dt
 from bs4 import BeautifulSoup
-import json
-import os
+import yaml
+from sqlalchemy import create_engine
+import pandas as pd
 
-TODAY = dt.datetime.today().strftime('%Y-%m-%d')
+with open('config.yaml') as f:
+    CONFIG = yaml.load(f, Loader=yaml.FullLoader)
+
+YESTERDAY = (dt.datetime.today() - dt.timedelta(days=1)).strftime('%Y-%m-%d')
 HOST = 'https://www.bustime.ru'
 HEADERS = {
     'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
@@ -13,57 +17,98 @@
               'Chrome/87.0.4280.141 Safari/537.36'
 }
 
+pg_engine = create_engine(f'postgresql+psycopg2://postgres:{CONFIG["db_pass"]}@localhost/postgres')
+
 
-def get_cities(url: str = 'https://www.bustime.ru') -> None:
+def get_cities(url: str = 'https://www.bustime.ru') -> dict:
     """
-    gets list of cities' hrefs from bustime.ru
-    writes to resources/cities.txt
-    returns None
+    gets dict of cities from database (if available)
+    else scrapes from website and writes to db
+    returns dict or None
     """
 
-    soup = BeautifulSoup(requests.get(url).text, features="html.parser")
+    cities = pg_engine.execute("""select * from bustime.cities
+    where id in (7,21,28,46,54,5573,80,101,109,120,123,132,136,143)""").\
+        fetchall()
+
+    if len(cities) != 0:
+        return dict(tuple([tuple([x[1], x[0]]) for x in cities]))
+
+    else:
+
+        soup = BeautifulSoup(requests.get(url).text, features="html.parser")
 
-    cities = [x.get('href') for x in soup.find("div", {"aria-label": " Список городов "}). \
-        find_all("a", {"class": 'item'})]
+        cities = [x.get('href').strip('/') for x in soup.find("div", {"aria-label": " Список городов "}). \
+            find_all("a", {"class": 'item'})]
 
-    if not os.path.exists('resources/'):
-        os.mkdir('resources/')
+        cities_df = pd.DataFrame(cities, columns=['name']).reset_index().rename(columns={'index': 'id'})
 
-    with open("resources/cities.txt", "w") as file:
-        file.write(';'.join(cities))
+        cities_df.to_sql('cities',
+                         pg_engine,
+                         schema='bustime',
+                         if_exists='append',
+                         index=False)
+
+        cities = pg_engine.execute("select * from bustime.cities").fetchall()
+
+        return dict(tuple([tuple([x[1], x[0]]) for x in cities]))
 
-def get_routes(city: str) -> None:
+
+def get_routes(city: str, cities_dict: dict) -> list:
     """
-    gets dict of route ids matched with route names
-    writes to resources/*city*/routes.json
-    return None
+    gets list of dicts with routes data from database (if available)
+    else scrapes from website and writes to db
+    returns list of dicts
     """
-    soup = BeautifulSoup(requests.get(HOST + city + 'transport/' + TODAY).text, features="html.parser")
-    routes = {int(x.get('value')): x.text for x in soup.find('select', {'name': 'bus_id'}). \
-        find_all('option') if x.get('value') != '0'}
 
-    if not os.path.exists('resources/' + city.strip('/')):
-        os.mkdir('resources/' + city.strip('/'))
+    routes = pd.read_sql(f"select * from bustime.routes where city_id={cities_dict[city]}", pg_engine)
 
-    with open('resources/' + city.strip('/') + '/routes.json', 'w', encoding='utf-8') as file:
-        json.dump(routes, file, ensure_ascii=False)
+    if len(routes) != 0:
+        routes_dict = routes.to_dict(orient='index')
+        return [routes_dict[i] for i in routes_dict.keys()]
+
+    else:
+        soup = BeautifulSoup(requests.get(HOST + '/' + city + '/' + 'transport/' + YESTERDAY).text, features="html.parser")
+        routes = {int(x.get('value')): x.text for x in soup.find('select', {'name': 'bus_id'}). \
+            find_all('option') if x.get('value') != '0'}
+
+        routes_df = pd.DataFrame(routes.items(), columns=['id', 'name'])
+        routes_df['city_id'] = cities_dict[city]
+
+        routes_df.to_sql('routes',
+                         pg_engine,
+                         schema='bustime',
+                         if_exists='append',
+                         index=False)
+
+        routes = pd.read_sql(f"select * from bustime.routes where city_id={cities_dict[city]}", pg_engine)
+        routes_dict = routes.to_dict(orient='index')
+        return [routes_dict[i] for i in routes_dict.keys()]
 
-def post_ajax(city: str, bus_id: str = 0, date: str = TODAY) -> dict:
+
+def get_telemetry(city: str, bus_id: str = 0, date: str = YESTERDAY) -> None:
     """
-    get point data from bustime.ru
+    gets telemetry data from bustime.ru
+    loads to database
     :param city: str # city of search
-    :param uid: str # vehicle unique id
     :param bus_id: str # route id
     :param date: str # date of search
-    :return: list # with dicts of data
+    :return: None
     credit: github.com/az09
     """
 
-    data = {'city_slug': city.strip('/'),
+    data = {'city_slug': city,
             'bus_id': bus_id,
             'day': date}
 
-    return requests.post(HOST + '/ajax/transport/', data=data).json()
+    response_df = pd.DataFrame(requests.post(HOST + '/ajax/transport/', data=data).json())
+    if len(response_df) != 0:
+        response_df['timestamp'] = date + ' ' + response_df['timestamp']
+        response_df['timestamp'] = pd.to_datetime(response_df['timestamp'])
+        response_df['upload_date'] = dt.datetime.today()
+
+        response_df.to_sql(f'telemetry_{YESTERDAY.replace("-", "_")}',
+                           pg_engine,
+                           schema='bustime',
+                           if_exists='append',
+                           index=False)
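Note: functions.py now expects a config.yaml beside the code (ignored by the .gitignore change above), and the only key the committed code reads from it is db_pass, used to build the connection string. A minimal bootstrap sketch — the placeholder value and the use of yaml.safe_dump are assumptions, not part of the commit:

# Sketch: write a starter config.yaml (db_pass is the only key
# functions.py reads; REPLACE_ME is a placeholder, not a real password).
import yaml

with open('config.yaml', 'w') as f:
    yaml.safe_dump({'db_pass': 'REPLACE_ME'}, f)
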
79 changes: 41 additions & 38 deletions parser.py
@@ -1,42 +1,45 @@
-import os
-
 from functions import *
 from tqdm import tqdm
 
 print("Start") # LOG
-print(TODAY)
-
-print("Loading cities.txt")
-if os.path.exists('resources/cities.txt'):
-    print('LOG: found local file')
-    with open('resources/cities.txt') as f:
-        cities = f.read().split(';')
-else:
-    print('LOG: fetching from bustime.ru')
-    get_cities()
-    with open('resources/cities.txt') as f:
-        cities = f.read().split(';')
-
-print(f'LOG: fetched {len(cities)} cities')
-
-
-print('Getting route lists')
-for city in tqdm(cities):
-    if not os.path.exists('resources'+city+'routes.json'):
-        get_routes(city)
-print('Success\n\n')
-
-print('Getting telemetry')
-for city in cities:
-    with open('resources'+city+'routes.json', encoding='utf-8') as f:
-        routes = json.load(f)
-
-    print('\t', city)
-
-    for route in routes.keys(): # Move with clause to function?
-        directory = 'resources' + city + TODAY
-        if not os.path.exists(directory):
-            os.mkdir(directory)
-        with open(directory+'/'+route+'.json', 'w', encoding='utf-8') as file:
-            response = post_ajax(city, route, TODAY)
-            json.dump(response, file, ensure_ascii=False)
+print(YESTERDAY)
+
+print("Creating table")
+
+pg_engine.execute(f"""
+-- DROP TABLE bustime.telemetry_{YESTERDAY.replace('-', '_')};
+CREATE TABLE bustime.telemetry_{YESTERDAY.replace('-', '_')} (
+uniqueid varchar(8) NOT NULL,
+"timestamp" timestamp NOT NULL,
+bus_id int4 NOT NULL,
+heading int4 NULL,
+speed int4 NULL,
+lon float8 NOT NULL,
+lat float8 NOT NULL,
+direction int4 NULL,
+gosnum varchar(64) NULL,
+bortnum varchar(64) NULL,
+probeg int4 NULL,
+upload_date timestamp NOT NULL
+);
+-- bustime.telemetry_{YESTERDAY.replace('-', '_')} foreign keys
+ALTER TABLE bustime.telemetry_{YESTERDAY.replace('-', '_')} ADD CONSTRAINT telemetry_fk FOREIGN KEY (bus_id) REFERENCES bustime.routes(id);
+""")
+
+print('Loading cities')
+
+cities = get_cities()
+
+print(f'LOG: fetched {len(list(cities.keys()))} cities')
+
+for city in cities.keys():
+    print(city)
+    routes = get_routes(city, cities)
+
+    for route in tqdm(routes):
+        get_telemetry(city, route['id'])

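The CREATE TABLE above only provisions the per-day telemetry table. get_cities() and get_routes() select from bustime.cities and bustime.routes before ever writing to them, so the schema and both lookup tables have to exist before the first run, and the commit does not include that DDL. A hedged one-time bootstrap sketch, with column types inferred from the DataFrames built in functions.py and from the telemetry foreign key (which needs routes.id to be a primary key):

# One-time bootstrap (assumed, not part of this commit): create the schema and
# the lookup tables the parser queries on first run. Types are inferred from
# the code, not taken from the repo; PASSWORD is a placeholder.
from sqlalchemy import create_engine

engine = create_engine('postgresql+psycopg2://postgres:PASSWORD@localhost/postgres')
engine.execute("""
CREATE SCHEMA IF NOT EXISTS bustime;
CREATE TABLE IF NOT EXISTS bustime.cities (
    id   int4 PRIMARY KEY,
    name varchar(64) NOT NULL
);
CREATE TABLE IF NOT EXISTS bustime.routes (
    id      int4 PRIMARY KEY,
    name    varchar(64) NOT NULL,
    city_id int4 REFERENCES bustime.cities (id)
);
""")
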
5 changes: 4 additions & 1 deletion requirements.txt
@@ -2,8 +2,11 @@ beautifulsoup4==4.10.0
 bs4==0.0.1
 certifi==2021.10.8
 charset-normalizer==2.0.7
-geojson==2.5.0
 idna==3.3
 requests==2.26.0
 soupsieve==2.2.1
 urllib3==1.26.7
+tqdm~=4.62.3
+PyYAML~=6.0
+SQLAlchemy~=1.4.26
+pandas~=1.3.4
