-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
288 lines (239 loc) · 9.73 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
'''
UCF Crimes: utils.py
Various functions to assist with backend & discord bot functionality.
Written by Ethan Frakes and Maverick Reynolds
'''
import discord
import json
import re
import pandas as pd
import requests
import editdistance
from math import radians, sin, cos, sqrt, atan2
from datetime import datetime
from sqlalchemy import create_engine
from sqlalchemy.engine.base import Engine
from configparser import ConfigParser
async def bot_help(interaction: discord.Interaction):
    '''
    Help command for discord bot with embed showing all possible commands.

    Builds a red embed that lists the /crimes, /heatmap, /locations and /ping
    commands with a usage example, and sends it as the response to `interaction`.
    '''
    embed = discord.Embed(
        title = "UCF Crimes Help",
        description = "Available bot commands:\n\n"
            + "/crimes {MM/DD/YY} or {location} or {address} or {crime title} or {disposition} \n"
            + " - View all crime reports searched with the given parameter. \n"
            + " - Ex: /crimes 3/30/24 OR /crimes Nike 106 \n\n"
            + "/heatmap {main} or {downtown} or {rosen} \n"
            + " - View heatmap of all crime reports in database within the area of the selected campus. \n"
            # The example must use the command it documents (was "/crimes main")
            + " - Ex: /heatmap main \n\n"
            + "/locations \n"
            + " - List all available locations and addresses available to query. \n\n"
            + "/ping \n"
            + " - Test if bot is online by pinging.",
        color = discord.Color.red()
    )

    await interaction.response.send_message(embed=embed)
def setup_db(main_config: ConfigParser) -> Engine:
    '''
    Connect to the PostgreSQL database

    Reads host, database, user and password from the [POSTGRESQL] section of
    `main_config`, builds a postgresql:// URI and returns the SQLAlchemy engine
    created from it.
    '''
    # Local import so this function does not rely on a new file-level import
    from urllib.parse import quote

    host = main_config.get('POSTGRESQL', 'host')
    database = main_config.get('POSTGRESQL', 'database')
    user = main_config.get('POSTGRESQL', 'user')
    password = main_config.get('POSTGRESQL', 'password')

    # Credentials may contain characters such as '@', ':' or '/' that would
    # otherwise be parsed as URI delimiters, so percent-encode them.
    safe_user = quote(user, safe='')
    safe_password = quote(password, safe='')

    db_uri = f'postgresql://{safe_user}:{safe_password}@{host}/{database}'
    return create_engine(db_uri)
def is_valid_date(date_string: str) -> bool:
    '''
    Tells whether the token can be parsed as a date.
    Both mm/dd/yy and mm/dd/yyyy are accepted.
    '''
    def _parses_with(date_format: str) -> bool:
        # strptime raises ValueError when the token does not fit the format
        try:
            datetime.strptime(date_string, date_format)
        except ValueError:
            return False
        return True

    # Two-digit year form is tried first, then the four-digit year form
    return any(_parses_with(fmt) for fmt in ('%m/%d/%y', '%m/%d/%Y'))
def is_valid_time_label(time_str: str) -> bool:
    '''
    Tells whether the token is a 24-hour HH:MM time.
    '''
    try:
        datetime.strptime(time_str, '%H:%M')
    except ValueError:
        # Token does not fit the HH:MM pattern
        return False
    else:
        return True
def is_valid_case_id(case_id_str: str) -> bool:
    '''
    Tells whether the input string looks like a case id.
    The tokenizer uses this to confirm that a delimiter really is the disposition
    and not just a word inside the crime title.
    '''
    # Four digits, a dash, then either four digits or three letters + two digits
    all_digit_form = r'^\d{4}-\d{4}$'
    lettered_form = r'^\d{4}-[A-Za-z]{3}\d{2}$'

    return any(
        re.match(id_pattern, case_id_str) is not None
        for id_pattern in (all_digit_form, lettered_form)
    )
def titlize(title: str) -> str:
    '''
    Returns title function applied to string with exceptions found in title_exceptions.json

    Each whitespace-separated token that appears as a key in title_exceptions.json
    is replaced by the mapped value; every other token gets str.title() applied.
    Tokens are re-joined with single spaces.
    '''
    # JSON is UTF-8 by specification; be explicit instead of relying on the
    # platform default (matches how emojis.json is opened elsewhere in this file)
    with open('title_exceptions.json', encoding='utf-8') as f:
        exceptions: dict = json.load(f)

    words = [
        exceptions[token] if token in exceptions else token.title()
        for token in title.split()
    ]
    return ' '.join(words).strip()
def get_emojis(title: str) -> str:
    '''
    Builds the emoji suffix for a crime title.
    Every key of emojis.json is used as a case-insensitive regex against the
    title; the emojis of each matching key are appended, skipping duplicates.
    '''
    with open('emojis.json', 'r', encoding="utf-8") as emoji_file:
        emoji_map: dict = json.load(emoji_file)

    suffix = ''
    for pattern, candidates in emoji_map.items():
        if re.search(pattern, title, re.IGNORECASE) is None:
            continue

        for candidate in candidates:
            # Appended one at a time: the duplicate check reads the suffix built so far
            if candidate in suffix:
                continue
            suffix += f'{candidate}'

    return suffix
def get_lat_lng_from_address(address: str, google_maps_api_key: str) -> tuple[float, float] | tuple[None, None]:
'''
Uses google geocoding endpoint to get lat, lng from address.
Will prefer locations within box containing UCF and Downtown
(Does not totally restrict results to this box).
If address includes predetermined keyword(s), it will use given results
'''
BL_BOUND = (28.522318, -81.407249)
TR_BOUND = (28.644500, -81.155722)
KEY_PHRASES = {
'B8': (28.5939606, -81.2014182),
'36 PINE ST W': (28.5412345, -81.3797360),
'ON CAMPUS': (28.6024367, -81.2000568),
'PLAZA DR E':(28.6069698, -81.1967868),
'PLAZA DR W': (28.6074074, -81.1980356),
'GEMINI/SCORPIUS': (28.6018854, -81.1944728),
'KINGS KNIGHT': (28.6104158, -81.2154757),
'KROSSING': (28.6113891, -81.2113697),
'410 TERRY AVE N': (28.537944, -81.386917)
}
endpoint = 'https://maps.googleapis.com/maps/api/geocode/json'
params = {
'address': address,
'key': google_maps_api_key,
'bounds': f'{BL_BOUND[0]},{BL_BOUND[1]}|{TR_BOUND[0]},{TR_BOUND[1]}'
}
# Use key phrases if they are in address
for phrase in KEY_PHRASES.keys():
if phrase in address:
return KEY_PHRASES[phrase]
result = requests.get(endpoint, params=params)
if result.status_code not in range(200, 299) or result.json()['status'] == 'ZERO_RESULTS':
# Request failed or no results found
return None, None
else:
# Return lat, lng
lat, lng = result.json()['results'][0]['geometry']['location'].values()
return round(lat, 7), round(lng, 7)
def haversine_form(latitude_center: float, longitude_center: float, latitude_place: float, longitude_place: float) -> float:
    '''
    Returns the great-circle distance in kilometers between two points
    given as (latitude, longitude) pairs in degrees, using the haversine formula
    with an Earth radius of 6371 km.
    '''
    lat1 = radians(latitude_center)
    lon1 = radians(longitude_center)
    lat2 = radians(latitude_place)
    lon2 = radians(longitude_place)

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Rounding can push a marginally outside [0, 1] (e.g. near-antipodal points),
    # which would make the square roots below raise a math domain error
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = 6371 * c  # Radius of the Earth in kilometers

    return distance
def get_place_name(lat: float, lng: float, GMAPS_API_KEY: str, radius: int=100) -> str | None:
'''
Sends request to Google Places API for all place markers within 100 meters
of address coordinates; finds closest that is not a road or transit station.
'''
url = f'https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={lat},{lng}&radius={radius}&key={GMAPS_API_KEY}'
r = requests.get(url)
# print(r.text)
json_r = json.loads(r.text)
places_dict = {}
for result in json_r["results"]:
if "route" not in result["types"] and "transit_station" not in result["types"]:
lat_place = result["geometry"]["location"]["lat"]
lon_place = result["geometry"]["location"]["lng"]
distance = haversine_form(lat, lng, lat_place, lon_place)
# print(f"Distance for {result['name']} = {distance}")
places_dict[distance] = result['name']
if len(places_dict.keys()) > 0:
return places_dict[min(places_dict.keys())]
else:
return None
def change_all_addresses(GMAPS_API_KEY: str) -> None:
    '''
    Rewrites crimes.csv in place: for every row whose address equals its place
    upper-cased, geocodes the address, looks up the closest place name and,
    if one is found, sets place to "near <place name>".
    '''
    crimes = pd.read_csv('crimes.csv', index_col=0)

    for idx, row in crimes.iterrows():
        address, place = row["address"], row["place"]
        # Empty cells are read as NaN (a float), which has no .upper()
        if not isinstance(address, str) or not isinstance(place, str):
            continue
        if address != place.upper():
            continue

        lat, lng = get_lat_lng_from_address(address, GMAPS_API_KEY)
        # Geocoding failed; there are no coordinates to search around
        if lat is None or lng is None:
            continue

        if place_name := get_place_name(lat, lng, GMAPS_API_KEY):
            crimes.at[idx, "place"] = f"near {place_name}"
            print(place_name)

    crimes.to_csv('crimes.csv')
def address_to_place(address: str,
                     lat: float, lng: float,
                     GMAPS_API_KEY: str,
                     typo_tolerance=1) -> str | None:
    '''
    Takes address and compares it to the locations.json file
    Robust against varying word positions and typo errors

    A key of locations.json matches when every one of its tokens is within
    `typo_tolerance` edits of some token of the normalized address. If both the
    address and the key start with a digit, their first tokens must be equal.
    If no key matches, the closest place from get_place_name is used as a backup
    and returned as "near <place name>".
    Otherwise, returns the titled version of the address
    '''
    # Make proper substitutions to change cardinal directions and other syntax
    address = ' ' + address + ' '
    address = re.sub(r'\.', '', address) # Remove periods
    subs = {
        ' W ': ' WEST ',
        ' E ': ' EAST ',
        ' N ': ' NORTH ',
        ' S ': ' SOUTH ',
        'BLD' : 'BLVD',
        r' ?(?: and |\/) ?': ' & ' # Change intersection of streets to &
    }
    for pattern, replacement in subs.items():
        address = re.sub(pattern, replacement, address, flags=re.IGNORECASE)
    address = address.strip()

    # Tokenize address
    txt_tokens = address.split()

    # Start by looking through the address
    with open('locations.json', encoding='utf-8') as f:
        locations: dict[str, str] = json.load(f)

    # An empty address has no tokens to compare against, so go straight to the backups
    if txt_tokens:
        # Do this with every key
        for key, place in locations.items():
            key_tokens = key.split()
            # A blank key would vacuously match every address
            if not key_tokens:
                continue

            # If numerical start, make sure it matches exactly
            if (re.match(r'\d', txt_tokens[0])
                    and re.match(r'\d', key_tokens[0])
                    and txt_tokens[0] != key_tokens[0]):
                continue

            # Every key token needs some address token within tolerance
            is_match = all(
                min(editdistance.eval(key_token, txt_token) for txt_token in txt_tokens) <= typo_tolerance
                for key_token in key_tokens
            )

            # If all tokens are within tolerance, location is found
            if is_match:
                return place

    if (place_name := get_place_name(lat, lng, GMAPS_API_KEY)):
        print(place_name)
        return f"near {place_name}"

    return titlize(address)