-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
357 lines (321 loc) · 17.9 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# This code is also really heavily commented, because this is project is likely
# to use a wide variety of volunteer maintainers, it may change hands many
# times, other docs may get lost, and many of the volunteers may not have
# a lot of Python or Flask experience. So, please excuse the verbosity, and,
# when adding comments, please err on the side of verbosity.
# The libraries are intentionally minimized so that we can deploy without
# a complex environment
import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template, jsonify
import pandas as pd
import time
import re
app = Flask(__name__)
# The function below courtesy of Geoff Boeing
# Urban planning professor at Northeastern University
# published at https://geoffboeing.com/2015/10/exporting-python-data-geojson/
def df_to_geojson(df, properties, lat='latitude', lon='longitude'):
"""Generates geojson file from DataFrame, pass column names for properties
as a list"""
geojson = {'type':'FeatureCollection', 'features':[]}
for _, row in df.iterrows():
feature = {'type':'Feature',
'properties':{},
'geometry':{'type':'Point',
'coordinates':[]}}
feature['geometry']['coordinates'] = [row[lon],row[lat]]
for prop in properties:
feature['properties'][prop] = row[prop]
geojson['features'].append(feature)
return geojson
# This is the Flask part of the app.
@app.route("/")
def greetings():
"""Displays main page"""
return render_template("index.html")
@app.route("/get_mural_data")
def get_mural_data():
"""Retrieves mural data by parsing data delivered by web request. Calls
df_to_geojson to return data as a geojson object"""
# Default to an empty object to return
mural_data = dict()
# Last checked 27 June 2019 -- should be a page listing all murals in
# the Oakland LocalWiki
url = 'https://localwiki.org/oakland/Murals'
# Retrieve the page via http
response = requests.get(url).text
# Processing section
# Init data holders
base_url = 'https://localwiki.org/oakland/'
site_url = 'https://localwiki.org'
mural_names = []
mural_links = []
lats = []
lons = []
first_snippet = True #Flag for discarding the first part of the page
# The text is split at the "GEOMETRYCOLLECTION" keyword. Following the key
# word are lat and lon coordinates inside keywords such as Point, Polygon,
# Linestring. Following this is an <a> tag that we can use for a breakpoint.
# After the tag, the mural name is given so that also can be collected.
for snippet in response.split('["SRID=4326;GEOMETRYCOLLECTION ('):
# Skip first snippet that comes before any GEOMETRYCOLLECTION is found
if first_snippet:
first_snippet = False
continue
else:
# Text parsing
# Split at tag ... forming two pieces [0] and [1]
temp_split = snippet.split('>')
# The last part [1] is the name, after lopping off the final three
# characters, which are '</a' (the left-over part of the tag closure)
mural_name = temp_split[1][0:-3]
# To grab the link, we take the first part, split it at '/oakland/', which is where
# the path starts, take the part after that, and leave out the last two characters (\")
# This string is added to the LocalWiki base_url
mural_link = base_url + temp_split[0].split('/oakland/')[1][:-2]
# To get the coordinates, we'll go "back a step" to temp_split
# TODO: Improve this to not go backwards!
# The part before the tag wil be a lon and lat
# These are just given as two numbers separated by a space
# Conveniently, the longitude will always be negative, so
# we can count on the '-' sign as a more reliable indicator than
# a space as to where the longitude starts. Because the longitude
# comes first, this also conveniently lets us know where the coordinates
# we want start, so we split at the '-'
geo1 = temp_split[0].split('-')
# Upon splitting, item [0] will be whatever preceding stuff
# mostly parentheses, that we don't want, so ignore it.
# Item [1] will have a number (stripped of a minus sign) followed
# by a space followed by another number and possibly more.
# So, if we split item [1] at the space, we will get the longitude
# at the new item 0, and the latitude (plus some extra) at item [1]
geo2 = geo1[1].split(' ')
# We just need to convert the longitude to a number and multiply
# by -1 to put back the negative sign that got stripped off
lon = float(geo2[0]) * -1
# For the latitude, we might get some extra characters at the
# end such as ) " , -- so we will strip these from the right
# It is tricky to format, but we need ) " and , in single quotes
# as the argument for rstrip()
lat = float(geo2[1].rstrip('),"'))
# Now that we got everything, just append each one
mural_names.append(mural_name)
mural_links.append(mural_link)
lats.append(lat)
lons.append(lon)
# This procedure gets applied for all the GEOMETRYCOLLECTION objects
# The final object is where we should check for issues, but because
# a geometry collection always needs at least two numbers and an <a>
# tag in this context, all the rest gets thrown out when splitting on
# '>', so it works well.
# We now need to get images for each mural on the list.
# These will be converted to html for the pop-ups that will appear for each mural
# The pop-up html will be stored in a list called popup_list
# We will also keep a list of murals that will be put in the reserved map because
# they are not available for viewing
popup_list = []
put_on_reserve_list = []
favored_image_found_list = []
artist_found_list = []
mural_gone_list = []
# We can conveniently use all the links we just gathered to visit pages one by one
# but we will need their index to match things like thier name
for ix, mural_page in enumerate(mural_links):
# Grab page data
page_data = requests.get(mural_page).text
# Deliberately slow down the process so as not to overwhelm LocalWiki with requests
time.sleep(0.2)
print(f'Getting page {ix}')
# BeautifulSoup will take the page data and parse it very conveniently for us
soup = BeautifulSoup(page_data, 'html.parser')
# First, we will look for any Wiki tag that OAM has left on the page
# Wiki tags with text that starts with 'oam_uses_' should include the name of an image
# that they would like to use.
# Wiki tags that start with 'not currently visible' should be put on the reserve map
# Wiki tags are found in list item (li) html tags, with a class of 'tag', that is,
# <li class="tag">... to find these, we just pass a keyword argument (class_, because
# 'class' itself is a reserved word), to the Beautiful Soup find_all function
page_tags = soup.find_all('li', class_='tag')
# Default to not use favored image
use_favored_image = False
mural_not_visible = False
artist_found = False
artist_name = None
artist_link = None
favored_image_name = None
# If no Wiki tags were found, the for loop will be skipped
for page_tag in page_tags: #page_tag is a Beautiful Soup object, with many helpful featuers
tag_text = page_tag.text # a convenient Beautiful Soup attribute
if tag_text[0:13].lower() == 'photo_display': # Check for the special OAM prefix, ignoring case
favored_image_name = tag_text[10:] # What follows the prefix should be a name
if favored_image_name is not None:
use_favored_image = True
if tag_text[0:21].lower() == 'not currently visible':
mural_not_visible = True
if tag_text[0:6].lower() == 'artist':
artist_name = tag_text[7:].lstrip().rstrip()
if artist_name is not None:
artist_found = True
for target_tag in page_tags:
if target_tag.text.lower().lstrip().rstrip() == artist_name.lower().lstrip().rstrip():
artist_link = 'https://localwiki.org/oakland/tags/' + target_tag.text.lower().replace(' ','')
#Append True or False to put_on_reserve_map_list
put_on_reserve_list.append(mural_not_visible)
# Now we will collect the image info
# We wukk start by putting 'None' in link_for_image, to track if we've found anything
link_for_image = None
# The image names are stored in <span class="image_frame image_frame_border" tags
# We can search using just 'image_frame' for the class thanks to Beautiful Soup
image_tags = soup.find_all('span',class_='image_frame')
# First case, if a favored image is found
if use_favored_image:
# If the name has a file suffix (e.g. '.jpg', we will ignore it)
# We do that by splitting at the dot and taking everything prior
favored_image_name = favored_image_name.split('.')[0]
# Within these tags, the name we want to search for will be inside the <a> tag
# so we search through image_tags one at a time, see if there are any <a> tags
# where the href contains the favored_image_name (using regular expressions),
# and, if so, assign the <img> tag's src attribute to the image link and the
# matching tag's href attribute to the info link, then stop searching
for image_tag in image_tags:
matching_tag = image_tag.find('a',href=re.compile(favored_image_name))
if matching_tag is not None:
link_for_image = site_url + matching_tag.img['src']
link_for_img_info = site_url + matching_tag['href']
break
# This part will be triggered if we found nothing even though a favored image
# was specified, or if no favored image was specified
if link_for_image is None:
# Just use the first image by default (use try to exclude cases where there is no image)
# To get it, take the first a tag, then first img tag, then the src attribute value
# In addition, the info page will be the first tag's <a> href attribute.
# All very easy to do with Beautiful Soup
try:
link_for_image = site_url + image_tags[0].a.img['src']
link_for_img_info = site_url + image_tags[0].a['href']
except (TypeError, IndexError): #happens if no image tags are located, use default urls
link_for_image = 'https://upload.wikimedia.org/wikipedia/commons/a/ac/No_image_available.svg'
link_for_img_info = 'https://localwiki.org/oakland/Murals'
# Now we will put the mural name, link, and image link in pop-up text
# according to a pre-specified format
popup_text_string = f'<a href="{mural_page}" target="blank">'
popup_text_string += f'{mural_names[ix]}</a><br>'
if artist_found:
if (artist_link is not None):
popup_text_string += f'by <a href="{artist_link}" target="blank">{artist_name}</a><br>'
else: # artist found, but no link found
popup_text_string += f'by {artist_name}<br>'
else: # artist not found
popup_text_string += f'Help us <a href="https://oaklandartmurmur.org/help-oakland-art-murmur-identify-mural-artists/" target="blank">give credit</a> to the artist.<br>'
popup_text_string += f'<a href="{link_for_img_info}" target="blank">'
popup_text_string += f'<img src="{link_for_image}"></a><br>'
popup_text_string += f'<a href="{mural_page}" target="blank">More info '
popup_text_string += f'</a><a href="{link_for_img_info}" target="blank">Larger image</a>'
popup_list.append(popup_text_string)
# Finally, include info for tracking
favored_image_found_list.append(favored_image_name)
artist_found_list.append(artist_name)
mural_gone_list.append(mural_not_visible)
# This ends our big for loop (visitng all pages to get info)
# For test purposes only, append blank values to popup_list
popup_list_length = len(popup_list)
popup_list_missing_length = len(mural_links) - popup_list_length
if popup_list_missing_length > 0:
for _ in range(popup_list_missing_length):
popup_list.append('not collected')
# Now, take these lists and make a Pandas DataFrame
# The coordinates need to be in two colums called 'latitude' and
# 'longitude' for the geojson generator to work properly
mural_df = pd.DataFrame({'name':mural_names,
'latitude':lats,
'longitude':lons})
# Now we need to add columns to the DataFrame to capture the
# properties needed for the OAM maps software. These properties
# include an 'id', 'name', 'address', 'zoom', 'icon', 'popup',
# 'link', 'blank', and 'maps'
# 'name' is already done from the mural name
# The 'id' needs to be a sequence. To avoid conflicts with other
# map markers, the number range 101-999 is reserved for this purpose.
# First, get the list length
list_length = len(mural_names)
# Generate a list of each number starting at 101
id_list = [num for num in range(101,101+list_length)]
# Now, add the list to the DataFrame
mural_df['id'] = id_list
# The 'address' is required but not available. We don't want to just
# leave it blank because a geocoder might get ahold of it, so we want
# to give it something that won't lead a user too far astray.
# We'll use 'Oakland, CA' -- pandas will broadcast to each row of the
# dataframe
mural_df['address'] = 'Oakland, CA'
# 'zoom' is a default map zoom setting. For the OAM mapper, 13 works
# well as a default
mural_df['zoom'] = 13
# 'icon' needs to be a file name. OAM uses 'art_blank_t.png'
mural_df['icon'] = 'art_black_t.png'
# 'popup' controls what goes in the pop-up when the marker is selected
# We will use the list of carefully formatted strings we put together
mural_df['popup'] = popup_list
# 'link' controls the link to be displayed. We will keep blank so that
# users get a popup instead
mural_df['link'] = ''
# 'blank' is an internal indicator for the OAM software. We think it is
# needed for proper transparency, but unsure. We do know that 1 works.
mural_df['blank'] = 1
# 'maps' is a configuration variable to tell the OAM software which map to
# put the markers on. Map #17 is currently for used markers, and map 21
# is for unused. We will use put_on_reserve_list converted to int (1 if true,
# 0 if false), multiply by 4, then add to 17, so that the list comprehension will
# generate 17 if put_on_reserve_list is False or 21 if True
map_list = [17 + 4 * int(list_item) for list_item in put_on_reserve_list]
mural_df['maps'] = map_list
# Create a new DataFrame for tracking purposes -- use .copy() so that original
# data is not modified
tracking_df = mural_df.copy()
# Add the tracking info to this DataFrame
tracking_df['specific_image_name'] = favored_image_found_list
tracking_df['artist_name'] = artist_found_list
tracking_df['mural_removal_confirmed'] = mural_gone_list
# Save the tracking DataFrame
tracking_df.to_csv('mural_tracker.csv')
# For custom murals that do not appear on the scraped list but that we want to add,
# we will import a csv with all the right colums
try:
extra_murals_df = pd.read_csv('extra_murals.csv')
mural_df = mural_df.append(extra_murals_df)
except IOError:
pass # Ignore this step if it generates an error
# Now what we need to do is reserve space for future mural maps
# To do this, we'll make a list of dictionaries with the column names as
# keys, and then convert to a DataFrame that will be appended to the one
# we just made
list_of_dicts_for_reserve = []
temp_dict = dict() # Empty temporary dict to append to list
for num in range(101+len(mural_df),1001):
temp_dict['name'] = 'reserved'
temp_dict['latitude'] = 37.8 # This is a stand-in for Oakland
temp_dict['longitude'] = -122.4 # This puts the marker safely out to sea so no one sees it by accident
temp_dict['id'] = int(num)
temp_dict['address'] = 'Oakland, CA'
temp_dict['zoom'] = 13
temp_dict['icon'] = 'art_blank_t.png'
temp_dict['popup'] = 'reserved'
temp_dict['link'] = 'https://localwiki.org/oakland/Murals'
temp_dict['blank'] = 1
temp_dict['maps'] = 21 # This is a map to store the 'reserved' markers
list_of_dicts_for_reserve.append(temp_dict)
# Convert to DataFrame is now super easy
reserve_df = pd.DataFrame(list_of_dicts_for_reserve)
# Now append this to the original DataFrame, note the need for assignment
mural_df = mural_df.append(reserve_df)
# Now that the DataFrame is complete, we can turn it into geojson
# We will make a list of all the properties column names
props = ['id','name','address','zoom','icon','popup','link','blank','maps']
# Now just call the function with the df and props names as arguments
mural_data = df_to_geojson(mural_df, props)
# Last step, we 'jsonify' the data for output to the client, which will be
# a Javascript routine on index.html
return jsonify(mural_data)
if __name__ == '__main__':
app.run()