-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraping_utils.py
104 lines (80 loc) · 3.52 KB
/
scraping_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas as pd
import geopandas as gpd
import requests
from tqdm import tqdm
import upcp.scrapers.ams_bgt_scraper as ams_bgt_scraper
import upcp.utils.las_utils as las_utils
import upc_sw.poly_utils as poly_utils
# Coordinate reference system assigned to every GeoDataFrame built in this
# module.
CRS = 'epsg:28992'
# Columns kept from each BGT layer; also the columns of the empty fallback
# frame returned when no layer yields features.
BGT_use_columns = ['geometry', 'identificatie_lokaalid', 'naam']
# Maps the layer-name prefix (the part before the first underscore) to the
# feature attribute that is copied into the 'naam' column.
BGT_namedict = {'BGT': 'bgt_functie',
'BGTPLUS': 'plus_type'}
# WFS endpoint queried by get_terras_data_for_bbox; the query string is
# appended directly after the trailing '?'.
TERRAS_WFS_URL = 'https://api.data.amsterdam.nl/v1/wfs/horeca/?'
# Columns returned by get_terras_data_for_bbox.
TERRAS_use_columns = ['geometry', 'zaaknummer', 'naam']
def get_terras_data_for_bbox(bbox, layers=None):
    """Scrape 'terras' data in a given bounding box.

    Parameters
    ----------
    bbox : pair of coordinate pairs, or None
        Read as bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1] and sent, in
        that order, as the WFS BBOX argument. When None, no BBOX filter is
        added to the request.
    layers : optional
        Not used. Accepted so this function has the same call signature as
        get_bgt_data_for_bbox and can be passed as a `scraper`.

    Returns
    -------
    A GeoDataFrame indexed by 'id' with the columns in TERRAS_use_columns.
    It is empty when the service reports zero returned features or when a
    ValueError is raised while decoding / converting the response.
    """
    # Fallback result, shaped like a successful one.
    empty_gdf = gpd.GeoDataFrame(columns=TERRAS_use_columns,
                                 geometry='geometry', crs=CRS)
    empty_gdf.index.name = 'id'

    params = 'REQUEST=GetFeature&' \
             'SERVICE=WFS&' \
             'VERSION=2.0.0&' \
             'TYPENAMES=exploitatievergunning-terrasgeometrie&'
    if bbox is not None:
        corners = (bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1])
        params += 'BBOX=' + ','.join(str(value) for value in corners) + '&'
    params += 'OUTPUTFORMAT=geojson'

    response = requests.get(TERRAS_WFS_URL + params)
    try:
        # Decode the body once and reuse it; the local is deliberately not
        # called `json` so it cannot be mistaken for the stdlib module.
        payload = response.json()
        if payload['numberReturned'] <= 0:
            return empty_gdf
        terras_gdf = gpd.GeoDataFrame.from_features(
            payload, crs=CRS).set_index('id')
        # Every feature gets the same fixed label.
        terras_gdf['naam'] = 'terras'
        terras_gdf['geometry'] = terras_gdf['geometry'].apply(
            poly_utils.fix_invalid)
        return terras_gdf[TERRAS_use_columns]
    except ValueError:
        # Best effort: a body that is not valid JSON (or that cannot be
        # converted) yields the empty frame instead of an exception.
        return empty_gdf
def get_bgt_data_for_bbox(bbox, layers):
    """Scrape BGT data in a given bounding box.

    Parameters
    ----------
    bbox
        Bounding box, passed unchanged to
        ams_bgt_scraper.scrape_amsterdam_bgt.
    layers : iterable of str
        Layer names to scrape. The part of each name before the first
        underscore must be a key of BGT_namedict; otherwise a KeyError is
        raised.

    Returns
    -------
    A GeoDataFrame indexed by 'ogc_fid' with the columns in BGT_use_columns,
    holding the features of all layers whose 'bgt_status' is 'bestaand'.
    When no layer returns any feature, an empty frame with the same columns
    and index name is returned.
    """
    layer_frames = []
    for layer in layers:
        # Scrape data from the Amsterdam WFS, this will return a json response.
        json_content = ams_bgt_scraper.scrape_amsterdam_bgt(layer, bbox=bbox)
        layer_type = BGT_namedict[layer.split('_')[0]]

        # Nothing to parse for this layer.
        if json_content is None or len(json_content['features']) == 0:
            continue

        # Parse the downloaded json response.
        layer_gdf = gpd.GeoDataFrame.from_features(
            json_content, crs=CRS).set_index('ogc_fid')
        # Take an explicit copy of the filtered rows so that adding the
        # 'naam' column below writes to an independent frame rather than to
        # a slice of the unfiltered one (avoids SettingWithCopyWarning).
        layer_gdf = layer_gdf[layer_gdf['bgt_status'] == 'bestaand'].copy()
        layer_gdf['naam'] = layer_gdf[layer_type]
        layer_frames.append(layer_gdf[BGT_use_columns])

    if layer_frames:
        return pd.concat(layer_frames)

    # No features at all: return an empty frame with the expected layout.
    empty_gdf = gpd.GeoDataFrame(columns=BGT_use_columns,
                                 geometry='geometry', crs=CRS)
    empty_gdf.index.name = 'ogc_fid'
    return empty_gdf
def process_tiles(tiles, bgt_layers, scraper=get_bgt_data_for_bbox):
    """
    Scrape data tile by tile, using each tile's own bounding box, and return
    the combined result with fully duplicated rows removed.

    `scraper` is called as scraper(bbox, bgt_layers) once per tile code in
    `tiles`; progress is reported through a tqdm bar that shows the current
    tile code.
    """
    progress = tqdm(tiles, unit='tile', smoothing=0)
    per_tile_frames = []
    for code in progress:
        progress.set_postfix_str(code)
        tile_bbox = las_utils.get_bbox_from_tile_code(code, padding=0)
        per_tile_frames.append(scraper(tile_bbox, bgt_layers))

    combined = pd.concat(per_tile_frames)
    # Rows that repeat an earlier row are dropped.
    is_repeat = combined.duplicated()
    return combined[~is_repeat]
def process_folder(folder, bgt_layers, scraper=get_bgt_data_for_bbox):
    """
    Scrape all data inside the single bounding box that encloses every point
    cloud tile in `folder`.

    Compared to scraping tile by tile this may fetch some data that is not
    needed, but it is much faster when the folder contains many files and / or
    the tiles are densely packed within the bounding box.
    """
    folder_bbox = las_utils.get_bbox_from_las_folder(folder, padding=0)
    return scraper(folder_bbox, bgt_layers)