# join_by_time.py
try:
    import os
    import argparse
    from glob import iglob
    from os.path import join
    from pathlib import Path
    import pickle
    import warnings
    import xarray as xr
    import pandas as pd
    import numpy as np
    import h5py
    from termcolor import colored
    from matplotlib import pyplot as plt
    import matplotlib.colors as colors
    import matplotlib.gridspec as gridspec
    import matplotlib.patches as mpatches
    import cartopy
    import cartopy.crs as ccrs
    import cartopy.feature as cf
    from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
    print(colored("All modules loaded!\n", 'green'))
except ModuleNotFoundError as err:
    # report the actual missing module; avoid colored() here, since termcolor
    # itself may be the import that failed
    print("Module not found: %s" % err)
# define the harp products for each variable:
# - 'keep' lists the variables to retain; its first value must be the variable
#   of interest for the product (the one that has a harp _validity parameter)
# - 'threshold' is the minimum quality value required to keep a measurement
# each product's 'keep' list is based on the Google Earth Engine L3 products:
# https://developers.google.com/earth-engine/datasets/catalog/sentinel-5p
VAR_PRODUCT = {
    'L2__O3____': {
        'keep': 'O3_column_number_density, O3_effective_temperature',
        'threshold': 50},
    'L2__NO2___': {
        'keep': 'tropospheric_NO2_column_number_density, NO2_column_number_density, ' +
                'stratospheric_NO2_column_number_density, ' +
                'NO2_slant_column_number_density, tropopause_pressure, absorbing_aerosol_index',
        'threshold': 50},
    'L2__SO2___': {
        'keep': 'SO2_column_number_density, SO2_column_number_density_amf, ' +
                'SO2_slant_column_number_density, absorbing_aerosol_index',
        'threshold': 50},
    'L2__CO____': {
        'keep': 'CO_column_number_density, H2O_column_number_density',
        'threshold': 50},
    'L2__CH4___': {
        'keep': 'CH4_column_volume_mixing_ratio_dry_air, aerosol_height, aerosol_optical_depth',
        'threshold': 50},
    'L2__HCHO__': {
        'keep': 'tropospheric_HCHO_column_number_density, tropospheric_HCHO_column_number_density_amf, ' +
                'HCHO_slant_column_number_density',
        'threshold': 50},
    'L2__CLOUD_': {  # 'cloud_fraction' has no _validity parameter, so it can't be used with the current script
        'keep': 'cloud_fraction, cloud_top_pressure, cloud_top_height, cloud_base_pressure, ' +
                'cloud_base_height, cloud_optical_depth, surface_albedo',
        'threshold': 50},
    'L2__AER_AI': {
        'keep': 'absorbing_aerosol_index',
        'threshold': 50},
    'L2__AER_LH': {  # TODO
        'keep': '',
        'threshold': 50}
}
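# Example: how an entry could feed a harp operations string during the
# upstream L2->L3 gridding step (a minimal sketch; harp is not invoked in this
# script, and the exact operations string used upstream may differ):
#   spec = VAR_PRODUCT['L2__NO2___']
#   var_of_interest = spec['keep'].split(',')[0].strip()
#   ops = f"{var_of_interest}_validity > {spec['threshold']}; keep({spec['keep']})"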
#np.seterr('ignore')
warnings.filterwarnings("ignore")
DEBUG = False
def set_parser():
    """ set a custom argument parser """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("-c", "--city", type=str, required=True,
                        help="City to process the data from [Moscow, Istanbul, Berlin]")
    parser.add_argument("-p", "--product", type=str, required=True,
                        help="Product to process ['L2__O3____', 'L2__NO2___', 'L2__SO2___', "
                             "'L2__CO____', 'L2__CH4___', 'L2__HCHO__', 'L2__CLOUD_', "
                             "'L2__AER_AI', 'L2__AER_LH']")
    parser.add_argument("-f", "--folder", type=str, required=False, default='../data/final_tensors',
                        help="Folder to save the final tensors")
    parser.add_argument("-f_grid", "--folder_grid", type=str, required=False, default='../data/crop',
                        help="Folder with L3 processed data")
    parser.add_argument("-f_src", "--folder_src", type=str, required=False, default='../data',
                        help="Folder with the original L2 S-5P data")
    return parser
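# Example: parsing a typical invocation (see also the list of calls at the
# bottom of this file):
#   parser = set_parser()
#   options = parser.parse_args(['-c', 'Moscow', '-p', 'L2__NO2___'])
#   options.city, options.product   # ('Moscow', 'L2__NO2___')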
def save_obj(obj, name):
    """ pickle `obj` to '{name}.pkl' """
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
    """ load a pickled object from '{name}.pkl' """
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
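# Example round trip (writes and reads /tmp/demo.pkl):
#   save_obj({'a': 1}, '/tmp/demo')
#   load_obj('/tmp/demo')   # -> {'a': 1}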
def retrieve_files(city, product, folder_source, extension='.zip', verbose=True):
    """ list all the files of a (city, product) pair under `folder_source` """
    path_files = join(folder_source, city, product, '*' + extension)
    all_files = sorted(list(iglob(path_files, recursive=True)))
    if verbose:
        print("looking for files at %s" % path_files)
        print(colored(f"number of {extension} files detected: {len(all_files)}", "green"))
    return all_files
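# Example: with the default layout this resolves the glob
# '../data/Moscow/L2__NO2___/*.zip':
#   files = retrieve_files('Moscow', 'L2__NO2___', '../data')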
def create_folder_to_save(folder, city):
    """ This function creates the folder '{folder}/{city}/', if it does not
    already exist, to store all the files processed by harp
    """
    path = f'{folder}/{city}/'
    Path(path).mkdir(parents=True, exist_ok=True)
    print(f"Processed data will be stored in: {path}\n")
    return path
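# Example:
#   create_folder_to_save('../data/final_tensors', 'Moscow')
#   # -> creates and returns '../data/final_tensors/Moscow/'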
def get_time_attr_old(all_files, verbose=False):
    """ this function creates a dict with the time attributes of each file """
    print(colored("loading time attributes, it can take a few minutes...", 'blue'))
    attributes = {file_i.split('/')[-1]:
                  {'time_coverage_start': xr.open_dataset(file_i).attrs['time_coverage_start'],
                   'time_coverage_end': xr.open_dataset(file_i).attrs['time_coverage_end']
                   } for file_i in all_files
                  }
    if verbose:
        print('attributes:', attributes)
    return attributes
def get_time_attr(all_files, path, product, verbose=False):
    """ this function creates a dict with the time attributes of each file,
    unless it is already stored on disk
    """
    path_dict = create_folder_to_save(path[:-1], 'dicts')
    path_dict = f'{path_dict}{product}'
    if not os.path.isfile(path_dict + '.pkl'):
        print(colored("loading time attributes, it can take a few minutes...", 'blue'))
        attributes = {file_i.split('/')[-1]:
                      {'time_coverage_start': xr.open_dataset(file_i).attrs['time_coverage_start'],
                       'time_coverage_end': xr.open_dataset(file_i).attrs['time_coverage_end']
                       } for file_i in all_files
                      }
        # save them for the next run
        save_obj(attributes, path_dict)
    else:
        print(colored("loading time attributes from disk...", 'blue'))
        attributes = load_obj(path_dict)
    if verbose:
        print('attributes:', attributes)
    return attributes
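# Example of the resulting structure (file name and times are illustrative):
#   {'S5P_OFFL_L2__NO2____20190821T092356_..._20190827T135123.zip':
#       {'time_coverage_start': '2019-08-21T09:23:56Z',
#        'time_coverage_end': '2019-08-21T11:05:25Z'}}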
def read_h5(path):
    """ read the single dataset stored in an h5 file """
    with h5py.File(path, 'r') as hf:
        # get the name of the (only) dataset
        key = list(hf.keys())[0]
        # access the dataset and load all of its data
        data = hf[key][:]
    return data
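# Example (assumes a tensor previously written by save_tensors below):
#   arr = read_h5('../data/final_tensors/Moscow/L2__NO2____data.h5')
#   arr.shape   # (time, lon, lat)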
def read_product_netCDF4(product, path='../data/final_tensors/Moscow'):
    ''' product should be one of the following:
    ['L2__O3____', 'L2__NO2___', 'L2__SO2___', 'L2__CO____', 'L2__CH4___',
     'L2__HCHO__', 'L2__CLOUD_', 'L2__AER_AI', 'L2__AER_LH']
    to open and test it:
        h = read_product_netCDF4(product)
        img = h['netcdf'].groupby('time.year').mean()[0]
        create_save_plot(img, 'foo')
    '''
    # declare names
    netcdf_name = f'{path}/{product}.nc'
    tensor_name = f'{path}/{product}_data.h5'
    time_name = f'{path}/{product}_time.h5'
    # read
    netcdf = xr.open_dataarray(netcdf_name)
    tensor = read_h5(tensor_name)
    time = read_h5(time_name)
    return {'data': tensor, 'time': time, 'netcdf': netcdf}
def get_tensors(no2_L3_DATA_mean):
    """ split a DataArray into a raw tensor and its date index """
    # get the data and move longitude first: (time, lat, lon) -> (time, lon, lat)
    tensor = no2_L3_DATA_mean.values
    tensor = np.moveaxis(tensor, 1, -1)
    # get the dates and trim them to 'yyyy-mm-dd'
    time_values = no2_L3_DATA_mean.coords['time'].values
    time_values = np.asarray([str(d)[:10] for d in time_values], dtype='S')
    return tensor, time_values
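# Example: for a DataArray of shape (time=365, latitude=180, longitude=240),
# the returned tensor has shape (365, 240, 180) and time_values looks like
# [b'2019-01-01', b'2019-01-02', ...] (sizes illustrative).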
def save_tensors(path, product, no2_L3_DATA_mean):
    """ save the tensor and its date index into h5 files """
    tensor, time_values = get_tensors(no2_L3_DATA_mean)
    # declare names
    netcdf_name = f'{path}{product}.nc'
    tensor_name = f'{path}{product}_data.h5'
    time_name = f'{path}{product}_time.h5'
    with h5py.File(tensor_name, 'w') as hf:
        hf.create_dataset(f'{product}_data', data=tensor)
    with h5py.File(time_name, 'w') as hf:
        hf.create_dataset(f'{product}_time', data=time_values)
    # also save the original netcdf file
    no2_L3_DATA_mean.to_netcdf(netcdf_name)
    print(colored(f'data saved in: {tensor_name}', 'green'))
    print(colored(f'time index saved in: {time_name}', 'green'))
    print(colored(f'original netcdf saved in: {netcdf_name}', 'green'))
def create_save_plot(img, fname):
    """ plot a 2D lat/lon grid on a PlateCarree map and save it to `fname` """
    fig = plt.figure(figsize=(18, 6))
    ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
    # plot the data
    im = img.plot.pcolormesh(ax=ax, x='longitude', y='latitude',
                             cmap='magma_r', add_colorbar=True,
                             transform=ccrs.PlateCarree(), zorder=1)
    #ax.set_title('Centered in %s'%folder)
    # add background features
    #ax.stock_img()
    state_provinces = cf.NaturalEarthFeature(category='cultural', name='admin_1_states_provinces_lines',
                                             scale='10m', facecolor='none')
    ax.add_feature(cartopy.feature.LAND, edgecolor='black')
    ax.add_feature(state_provinces, linewidth=0.4, edgecolor='black')
    # set up the axes (note: newer cartopy renames these to gl.top_labels / gl.right_labels)
    gl = ax.gridlines(draw_labels=True, linewidth=1, color='gray', alpha=0.3, linestyle=':')
    gl.xlabels_top = False
    gl.ylabels_right = False
    gl.xformatter = LONGITUDE_FORMATTER
    gl.yformatter = LATITUDE_FORMATTER
    plt.savefig(fname, bbox_inches='tight')
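# Example (a sketch; uses the read-back helper defined above):
#   h = read_product_netCDF4('L2__NO2___')
#   year_mean = h['netcdf'].groupby('time.year').mean()[0]
#   create_save_plot(year_mean, 'Moscow_L2__NO2___.png')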
def save_log(city, product, n_after, n_before, all_files, fname_log):
    """ append a summary row for this run to a csv log """
    # fill the log
    cols = ['City', 'Product', 'Unique Days', 'Orbits with Data', 'Total Orbits']
    dat = [city, product, n_after, n_before, len(all_files)]
    df = pd.DataFrame([dat], columns=cols)
    # save the log
    if not os.path.isfile(fname_log):
        df.to_csv(fname_log, header=True)
        print(colored(f'created log: {fname_log}', 'green'))
    else:
        df.to_csv(fname_log, mode='a', header=False)
        print(colored(f'appended log to: {fname_log}', 'green'))
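# The log accumulates one row per (city, product) run; its header looks like:
#   ,City,Product,Unique Days,Orbits with Data,Total Orbits
# (the leading empty column is the pandas index)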
def process(city, product, folder, folder_src, folder_grid, var_product=VAR_PRODUCT):
    """ main function to stack the L3 grids along the 'time' dimension """
    #print(xr.show_versions())
    ## 1. get the original files to extract the time information & create a folder to store the final tensors
    all_files = retrieve_files(city, product, folder_src)
    path = create_folder_to_save(folder, city)
    ## 2. create the time attributes & a function to access them
    attributes = get_time_attr(all_files, path, product)
    def preprocess(ds, attributes=attributes):
        ds['time'] = pd.to_datetime(np.array([attributes[ds.attrs['source_product']]['time_coverage_start']])).values
        return ds
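    # preprocess() stamps each per-orbit L3 grid with the start time of its
    # source L2 product, so open_mfdataset can concatenate the grids along a
    # meaningful 'time' dimension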
    ## 3. load & stack all the files along the time dimension
    all_files_L3 = retrieve_files(city, product, folder_grid, '.nc')
    L3_DATA = xr.open_mfdataset(all_files_L3, combine='nested', concat_dim='time',
                                preprocess=preprocess, chunks={'time': 100})
    ## 4. group the different orbits by day
    # set all dates to 00h so that multiple measurements in a day share the same label
    L3_DATA.coords['time'] = L3_DATA.time.dt.floor('1D')
    # group by date using an average (mean)
    L3_DATA_mean = L3_DATA.groupby('time').mean()
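    # e.g. two orbits stamped 2019-08-21T09:23 and 2019-08-21T11:05 both floor
    # to 2019-08-21 and are averaged into a single daily grid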
    ## 5. get the variable of interest and its annual average
    var_of_interest = var_product[product]['keep'].split(',')[0]
    no2_L3_DATA_mean = L3_DATA_mean[var_of_interest]
    year_mean = no2_L3_DATA_mean.groupby('time.year').mean()[0]
    ## 6. report how the orbits were aggregated
    n_before = L3_DATA[var_of_interest].shape[0]
    n_after = no2_L3_DATA_mean.shape[0]
    print(colored(f"--> There were {n_before} orbits belonging to {n_after} unique days.\n", 'blue'))
    ## 7. get and save the tensors
    save_tensors(path, product, no2_L3_DATA_mean)
    ## 8. save a plot of the annual mean
    name = f'{path}/{city}_{product}.png'
    create_save_plot(year_mean, name)
    ## 9. save a log entry
    save_log(city, product, n_after, n_before, all_files, f'{folder}/LOG.csv')
def main():
    if not DEBUG:
        parser = set_parser()
        options = parser.parse_args()
        process(options.city, options.product, options.folder, options.folder_src, options.folder_grid)
    else:
        city, product = 'Moscow', 'L2__O3____'
        folder = '../data/final_tensors'
        folder_src = '../data'
        folder_grid = '../data/crop'
        process(city, product, folder, folder_src, folder_grid)
if __name__ == "__main__":
    main()
"""
source conda-bash
conda activate air
Calls that we used:
python join_by_time.py -c Moscow -p L2__NO2___
python join_by_time.py -c Moscow -p L2__O3____
python join_by_time.py -c Moscow -p L2__SO2___
python join_by_time.py -c Moscow -p L2__CO____
python join_by_time.py -c Moscow -p L2__CH4___
python join_by_time.py -c Moscow -p L2__HCHO__
python join_by_time.py -c Istanbul -p L2__NO2___
python join_by_time.py -c Istanbul -p L2__O3____
python join_by_time.py -c Istanbul -p L2__SO2___
python join_by_time.py -c Istanbul -p L2__CO____
python join_by_time.py -c Istanbul -p L2__CH4___
python join_by_time.py -c Istanbul -p L2__HCHO__
python join_by_time.py -c Berlin -p L2__NO2___
python join_by_time.py -c Berlin -p L2__O3____
python join_by_time.py -c Berlin -p L2__SO2___
python join_by_time.py -c Berlin -p L2__CO____
python join_by_time.py -c Berlin -p L2__CH4___
python join_by_time.py -c Berlin -p L2__HCHO__
Known problems:
Moscow L2__CO____ has 0 days with values
Moscow L2__HCHO__ raises a KeyError on the attributes of
'S5P_OFFL_L2__HCHO___20190821T092356_20190821T110525_09605_01_010107_20190827T135123.zip.incomplete'
Istanbul/L2__CO____ has 0 days with values
Berlin/L2__CO____ has 0 days with values
"""