-
Notifications
You must be signed in to change notification settings - Fork 3
/
utils.py
126 lines (95 loc) · 3.43 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import numpy as np
import os
import pandas as pd
import pathlib
import requests
def download_file(
    url: str,
    filename: str,
    base: str = ".",
    dir: str = "data",
    overwrite: bool = False,
):
    """Download `url` and store it as `<base>/<dir>/<filename>`.

    Arguments:
        url {str} -- Address the file is fetched from
        filename {str} -- Name the downloaded file is stored under

    Keyword Arguments:
        base {str} -- Base directory (default: {"."})
        dir {str} -- Download directory (default: {"data"})
        overwrite {bool} -- If {True} existing files will be overwritten
            (default: {False})

    Returns:
        {str} -- `filename`, both when the file was downloaded by this call
            and when it was already present and left untouched.

    Raises:
        requests.HTTPError -- If the server answers with an error status.
    """
    filepath = os.path.join(base, dir, filename)

    if pathlib.Path(filepath).is_file() and not overwrite:
        print("File already exist. To overwrite pass `overwrite=True`")
        # Hand back the name here too so callers get the same result whether
        # or not a download was necessary.
        return filename

    # Create the download directory on demand; `open` would fail otherwise.
    target_dir = os.path.dirname(filepath)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    chunk_size = 1024

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as response:
        # Check the status before touching the disk so that an error page is
        # never stored as if it were the requested data.
        response.raise_for_status()
        print("Download {}...".format(filename), end='')
        with open(filepath, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    print(" done!")

    return filename
def parse_ncei_temperature_data(files, station: str = None):
    """Load NCEI global-hourly CSV exports into one temperature table.

    Data source:
    https://www.ncei.noaa.gov/access/search/data-search/global-hourly

    Arguments:
        files -- Iterable of CSV inputs accepted by `pandas.read_csv`. Each
            one must provide the columns `DATE` and `TMP`, plus `STATION`
            when `station` is given.

    Keyword Arguments:
        station {str} -- If not {None}, only rows of this station are kept
            (default: {None})

    Returns:
        {pandas.DataFrame} -- The de-duplicated rows of all inputs with two
            added columns: `DATETIME` (parsed `DATE`) and `TEMP_C` (leading
            number of `TMP` divided by 10). Rows with `TEMP_C` of 800 or more
            are removed.
    """
    tables = [
        pd.read_csv(file, delimiter=',', quotechar='"', low_memory=False)
        for file in files
    ]

    if station is not None:
        # `read_csv` turns all-numeric station ids into integers, so comparing
        # them with a string id alone would match nothing. Accept a row when
        # either the raw value or its string form equals the requested id.
        wanted = str(station)
        tables = [
            t[(t['STATION'] == station) | (t['STATION'].astype(str) == wanted)]
            for t in tables
        ]

    df = pd.concat(tables).drop_duplicates()

    df['DATETIME'] = pd.to_datetime(df['DATE'])
    # `TMP` is a comma separated string; only its first field is used.
    df['TEMP_C'] = df['TMP'].str.split(',').apply(lambda x: int(x[0]) / 10.)
    # NOTE(review): presumably this drops the "+9999" missing-value marker
    # (999.9 after scaling) -- confirm against the NCEI format description.
    df = df[df['TEMP_C'] < 800]

    return df
def table_to_arrays(datetimes, temps):
    """Scatter temperature readings onto a dense hourly grid.

    Arguments:
        datetimes -- Series of time offsets (each providing
            `total_seconds()`) measured from the start of the grid.
            Offsets are expected to be non-negative.
        temps -- Temperature values, one per entry of `datetimes`.

    Returns:
        {numpy.ndarray} -- Float array indexed by the whole number of hours
            of each offset. Hours without a reading hold NaN. The array is
            empty when there are no readings.
    """
    hours = np.asarray(
        [int(offset.total_seconds() // 3600) for offset in datetimes],
        dtype=np.intp,
    )

    # Without readings there is no largest hour to size the grid by.
    if hours.size == 0:
        return np.empty((0,))

    arr = np.full((hours.max() + 1,), np.nan)
    # If several readings fall into the same hour only one of them is kept.
    arr[hours] = np.asarray(temps)

    return arr
def get_ncei_temperature_data_as_array(definitions):
    """Build hourly temperature arrays for several cities from one start time.

    Arguments:
        definitions {dict} -- Maps a city key to a dict with the entries
            `files` and `station`, which are passed on to
            `parse_ncei_temperature_data`.

    Returns:
        {tuple} -- `(temp_arrays, (min_datetime, max_datetime))`.
            `temp_arrays` maps each city key to the result of
            `table_to_arrays` for the readings inside the shared time window;
            index 0 of every array corresponds to `min_datetime`.

    Raises:
        ValueError -- If a city has no temperature records.
    """
    temps = {
        city: parse_ncei_temperature_data(spec['files'], spec['station'])
        for city, spec in definitions.items()
    }

    # The shared window runs from the latest first reading to the earliest
    # last reading over all cities, and never extends beyond
    # 1900-01-01 .. 2020-01-01.
    min_datetime = pd.to_datetime('1900-01-01T00:00:00')
    max_datetime = pd.to_datetime('2020-01-01T00:00:00')
    for city, table in temps.items():
        if table.empty:
            raise ValueError(
                "No temperature records found for {!r}".format(city)
            )
        stamps = table['DATETIME']
        # Use the vectorized reductions instead of iterating the Series.
        min_datetime = max(min_datetime, stamps.min())
        max_datetime = min(max_datetime, stamps.max())

    # Convert the tables to numerical arrays
    temp_arrays = {}
    for city, table in temps.items():
        stamps = table['DATETIME']
        # Select the rows once and reuse them for both columns.
        in_window = table[(stamps >= min_datetime) & (stamps <= max_datetime)]
        temp_arrays[city] = table_to_arrays(
            in_window['DATETIME'] - min_datetime,
            in_window['TEMP_C'],
        )

    return temp_arrays, (min_datetime, max_datetime)
def bed2ddb(filepath, name):
    """Create a higlass `Tileset` served from a bed2ddb file.

    Arguments:
        filepath -- Location of the bed2ddb file handed to the clodius
            bed2ddb tile functions.
        name -- Name given to the resulting tileset.

    Returns:
        {higlass.tilesets.Tileset} -- Tileset whose info and tiles are read
            from `filepath` on request.
    """
    # Imported lazily so the module can be used without higlass/clodius.
    from higlass.tilesets import Tileset
    from clodius.tiles import bed2ddb as bed2ddb_tiles
    from clodius.tiles.utils import tiles_wrapper_2d

    def load_info():
        return bed2ddb_tiles.get_2d_tileset_info(filepath)

    def load_tile(z, x, y):
        # The tile getter returns a mapping; pick the entry for (x, y).
        return bed2ddb_tiles.get_2D_tiles(filepath, z, x, y)[x, y]

    def load_tiles(tile_ids):
        return tiles_wrapper_2d(tile_ids, load_tile)

    return Tileset(tileset_info=load_info, tiles=load_tiles, name=name)