From e18272d8cfb45e6f66a9e79d532d31824d83cfb6 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Tue, 21 Mar 2023 23:36:32 +0100 Subject: [PATCH 01/29] Adding methods for efficient processing of raster --- scripts/build_shapes.py | 228 ++++++++++++++++++++++++++++++---------- 1 file changed, 173 insertions(+), 55 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index ee7d07733..cfd857ddf 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -281,7 +281,7 @@ def load_EEZ(countries_codes, geo_crs, EEZ_gpkg="./data/eez/eez_v11.gpkg"): f"File EEZ {EEZ_gpkg} not found, please download it from https://www.marineregions.org/download_file.php?name=World_EEZ_v11_20191118_gpkg.zip and copy it in {os.path.dirname(EEZ_gpkg)}" ) - geodf_EEZ = gpd.read_file(EEZ_gpkg).to_crs(geo_crs) + geodf_EEZ = gpd.read_file(EEZ_gpkg, engine="pyogrio").to_crs(geo_crs) geodf_EEZ.dropna(axis=0, how="any", subset=["ISO_TER1"], inplace=True) # [["ISO_TER1", "TERRITORY1", "ISO_SOV1", "ISO_SOV2", "ISO_SOV3", "geometry"]] geodf_EEZ = geodf_EEZ[["ISO_TER1", "geometry"]] @@ -652,7 +652,7 @@ def add_gdp_data( logger.info("Stage 4/4: Add gdp data to GADM GeoDataFrame") # initialize new gdp column - df_gadm["gdp"] = 0.0 + # df_gadm["gdp"] = 0.0 GDP_tif, name_tif = load_GDP(year, update, out_logging, name_file_nc) @@ -705,6 +705,103 @@ def _process_func_download_pop(c_code): ) +def get_worldpop_features(WorldPop_inputfile): + """ + Function + + + + """ + worldpop_features = [] + # Open the file using rasterio + with rasterio.open(WorldPop_inputfile) as src: + # Add affine transform + worldpop_features.append(src.meta["transform"]) + # Add dimensions of image + worldpop_features.append(src.shape[0]) + worldpop_features.append(src.shape[1]) + + # --- Process the pixels in the image for population data --- + # Read the gray layer (1) to get an np.array of this band + np_pop_raster = src.read(1) + + # Set np_pop_valid to all pixels in np_pop_raster that aren't 'nodata' + np_pop_valid = np_pop_raster[np_pop_raster != src.nodata] + + # Set 'nodata' values to 0 + np_pop_raster[np_pop_raster == src.nodata] = 0 + + # Set np_pop_xy to pixel locations of non zero values (same order as np_pop_valid) + np_pop_xy = np_pop_raster.nonzero() + + # Combine arrays to get [ value, x, y ] array + np_pop = np.array([np_pop_valid, np_pop_xy[0], np_pop_xy[1]]).T + + return worldpop_features, np_pop + + +def compute_geomask_country(country_rows, worldpop_features): + """ + Function + + + + """ + affine_transform, y_axis_len, x_axis_len = worldpop_features + + # Set an empty numpy array with the dimensions of the country .tif file + # np_map_ID will contain a ID for each location (undefined is 0) + # ID corresponds to a specific geometry in country_rows + np_map_ID = np.zeros((y_axis_len, x_axis_len)) + + # List to contain the mappings of id to GADM_ID + id_to_GADM_ID = [] + # Loop the country_rows geoDataFrame + for i in range(len(country_rows)): + # Set the current geometry + cur_geometry = country_rows["geometry"][i] + # Generate a mask for the specific geometry + temp_mask = rasterio.features.geometry_mask( + [cur_geometry], (y_axis_len, x_axis_len), affine_transform, invert=True + ) + + # Map the values of counter value to np_map_ID + np_map_ID[temp_mask] = i + 1 + + # Store the id -> GADM_ID mapping + id_to_GADM_ID.append([i + 1, country_rows["GADM_ID"][i]]) + + return np_map_ID.astype(int), pd.DataFrame(id_to_GADM_ID).set_index(0) + + +def compute_population(np_pop, country_geomask, id_mapping): + """ + Function + + + + """ + # Declare an array to contain population counts + np_pop_count = np.zeros(len(id_mapping) + 1) + + # Loop the population data + for i in range(len(np_pop)): + cur_value, cur_x, cur_y = np_pop[i] + + # Set the current id to the id at the same coordinate of the geomask + cur_id = country_geomask[int(cur_x)][int(cur_y)] + + # Add the current value to the population + np_pop_count[cur_id] += cur_value + + df_pop_count = pd.DataFrame(np_pop_count) + df_pop_count = pd.DataFrame(df_pop_count).join(id_mapping) + + df_pop_count.columns = ["pop", "GADM_ID"] + + return df_pop_count.drop(0) + + def add_population_data( df_gadm, country_codes, @@ -742,62 +839,83 @@ def add_population_data( ascii=False, desc="Compute population ", ) - if (nprocesses is None) or (nprocesses == 1): - with tqdm(total=df_gadm.shape[0], **tqdm_kwargs) as pbar: - for c_code in country_codes: - # get subset by country code - country_rows = df_gadm.loc[df_gadm["country"] == c_code] - - # get worldpop image - WorldPop_inputfile, WorldPop_filename = download_WorldPop( - c_code, worldpop_method, year, update, out_logging - ) - - with rasterio.open(WorldPop_inputfile) as src: - for i, row in country_rows.iterrows(): - df_gadm.loc[i, "pop"] = _sum_raster_over_mask(row.geometry, src) - pbar.update(1) + with tqdm(total=len(country_codes), **tqdm_kwargs) as pbar: + for c_code in country_codes: + # get subset by country code + country_rows = df_gadm.loc[df_gadm["country"] == c_code] + + # get worldpop image + WorldPop_inputfile, WorldPop_filename = download_WorldPop( + c_code, worldpop_method, year, update, out_logging + ) - else: - # generator function to split a list ll into n_objs lists - def divide_list(ll, n_objs): - for i in range(0, len(ll), n_objs): - yield ll[i : i + n_objs] + worldpop_features, np_pop = get_worldpop_features(WorldPop_inputfile) - id_parallel_chunks = list( - divide_list(df_gadm.index, ceil(len(df_gadm) / nchunks)) - ) + # get the geomask with id mappings + country_geomask, id_mapping = compute_geomask_country( + country_rows, worldpop_features + ) - kwargs = { - "initializer": _init_process_pop, - "initargs": (df_gadm, year, worldpop_method), - "processes": nprocesses, - } - with mp.get_context("spawn").Pool(**kwargs) as pool: - if disable_progressbar: - list(pool.map(_process_func_download_pop, country_codes)) - _ = list(pool.map(_process_func_pop, id_parallel_chunks)) - for elem in _: - df_gadm.loc[elem.index, "pop"] = elem["pop"] - else: - list( - tqdm( - pool.imap(_process_func_download_pop, country_codes), - total=len(country_codes), - ascii=False, - desc="Download WorldPop ", - unit=" countries", - ) - ) - _ = list( - tqdm( - pool.imap(_process_func_pop, id_parallel_chunks), - total=len(id_parallel_chunks), - **tqdm_kwargs, - ) - ) - for elem in _: - df_gadm.loc[elem.index, "pop"] = elem["pop"] + df_pop_count = compute_population(np_pop, country_geomask, id_mapping) + + pbar.update(1) + + # if (nprocesses is None) or (nprocesses == 1): + # with tqdm(total=df_gadm.shape[0], **tqdm_kwargs) as pbar: + # for c_code in country_codes: + # # get subset by country code + # country_rows = df_gadm.loc[df_gadm["country"] == c_code] + + # # get worldpop image + # WorldPop_inputfile, WorldPop_filename = download_WorldPop( + # c_code, worldpop_method, year, update, out_logging + # ) + + # with rasterio.open(WorldPop_inputfile) as src: + # for i, row in country_rows.iterrows(): + # df_gadm.loc[i, "pop"] = _sum_raster_over_mask(row.geometry, src) + # pbar.update(1) + + # else: + # # generator function to split a list ll into n_objs lists + # def divide_list(ll, n_objs): + # for i in range(0, len(ll), n_objs): + # yield ll[i : i + n_objs] + + # id_parallel_chunks = list( + # divide_list(df_gadm.index, ceil(len(df_gadm) / nchunks)) + # ) + + # kwargs = { + # "initializer": _init_process_pop, + # "initargs": (df_gadm, year, worldpop_method), + # "processes": nprocesses, + # } + # with mp.get_context("spawn").Pool(**kwargs) as pool: + # if disable_progressbar: + # list(pool.map(_process_func_download_pop, country_codes)) + # _ = list(pool.map(_process_func_pop, id_parallel_chunks)) + # for elem in _: + # df_gadm.loc[elem.index, "pop"] = elem["pop"] + # else: + # list( + # tqdm( + # pool.imap(_process_func_download_pop, country_codes), + # total=len(country_codes), + # ascii=False, + # desc="Download WorldPop ", + # unit=" countries", + # ) + # ) + # _ = list( + # tqdm( + # pool.imap(_process_func_pop, id_parallel_chunks), + # total=len(id_parallel_chunks), + # **tqdm_kwargs, + # ) + # ) + # for elem in _: + # df_gadm.loc[elem.index, "pop"] = elem["pop"] def gadm( From 802fe65577bcc295a595622bb9b6f3ae8f934d52 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Wed, 22 Mar 2023 12:20:46 +0100 Subject: [PATCH 02/29] Adjust dataframe to iloc and write to df_gadm --- scripts/build_shapes.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index cfd857ddf..dd440bbbe 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -756,10 +756,11 @@ def compute_geomask_country(country_rows, worldpop_features): # List to contain the mappings of id to GADM_ID id_to_GADM_ID = [] + # Loop the country_rows geoDataFrame for i in range(len(country_rows)): # Set the current geometry - cur_geometry = country_rows["geometry"][i] + cur_geometry = country_rows.iloc[i]["geometry"] # Generate a mask for the specific geometry temp_mask = rasterio.features.geometry_mask( [cur_geometry], (y_axis_len, x_axis_len), affine_transform, invert=True @@ -769,7 +770,7 @@ def compute_geomask_country(country_rows, worldpop_features): np_map_ID[temp_mask] = i + 1 # Store the id -> GADM_ID mapping - id_to_GADM_ID.append([i + 1, country_rows["GADM_ID"][i]]) + id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]]) return np_map_ID.astype(int), pd.DataFrame(id_to_GADM_ID).set_index(0) @@ -799,7 +800,7 @@ def compute_population(np_pop, country_geomask, id_mapping): df_pop_count.columns = ["pop", "GADM_ID"] - return df_pop_count.drop(0) + return df_pop_count def add_population_data( @@ -856,8 +857,14 @@ def add_population_data( country_rows, worldpop_features ) + # Calculate the population for each region df_pop_count = compute_population(np_pop, country_geomask, id_mapping) + # Loop the regions and write population to df_gadm + for i in range(len(df_pop_count)): + pop_count, gadm_id = df_pop_count.iloc[i] + df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count + pbar.update(1) # if (nprocesses is None) or (nprocesses == 1): From 8b92c59991f19c7187f0537bf26753e631d287a3 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Wed, 22 Mar 2023 15:58:35 +0100 Subject: [PATCH 03/29] Memory reduction output compute_geomask_country --- scripts/build_shapes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index dd440bbbe..c7875f0cc 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -772,7 +772,10 @@ def compute_geomask_country(country_rows, worldpop_features): # Store the id -> GADM_ID mapping id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]]) - return np_map_ID.astype(int), pd.DataFrame(id_to_GADM_ID).set_index(0) + # Return np_map_ID as type 'H' np.ushort + # 'H' -> https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort + # This lowers memory usage, note: ID has to be within the range [0,65535] + return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0) def compute_population(np_pop, country_geomask, id_mapping): From fb9359e5026f107064b04ac31105819e42f15f60 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Wed, 22 Mar 2023 16:00:21 +0100 Subject: [PATCH 04/29] df_gadm operation adjustment --- scripts/build_shapes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index c7875f0cc..61bebd97a 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -652,7 +652,7 @@ def add_gdp_data( logger.info("Stage 4/4: Add gdp data to GADM GeoDataFrame") # initialize new gdp column - # df_gadm["gdp"] = 0.0 + df_gadm["gdp"] = 0.0 GDP_tif, name_tif = load_GDP(year, update, out_logging, name_file_nc) From e8c9b3b0098ac62202b501bbd53a110a0bea5570 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Fri, 24 Mar 2023 00:41:01 +0100 Subject: [PATCH 05/29] Restructure methods to implement scaling --- scripts/build_shapes.py | 305 +++++++++++++++++++++++++++------------- 1 file changed, 210 insertions(+), 95 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 61bebd97a..a4b693522 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -28,6 +28,7 @@ two_digits_2_name_country, ) from rasterio.mask import mask +from rasterio.windows import Window from shapely.geometry import LineString, MultiPolygon, Point, Polygon from shapely.geometry.base import BaseGeometry from shapely.ops import unary_union @@ -39,6 +40,26 @@ sets_path_to_root("pypsa-earth") +# Imports for profiling [temporary] +import cProfile +import os + +import psutil + + +# Function for profiling functions [temporary] +def profile(func): + """Decorator for run function profile""" + + def wrapper(*args, **kwargs): + profile_filename = func.__name__ + ".prof" + profiler = cProfile.Profile() + result = profiler.runcall(func, *args, **kwargs) + profiler.dump_stats(profile_filename) + return result + + return wrapper + def download_GADM(country_code, update=False, out_logging=False): """ @@ -71,7 +92,7 @@ def download_GADM(country_code, update=False, out_logging=False): if not os.path.exists(GADM_inputfile_gpkg) or update is True: if out_logging: logger.warning( - f"Stage 4/4: {GADM_filename} of country {two_digits_2_name_country(country_code)} does not exist, downloading to {GADM_inputfile_gpkg}" + f"Stage 4 of 4: {GADM_filename} of country {two_digits_2_name_country(country_code)} does not exist, downloading to {GADM_inputfile_gpkg}" ) # create data/osm directory os.makedirs(os.path.dirname(GADM_inputfile_gpkg), exist_ok=True) @@ -422,7 +443,7 @@ def download_WorldPop_standard( Name of the file """ if out_logging: - logger.info("Stage 3/4: Download WorldPop datasets") + logger.info("Stage 3 of 4: Download WorldPop datasets (standard)") if country_code == "XK": WorldPop_filename = f"srb_ppp_{year}_UNadj_constrained.tif" @@ -445,7 +466,7 @@ def download_WorldPop_standard( if not os.path.exists(WorldPop_inputfile) or update is True: if out_logging: logger.warning( - f"Stage 4/4: {WorldPop_filename} does not exist, downloading to {WorldPop_inputfile}" + f"Stage 3 of 4: {WorldPop_filename} does not exist, downloading to {WorldPop_inputfile}" ) # create data/osm directory os.makedirs(os.path.dirname(WorldPop_inputfile), exist_ok=True) @@ -459,7 +480,7 @@ def download_WorldPop_standard( loaded = True break if not loaded: - logger.error(f"Stage 4/4: Impossible to download {WorldPop_filename}") + logger.error(f"Stage 3 of 4: Impossible to download {WorldPop_filename}") return WorldPop_inputfile, WorldPop_filename @@ -489,7 +510,7 @@ def download_WorldPop_API( Name of the file """ if out_logging: - logger.info("Stage 3/4: Download WorldPop datasets") + logger.info("Stage 3 of 4: Download WorldPop datasets (API)") WorldPop_filename = f"{two_2_three_digits_country(country_code).lower()}_ppp_{year}_UNadj_constrained.tif" # Request to get the file @@ -513,7 +534,7 @@ def download_WorldPop_API( loaded = True break if not loaded: - logger.error(f"Stage 4/4: Impossible to download {WorldPop_filename}") + logger.error(f"Stage 3 of 4: Impossible to download {WorldPop_filename}") return WorldPop_inputfile, WorldPop_filename @@ -525,7 +546,7 @@ def convert_GDP(name_file_nc, year=2015, out_logging=False): """ if out_logging: - logger.info("Stage 4/4: Access to GDP raster data") + logger.info("Stage 4 of 4: Access to GDP raster data") # tif namefile name_file_tif = name_file_nc[:-2] + "tif" @@ -552,7 +573,7 @@ def convert_GDP(name_file_nc, year=2015, out_logging=False): if year not in list_years: if out_logging: logger.warning( - f"Stage 3/4 GDP data of year {year} not found, selected the most recent data ({int(list_years[-1])})" + f"Stage 4 of 4 GDP data of year {year} not found, selected the most recent data ({int(list_years[-1])})" ) year = float(list_years[-1]) @@ -576,7 +597,7 @@ def load_GDP( """ if out_logging: - logger.info("Stage 4/4: Access to GDP raster data") + logger.info("Stage 4 of 4: Access to GDP raster data") # path of the nc file name_file_tif = name_file_nc[:-2] + "tif" @@ -587,7 +608,7 @@ def load_GDP( if update | (not os.path.exists(GDP_tif)): if out_logging: logger.warning( - f"Stage 4/4: File {name_file_tif} not found, the file will be produced by processing {name_file_nc}" + f"Stage 4 of 4: File {name_file_tif} not found, the file will be produced by processing {name_file_nc}" ) convert_GDP(name_file_nc, year, out_logging) @@ -649,7 +670,7 @@ def add_gdp_data( - Includes a new column ["gdp"] """ if out_logging: - logger.info("Stage 4/4: Add gdp data to GADM GeoDataFrame") + logger.info("Stage 4 of 4: Add gdp data to GADM GeoDataFrame") # initialize new gdp column df_gadm["gdp"] = 0.0 @@ -706,24 +727,40 @@ def _process_func_download_pop(c_code): def get_worldpop_features(WorldPop_inputfile): + """ + Function to extract data from .tif input file + ------- + Inputs: + WorldPop_inputfile: String pointing to location of file + -------- + Outputs: + src.meta["transform"]: Representation of the affine transform + src.shape: Dimensions of the input image + """ + # Open the file using rasterio + with rasterio.open(WorldPop_inputfile) as src: + return src.meta["transform"], src.shape + + +def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions): """ Function """ - worldpop_features = [] + col_off, row_off, width, height = window_dimensions + + current_window = Window(col_off, row_off, width, height) + # Open the file using rasterio with rasterio.open(WorldPop_inputfile) as src: - # Add affine transform - worldpop_features.append(src.meta["transform"]) - # Add dimensions of image - worldpop_features.append(src.shape[0]) - worldpop_features.append(src.shape[1]) - # --- Process the pixels in the image for population data --- # Read the gray layer (1) to get an np.array of this band - np_pop_raster = src.read(1) + # Rasterio doesn't support lower than float32 readout + # Hence np_pop_raster will have nbytes = 4 * shape[0] * shape[1] + + np_pop_raster = src.read(1, window=current_window) # Set np_pop_valid to all pixels in np_pop_raster that aren't 'nodata' np_pop_valid = np_pop_raster[np_pop_raster != src.nodata] @@ -734,20 +771,22 @@ def get_worldpop_features(WorldPop_inputfile): # Set np_pop_xy to pixel locations of non zero values (same order as np_pop_valid) np_pop_xy = np_pop_raster.nonzero() - # Combine arrays to get [ value, x, y ] array - np_pop = np.array([np_pop_valid, np_pop_xy[0], np_pop_xy[1]]).T + # Transform to get [ x, y ] array + # np_pop_xy as 'I' (uintc), see + # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.uintc + np_pop_xy = np.array([np_pop_xy[0], np_pop_xy[1]]).T.astype("I") - return worldpop_features, np_pop + return np_pop_valid, np_pop_xy -def compute_geomask_country(country_rows, worldpop_features): +def compute_geomask_region(country_rows, affine_transform, dimensions): """ Function """ - affine_transform, y_axis_len, x_axis_len = worldpop_features + y_axis_len, x_axis_len = dimensions # Set an empty numpy array with the dimensions of the country .tif file # np_map_ID will contain a ID for each location (undefined is 0) @@ -778,7 +817,140 @@ def compute_geomask_country(country_rows, worldpop_features): return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0) -def compute_population(np_pop, country_geomask, id_mapping): +def compute_population(country_rows, WorldPop_inputfile, out_logging=False): + """ + Function computes the population for the given country rows + + Inputs: + ------- + country_rows: + + WorldPop_inputfile: + + + Outputs: + -------- + df_pop_count: Dataframe with columns + - "pop" containing population of GADM_ID region + - "GADM_ID" + + + """ + # Get the features of the worldpop input file + transform, worldpop_dim = get_worldpop_features(WorldPop_inputfile) + + worldpop_y_dim, worldpop_x_dim = worldpop_dim + + # Rasterio doesn't support lower than float32 readout + # Hence reading the file will take up: nbytes = 4 * y_dim * x_dim + expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim + + # Introduce a max byte size to avoid overfilling RAM + worldpop_byte_limit = 100 * 10**6 + + # If the rasterio read will be within byte limit + if expected_bytes_input_read < worldpop_byte_limit: + # Call functions with input dimensions for window + window_dim = [0, 0, worldpop_x_dim, worldpop_y_dim] + + # Get population values and corresponding x,y coords + np_pop_val, np_pop_xy = get_worldpop_val_xy(WorldPop_inputfile, window_dim) + + # get the geomask with id mappings + country_geomask, id_mapping = compute_geomask_region( + country_rows, transform, [worldpop_y_dim, worldpop_x_dim] + ) + + # Calculate the population for each region + df_pop_count = old_compute_population( + np_pop_val, np_pop_xy, country_geomask, id_mapping + ) + + else: + if out_logging: + logger.info( + "Stage 3 of 4: compute_population for " + + str(country_rows["country"][0]) + + ": Expected size of file readout was " + + str(expected_bytes_input_read // 10**6) + + " Megabytes. As the limit is " + + str(worldpop_byte_limit // 10**6) + + " Megabytes switching to windowed approach" + ) + # Calculate the population using windows + df_pop_count = windowed_compute_population( + country_rows, WorldPop_inputfile, worldpop_byte_limit + ) + + return df_pop_count + + +def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_limit): + """ + Function + + + + """ + # Open the file using rasterio + with rasterio.open(WorldPop_inputfile) as src: + transform = src.meta["transform"] + worldpop_y_dim, worldpop_x_dim = src.shape + block_y_dim, block_x_dim = src.block_shapes[0] + + # Calculate the bytes for reading one block from the image (float32) + read_block_size = 4 * block_y_dim * block_x_dim + + # Calculate the amount of blocks that fit into the memory budget + window_block_count = worldpop_byte_limit // read_block_size + + # The input files have blocks that vary the x dimension and keep y dimension to 512 + # Hence set the dimension of the windows to the same x dimension (width) + window_x_dim = block_x_dim + window_col_off = 0 + + # Multiply the y_dimension by the amount of blocks in the window + # window_y_dim will be height of the window + window_y_dim = window_block_count * block_y_dim + + # Calculate the y ranges of the blocks to scan the image + # y_range_start will serve as row offset + y_range_start = np.arange(0, worldpop_y_dim, window_y_dim) + + print(y_range_start) + + for row_off in y_range_start: + window_dimensions = [window_col_off, row_off, window_x_dim, window_y_dim] + + print("Running window: ", window_dimensions) + + np_pop_val, np_pop_xy = get_worldpop_val_xy( + WorldPop_inputfile, window_dimensions + ) + + # get the geomask with id mappings + region_geomask, id_mapping = compute_geomask_region( + country_rows, transform, [window_y_dim, window_x_dim] + ) + + print("geo_mask size is (MB) ", region_geomask.nbytes / 10**6) + print( + "Current RAM usage: ", + psutil.Process(os.getpid()).memory_info().rss / 1024**2, + ) + + print("Running: old_compute_population") + # Calculate the population for each region + df_pop_count = old_compute_population( + np_pop_val, np_pop_xy, region_geomask, id_mapping + ) + + print(df_pop_count) + + return df_pop_count + + +def old_compute_population(np_pop_val, np_pop_xy, region_geomask, id_mapping): """ Function @@ -789,11 +961,12 @@ def compute_population(np_pop, country_geomask, id_mapping): np_pop_count = np.zeros(len(id_mapping) + 1) # Loop the population data - for i in range(len(np_pop)): - cur_value, cur_x, cur_y = np_pop[i] + for i in range(len(np_pop_val)): + cur_value = np_pop_val[i] + cur_x, cur_y = np_pop_xy[i] # Set the current id to the id at the same coordinate of the geomask - cur_id = country_geomask[int(cur_x)][int(cur_y)] + cur_id = region_geomask[int(cur_x)][int(cur_y)] # Add the current value to the population np_pop_count[cur_id] += cur_value @@ -834,7 +1007,7 @@ def add_population_data( """ if out_logging: - logger.info("Stage 4/4 POP: Add population data to GADM GeoDataFrame") + logger.info("Stage 3 of 4: Add population data to GADM GeoDataFrame") # initialize new population column df_gadm["pop"] = 0.0 @@ -848,85 +1021,27 @@ def add_population_data( # get subset by country code country_rows = df_gadm.loc[df_gadm["country"] == c_code] - # get worldpop image + # Download worldpop image (if required) and get file location WorldPop_inputfile, WorldPop_filename = download_WorldPop( c_code, worldpop_method, year, update, out_logging ) - worldpop_features, np_pop = get_worldpop_features(WorldPop_inputfile) + if out_logging: + logger.info("Stage 3 of 4: Calculating population of " + str(c_code)) - # get the geomask with id mappings - country_geomask, id_mapping = compute_geomask_country( - country_rows, worldpop_features + # Calculate the population for each geometry given in country_rows + df_pop_count = compute_population( + country_rows, WorldPop_inputfile, out_logging ) - # Calculate the population for each region - df_pop_count = compute_population(np_pop, country_geomask, id_mapping) - # Loop the regions and write population to df_gadm for i in range(len(df_pop_count)): pop_count, gadm_id = df_pop_count.iloc[i] + # Select the row with the same "GADM_ID" and set the population count df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count pbar.update(1) - # if (nprocesses is None) or (nprocesses == 1): - # with tqdm(total=df_gadm.shape[0], **tqdm_kwargs) as pbar: - # for c_code in country_codes: - # # get subset by country code - # country_rows = df_gadm.loc[df_gadm["country"] == c_code] - - # # get worldpop image - # WorldPop_inputfile, WorldPop_filename = download_WorldPop( - # c_code, worldpop_method, year, update, out_logging - # ) - - # with rasterio.open(WorldPop_inputfile) as src: - # for i, row in country_rows.iterrows(): - # df_gadm.loc[i, "pop"] = _sum_raster_over_mask(row.geometry, src) - # pbar.update(1) - - # else: - # # generator function to split a list ll into n_objs lists - # def divide_list(ll, n_objs): - # for i in range(0, len(ll), n_objs): - # yield ll[i : i + n_objs] - - # id_parallel_chunks = list( - # divide_list(df_gadm.index, ceil(len(df_gadm) / nchunks)) - # ) - - # kwargs = { - # "initializer": _init_process_pop, - # "initargs": (df_gadm, year, worldpop_method), - # "processes": nprocesses, - # } - # with mp.get_context("spawn").Pool(**kwargs) as pool: - # if disable_progressbar: - # list(pool.map(_process_func_download_pop, country_codes)) - # _ = list(pool.map(_process_func_pop, id_parallel_chunks)) - # for elem in _: - # df_gadm.loc[elem.index, "pop"] = elem["pop"] - # else: - # list( - # tqdm( - # pool.imap(_process_func_download_pop, country_codes), - # total=len(country_codes), - # ascii=False, - # desc="Download WorldPop ", - # unit=" countries", - # ) - # ) - # _ = list( - # tqdm( - # pool.imap(_process_func_pop, id_parallel_chunks), - # total=len(id_parallel_chunks), - # **tqdm_kwargs, - # ) - # ) - # for elem in _: - # df_gadm.loc[elem.index, "pop"] = elem["pop"] - def gadm( worldpop_method, @@ -942,7 +1057,7 @@ def gadm( nchunks=None, ): if out_logging: - logger.info("Stage 4/4: Creation GADM GeoDataFrame") + logger.info("Stage 3 of 4: Creation GADM GeoDataFrame") # download data if needed and get the desired layer_id df_gadm = get_GADM_layer(countries, layer_id, geo_crs, contended_flag, update) From 0b85814a62f9501cf4f4fecd4c12868c86998460 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Sun, 26 Mar 2023 21:54:09 +0200 Subject: [PATCH 06/29] Windowed mode operational --- scripts/build_shapes.py | 96 ++++++++++++++++++++++++++++++----------- 1 file changed, 72 insertions(+), 24 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index a4b693522..fd5aeb2d5 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -779,14 +779,40 @@ def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions): return np_pop_valid, np_pop_xy -def compute_geomask_region(country_rows, affine_transform, dimensions): +def compute_geomask_region( + country_rows, affine_transform, window_dimensions, windowed=False +): """ Function """ - y_axis_len, x_axis_len = dimensions + col_off, row_off, x_axis_len, y_axis_len = window_dimensions + + if windowed: + # Declare a transformer with given affine_transform + transformer = rasterio.transform.AffineTransformer(affine_transform) + + # Obtain the coordinates of the upper left corner of window + window_topleft_longitude, window_topleft_latitude = transformer.xy( + row_off, col_off + ) + + # Obtain the coordinates of the bottom right corner of window + window_botright_longitude, window_botright_latitude = transformer.xy( + row_off + y_axis_len, col_off + x_axis_len + ) + + # Set the current transform to the correct lat and long + affine_transform = rasterio.Affine( + affine_transform[0], + affine_transform[1], + window_topleft_longitude, + affine_transform[3], + affine_transform[4], + window_topleft_latitude, + ) # Set an empty numpy array with the dimensions of the country .tif file # np_map_ID will contain a ID for each location (undefined is 0) @@ -800,9 +826,27 @@ def compute_geomask_region(country_rows, affine_transform, dimensions): for i in range(len(country_rows)): # Set the current geometry cur_geometry = country_rows.iloc[i]["geometry"] + + # In windowed mode we check if bounds of geometry overlap the window + if windowed: + latitude_min = cur_geometry.bounds[1] + latitude_max = cur_geometry.bounds[3] + + # In the following cases we don't have to continue the loop + # If the geometry is above the window + if latitude_min > window_topleft_latitude: + continue + # If the geometry is below the window + if latitude_max < window_botright_latitude: + continue + # Generate a mask for the specific geometry temp_mask = rasterio.features.geometry_mask( - [cur_geometry], (y_axis_len, x_axis_len), affine_transform, invert=True + [cur_geometry], + (y_axis_len, x_axis_len), + affine_transform, + invert=True, + all_touched=True, ) # Map the values of counter value to np_map_ID @@ -846,7 +890,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False): expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim # Introduce a max byte size to avoid overfilling RAM - worldpop_byte_limit = 100 * 10**6 + worldpop_byte_limit = 3096 * 10**6 # If the rasterio read will be within byte limit if expected_bytes_input_read < worldpop_byte_limit: @@ -858,11 +902,11 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False): # get the geomask with id mappings country_geomask, id_mapping = compute_geomask_region( - country_rows, transform, [worldpop_y_dim, worldpop_x_dim] + country_rows, transform, window_dim ) # Calculate the population for each region - df_pop_count = old_compute_population( + df_pop_count = sum_values_using_geomask( np_pop_val, np_pop_xy, country_geomask, id_mapping ) @@ -892,6 +936,9 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ """ + # Create a dataframe to store the population data + df_pop_count = country_rows.loc[:, ["GADM_ID", "pop"]].copy() + # Open the file using rasterio with rasterio.open(WorldPop_inputfile) as src: transform = src.meta["transform"] @@ -917,8 +964,6 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ # y_range_start will serve as row offset y_range_start = np.arange(0, worldpop_y_dim, window_y_dim) - print(y_range_start) - for row_off in y_range_start: window_dimensions = [window_col_off, row_off, window_x_dim, window_y_dim] @@ -930,33 +975,38 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ # get the geomask with id mappings region_geomask, id_mapping = compute_geomask_region( - country_rows, transform, [window_y_dim, window_x_dim] + country_rows, transform, window_dimensions, windowed=True ) - print("geo_mask size is (MB) ", region_geomask.nbytes / 10**6) - print( - "Current RAM usage: ", - psutil.Process(os.getpid()).memory_info().rss / 1024**2, - ) - - print("Running: old_compute_population") # Calculate the population for each region - df_pop_count = old_compute_population( + windowed_pop_count = sum_values_using_geomask( np_pop_val, np_pop_xy, region_geomask, id_mapping ) - print(df_pop_count) + # Loop the regions and write population to df_pop_count + for i in range(len(windowed_pop_count)): + pop_count, gadm_id = windowed_pop_count.iloc[i] + # Select the row with the same "GADM_ID" and set the population count + df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] = pop_count return df_pop_count -def old_compute_population(np_pop_val, np_pop_xy, region_geomask, id_mapping): +def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): """ Function """ + # Initialize a dictionary + dict_id = {0: 0} + counter = 1 + # Loop over ip mapping and add indicies to the dictionary + for ID_index in np.array(id_mapping.index): + dict_id[ID_index] = counter + counter += 1 + # Declare an array to contain population counts np_pop_count = np.zeros(len(id_mapping) + 1) @@ -969,12 +1019,10 @@ def old_compute_population(np_pop_val, np_pop_xy, region_geomask, id_mapping): cur_id = region_geomask[int(cur_x)][int(cur_y)] # Add the current value to the population - np_pop_count[cur_id] += cur_value - - df_pop_count = pd.DataFrame(np_pop_count) - df_pop_count = pd.DataFrame(df_pop_count).join(id_mapping) + np_pop_count[dict_id[cur_id]] += cur_value - df_pop_count.columns = ["pop", "GADM_ID"] + df_pop_count = pd.DataFrame(np_pop_count, columns=["pop"]) + df_pop_count["GADM_ID"] = np.append(np.array("NaN"), id_mapping.values) return df_pop_count From 2c9833810d0fc8ad5c39ef665acc4ce658323252 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Wed, 29 Mar 2023 23:40:44 +0200 Subject: [PATCH 07/29] Refine due to bugs with 'US', now fixed --- scripts/build_shapes.py | 49 +++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index fd5aeb2d5..d9c58dc05 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -756,19 +756,16 @@ def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions): # Open the file using rasterio with rasterio.open(WorldPop_inputfile) as src: # --- Process the pixels in the image for population data --- + # Read the gray layer (1) to get an np.array of this band # Rasterio doesn't support lower than float32 readout - # Hence np_pop_raster will have nbytes = 4 * shape[0] * shape[1] - + # Hence np_pop_raster will have nbytes = 4 * width * height np_pop_raster = src.read(1, window=current_window) - # Set np_pop_valid to all pixels in np_pop_raster that aren't 'nodata' - np_pop_valid = np_pop_raster[np_pop_raster != src.nodata] - # Set 'nodata' values to 0 np_pop_raster[np_pop_raster == src.nodata] = 0 - # Set np_pop_xy to pixel locations of non zero values (same order as np_pop_valid) + # Set np_pop_xy to pixel locations of non zero values np_pop_xy = np_pop_raster.nonzero() # Transform to get [ x, y ] array @@ -776,6 +773,9 @@ def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions): # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.uintc np_pop_xy = np.array([np_pop_xy[0], np_pop_xy[1]]).T.astype("I") + # Extract the values from the locations of non zero pixels + np_pop_valid = np_pop_raster[np_pop_xy.T[0], np_pop_xy.T[1]] + return np_pop_valid, np_pop_xy @@ -890,6 +890,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False): expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim # Introduce a max byte size to avoid overfilling RAM + # Ensure worldpop_byte_limit > 883 * 10**6 (minimum memory for 'US') worldpop_byte_limit = 3096 * 10**6 # If the rasterio read will be within byte limit @@ -914,7 +915,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False): if out_logging: logger.info( "Stage 3 of 4: compute_population for " - + str(country_rows["country"][0]) + + str(country_rows.iloc[0]["country"]) + ": Expected size of file readout was " + str(expected_bytes_input_read // 10**6) + " Megabytes. As the limit is " @@ -945,26 +946,31 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ worldpop_y_dim, worldpop_x_dim = src.shape block_y_dim, block_x_dim = src.block_shapes[0] - # Calculate the bytes for reading one block from the image (float32) - read_block_size = 4 * block_y_dim * block_x_dim + # Set the windows x dimension to the input x dimension (width) + window_x_dim = worldpop_x_dim + # From testing we can assume max x dimension will always fit in memory: + # Largest memory requirement is 'US' at ~882.1 MB = 4 * 512 * window_x_dim + # Hence worldpop_byte_limit has to be greater than 883 MB + + # As the window spans the x dimension, set column offset to 0 + window_col_off = 0 + + # Calculate the bytes for reading the window using window_x_dim (float32) + read_block_size = 4 * block_y_dim * window_x_dim # Calculate the amount of blocks that fit into the memory budget + # Using the calculated x dimension window_block_count = worldpop_byte_limit // read_block_size - # The input files have blocks that vary the x dimension and keep y dimension to 512 - # Hence set the dimension of the windows to the same x dimension (width) - window_x_dim = block_x_dim - window_col_off = 0 - # Multiply the y_dimension by the amount of blocks in the window # window_y_dim will be height of the window window_y_dim = window_block_count * block_y_dim # Calculate the y ranges of the blocks to scan the image # y_range_start will serve as row offset - y_range_start = np.arange(0, worldpop_y_dim, window_y_dim) + window_row_off = np.arange(0, worldpop_y_dim, window_y_dim) - for row_off in y_range_start: + for row_off in window_row_off: window_dimensions = [window_col_off, row_off, window_x_dim, window_y_dim] print("Running window: ", window_dimensions) @@ -973,6 +979,10 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ WorldPop_inputfile, window_dimensions ) + # If no values are present in the current window skip the remaining steps + if len(np_pop_val) == 0: + continue + # get the geomask with id mappings region_geomask, id_mapping = compute_geomask_region( country_rows, transform, window_dimensions, windowed=True @@ -985,9 +995,9 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ # Loop the regions and write population to df_pop_count for i in range(len(windowed_pop_count)): - pop_count, gadm_id = windowed_pop_count.iloc[i] + gadm_id, pop_count = windowed_pop_count.iloc[i] # Select the row with the same "GADM_ID" and set the population count - df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] = pop_count + df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] += pop_count return df_pop_count @@ -1023,6 +1033,7 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): df_pop_count = pd.DataFrame(np_pop_count, columns=["pop"]) df_pop_count["GADM_ID"] = np.append(np.array("NaN"), id_mapping.values) + df_pop_count = df_pop_count[["GADM_ID", "pop"]] return df_pop_count @@ -1084,7 +1095,7 @@ def add_population_data( # Loop the regions and write population to df_gadm for i in range(len(df_pop_count)): - pop_count, gadm_id = df_pop_count.iloc[i] + gadm_id, pop_count = df_pop_count.iloc[i] # Select the row with the same "GADM_ID" and set the population count df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count From 369084a9f5abce1a40044baa0ececf6a8550b403 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Sat, 1 Apr 2023 00:19:08 +0200 Subject: [PATCH 08/29] Kosovo naming difference GADM --- scripts/build_shapes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index d9c58dc05..e20a7de48 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -77,8 +77,11 @@ def download_GADM(country_code, update=False, out_logging=False): gpkg file per country """ + if country_code == "XK": + GADM_filename = f"gadm41_XKO" + else: + GADM_filename = f"gadm41_{two_2_three_digits_country(country_code)}" - GADM_filename = f"gadm41_{two_2_three_digits_country(country_code)}" GADM_url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/{GADM_filename}.gpkg" GADM_inputfile_gpkg = os.path.join( From ad38dae436eb62b636c33a44fcafab9e4d964521 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Sun, 2 Apr 2023 21:07:32 +0200 Subject: [PATCH 09/29] gadm_layer_id bug fix --- scripts/build_shapes.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index e20a7de48..47b6d9127 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -177,6 +177,9 @@ def get_GADM_layer( geodf_list = [] for country_code in country_list: + # Set the current layer id (cur_layer_id) to global layer_id + cur_layer_id = layer_id + # download file gpkg file_gpkg, name_file = download_GADM(country_code, update, outlogging) @@ -184,18 +187,18 @@ def get_GADM_layer( list_layers = fiona.listlayers(file_gpkg) # get layer name - if (layer_id < 0) or (layer_id >= len(list_layers)): + if (cur_layer_id < 0) or (cur_layer_id >= len(list_layers)): # when layer id is negative or larger than the number of layers, select the last layer - layer_id = len(list_layers) - 1 + cur_layer_id = len(list_layers) - 1 # read gpkg file - geodf_temp = gpd.read_file(file_gpkg, layer="ADM_ADM_" + str(layer_id)).to_crs( - geo_crs - ) + geodf_temp = gpd.read_file( + file_gpkg, layer="ADM_ADM_" + str(cur_layer_id), engine="pyogrio" + ).to_crs(geo_crs) geodf_temp = filter_gadm( geodf=geodf_temp, - layer=layer_id, + layer=cur_layer_id, cc=country_code, contended_flag=contended_flag, output_nonstd_to_csv=False, @@ -203,7 +206,7 @@ def get_GADM_layer( # create a subindex column that is useful # in the GADM processing of sub-national zones - geodf_temp["GADM_ID"] = geodf_temp[f"GID_{layer_id}"] + geodf_temp["GADM_ID"] = geodf_temp[f"GID_{cur_layer_id}"] # append geodataframes geodf_list.append(geodf_temp) From 7524b98f8fe662af90325b063b154fa6112cff39 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Sun, 2 Apr 2023 22:44:19 +0200 Subject: [PATCH 10/29] intro numba ~14x speedup sum_values_using_geomask --- scripts/build_shapes.py | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 47b6d9127..741763ef9 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -44,7 +44,9 @@ import cProfile import os -import psutil +from numba import njit +from numba.core import types +from numba.typed import Dict # Function for profiling functions [temporary] @@ -1016,7 +1018,11 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): """ # Initialize a dictionary - dict_id = {0: 0} + dict_id = Dict.empty( + key_type=types.int64, + value_type=types.int64, + ) + dict_id[0] = 0 counter = 1 # Loop over ip mapping and add indicies to the dictionary for ID_index in np.array(id_mapping.index): @@ -1026,6 +1032,28 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): # Declare an array to contain population counts np_pop_count = np.zeros(len(id_mapping) + 1) + np_pop_count = loop_and_extact_val_x_y( + np_pop_count, np_pop_val, np_pop_xy, region_geomask, dict_id + ) + + df_pop_count = pd.DataFrame(np_pop_count, columns=["pop"]) + df_pop_count["GADM_ID"] = np.append(np.array("NaN"), id_mapping.values) + df_pop_count = df_pop_count[["GADM_ID", "pop"]] + + return df_pop_count + + +@njit +def loop_and_extact_val_x_y( + np_pop_count, np_pop_val, np_pop_xy, region_geomask, dict_id +): + """ + Function + + + + """ + # Loop the population data for i in range(len(np_pop_val)): cur_value = np_pop_val[i] @@ -1037,11 +1065,7 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): # Add the current value to the population np_pop_count[dict_id[cur_id]] += cur_value - df_pop_count = pd.DataFrame(np_pop_count, columns=["pop"]) - df_pop_count["GADM_ID"] = np.append(np.array("NaN"), id_mapping.values) - df_pop_count = df_pop_count[["GADM_ID", "pop"]] - - return df_pop_count + return np_pop_count def add_population_data( From 2ba62a60a6121074b95d42b2d9f995a640cc505b Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Mon, 3 Apr 2023 23:49:13 +0200 Subject: [PATCH 11/29] Documentation update 1 --- scripts/build_shapes.py | 86 ++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 27 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 741763ef9..02d8e8f4a 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -745,17 +745,21 @@ def get_worldpop_features(WorldPop_inputfile): src.meta["transform"]: Representation of the affine transform src.shape: Dimensions of the input image """ - # Open the file using rasterio with rasterio.open(WorldPop_inputfile) as src: return src.meta["transform"], src.shape def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions): """ - Function - - - + Function to extract data from .tif input file + ------- + Inputs: + WorldPop_inputfile: file location of worldpop file + window_dimensions: dimensions of window used when reading file + -------- + Outputs: + np_pop_valid: array filled with values for each nonzero pixel in the worldpop file + np_pop_xy: array with [x,y] coordinates of the corresponding nonzero values in np_pop_valid """ col_off, row_off, width, height = window_dimensions @@ -791,10 +795,19 @@ def compute_geomask_region( country_rows, affine_transform, window_dimensions, windowed=False ): """ - Function - - - + Function to mask geometries into np_map_ID using an incrementing counter + ------- + Inputs: + country_rows: geoDataFrame filled with geometries and their GADM_ID + affine_transform: affine transform of current image + window_dimensions: dimensions of window used when reading file + windowed: True if called during windowed processing (default = False) + -------- + Outputs: + np_map_ID.astype("H"): np_map_ID contains an ID for each location (undefined is 0) + dimensions are taken from window_dimensions, .astype("H") for memory savings + pd.DataFrame(id_to_GADM_ID).set_index(0): + DataFrame of the mapping from id (from counter) to GADM_ID """ col_off, row_off, x_axis_len, y_axis_len = window_dimensions @@ -823,7 +836,7 @@ def compute_geomask_region( ) # Set an empty numpy array with the dimensions of the country .tif file - # np_map_ID will contain a ID for each location (undefined is 0) + # np_map_ID will contain an ID for each location (undefined is 0) # ID corresponds to a specific geometry in country_rows np_map_ID = np.zeros((y_axis_len, x_axis_len)) @@ -872,21 +885,16 @@ def compute_geomask_region( def compute_population(country_rows, WorldPop_inputfile, out_logging=False): """ Function computes the population for the given country rows - + ------- Inputs: - ------- - country_rows: - - WorldPop_inputfile: - - + country_rows: geoDataFrame filled with geometries and their GADM_ID + WorldPop_inputfile: file location of worldpop file + out_logging: Boolean to enable logging + -------- Outputs: - -------- df_pop_count: Dataframe with columns - "pop" containing population of GADM_ID region - "GADM_ID" - - """ # Get the features of the worldpop input file transform, worldpop_dim = get_worldpop_features(WorldPop_inputfile) @@ -941,8 +949,16 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False): def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_limit): """ Function - - + ------- + Inputs: + country_rows: geoDataFrame filled with geometries and their GADM_ID + WorldPop_inputfile: file location of worldpop file + worldpop_byte_limit: Memory limit which determines the window size + -------- + Outputs: + df_pop_count: Dataframe with columns + - "pop" containing population of GADM_ID region + - "GADM_ID" """ # Create a dataframe to store the population data @@ -1013,8 +1029,17 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): """ Function - - + ------- + Inputs: + np_pop_val: array filled with values for each nonzero pixel in the worldpop file + np_pop_xy: array with [x,y] coordinates of the corresponding nonzero values in np_pop_valid + region_geomask: + id_mapping: + -------- + Outputs: + df_pop_count: Dataframe with columns + - "GADM_ID" + - "pop" containing population of GADM_ID region """ # Initialize a dictionary @@ -1049,9 +1074,16 @@ def loop_and_extact_val_x_y( ): """ Function - - - + ------- + Inputs: + np_pop_count: + np_pop_val: + np_pop_xy: + region_geomask: + dict_id: + -------- + Outputs: + np_pop_count: np.array containing population counts """ # Loop the population data From 5517827f712505342cb5f565844fe58de73f75e1 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Thu, 13 Apr 2023 18:22:41 +0200 Subject: [PATCH 12/29] Removal of profile function --- scripts/build_shapes.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 02d8e8f4a..14114d92f 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -40,29 +40,11 @@ sets_path_to_root("pypsa-earth") -# Imports for profiling [temporary] -import cProfile -import os - from numba import njit from numba.core import types from numba.typed import Dict -# Function for profiling functions [temporary] -def profile(func): - """Decorator for run function profile""" - - def wrapper(*args, **kwargs): - profile_filename = func.__name__ + ".prof" - profiler = cProfile.Profile() - result = profiler.runcall(func, *args, **kwargs) - profiler.dump_stats(profile_filename) - return result - - return wrapper - - def download_GADM(country_code, update=False, out_logging=False): """ Download gpkg file from GADM for a given country code From b41bdc8f773b0e5c555c9b5393e1ed2eccaf20d4 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Sun, 16 Apr 2023 14:53:28 +0200 Subject: [PATCH 13/29] Split download worldpop, logging stages update --- scripts/build_shapes.py | 72 ++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 14114d92f..52486bb93 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -79,7 +79,7 @@ def download_GADM(country_code, update=False, out_logging=False): if not os.path.exists(GADM_inputfile_gpkg) or update is True: if out_logging: logger.warning( - f"Stage 4 of 4: {GADM_filename} of country {two_digits_2_name_country(country_code)} does not exist, downloading to {GADM_inputfile_gpkg}" + f"Stage 5 of 5: {GADM_filename} of country {two_digits_2_name_country(country_code)} does not exist, downloading to {GADM_inputfile_gpkg}" ) # create data/osm directory os.makedirs(os.path.dirname(GADM_inputfile_gpkg), exist_ok=True) @@ -224,7 +224,7 @@ def countries(countries, geo_crs, contended_flag, update=False, out_logging=Fals "Create country shapes" if out_logging: - logger.info("Stage 1 of 4: Create country shapes") + logger.info("Stage 1 of 5: Create country shapes") # download data if needed and get the layer id 0, corresponding to the countries df_countries = get_GADM_layer( @@ -252,7 +252,7 @@ def countries(countries, geo_crs, contended_flag, update=False, out_logging=Fals def country_cover(country_shapes, eez_shapes=None, out_logging=False, distance=0.1): if out_logging: - logger.info("Stage 3 of 4: Merge country shapes to create continent shape") + logger.info("Stage 3 of 5: Merge country shapes to create continent shape") shapes = country_shapes.apply(lambda x: x.buffer(distance)) shapes_list = list(shapes) @@ -331,7 +331,7 @@ def eez( """ if out_logging: - logger.info("Stage 2 of 4: Create offshore shapes") + logger.info("Stage 2 of 5: Create offshore shapes") # load data df_eez = load_EEZ(countries, geo_crs, EEZ_gpkg) @@ -432,8 +432,6 @@ def download_WorldPop_standard( WorldPop_filename : str Name of the file """ - if out_logging: - logger.info("Stage 3 of 4: Download WorldPop datasets (standard)") if country_code == "XK": WorldPop_filename = f"srb_ppp_{year}_UNadj_constrained.tif" @@ -456,7 +454,7 @@ def download_WorldPop_standard( if not os.path.exists(WorldPop_inputfile) or update is True: if out_logging: logger.warning( - f"Stage 3 of 4: {WorldPop_filename} does not exist, downloading to {WorldPop_inputfile}" + f"Stage 3 of 5: {WorldPop_filename} does not exist, downloading to {WorldPop_inputfile}" ) # create data/osm directory os.makedirs(os.path.dirname(WorldPop_inputfile), exist_ok=True) @@ -470,7 +468,7 @@ def download_WorldPop_standard( loaded = True break if not loaded: - logger.error(f"Stage 3 of 4: Impossible to download {WorldPop_filename}") + logger.error(f"Stage 3 of 5: Impossible to download {WorldPop_filename}") return WorldPop_inputfile, WorldPop_filename @@ -499,8 +497,6 @@ def download_WorldPop_API( WorldPop_filename : str Name of the file """ - if out_logging: - logger.info("Stage 3 of 4: Download WorldPop datasets (API)") WorldPop_filename = f"{two_2_three_digits_country(country_code).lower()}_ppp_{year}_UNadj_constrained.tif" # Request to get the file @@ -524,7 +520,7 @@ def download_WorldPop_API( loaded = True break if not loaded: - logger.error(f"Stage 3 of 4: Impossible to download {WorldPop_filename}") + logger.error(f"Stage 3 of 5: Impossible to download {WorldPop_filename}") return WorldPop_inputfile, WorldPop_filename @@ -536,7 +532,7 @@ def convert_GDP(name_file_nc, year=2015, out_logging=False): """ if out_logging: - logger.info("Stage 4 of 4: Access to GDP raster data") + logger.info("Stage 5 of 5: Access to GDP raster data") # tif namefile name_file_tif = name_file_nc[:-2] + "tif" @@ -563,7 +559,7 @@ def convert_GDP(name_file_nc, year=2015, out_logging=False): if year not in list_years: if out_logging: logger.warning( - f"Stage 4 of 4 GDP data of year {year} not found, selected the most recent data ({int(list_years[-1])})" + f"Stage 5 of 5 GDP data of year {year} not found, selected the most recent data ({int(list_years[-1])})" ) year = float(list_years[-1]) @@ -587,7 +583,7 @@ def load_GDP( """ if out_logging: - logger.info("Stage 4 of 4: Access to GDP raster data") + logger.info("Stage 5 of 5: Access to GDP raster data") # path of the nc file name_file_tif = name_file_nc[:-2] + "tif" @@ -598,7 +594,7 @@ def load_GDP( if update | (not os.path.exists(GDP_tif)): if out_logging: logger.warning( - f"Stage 4 of 4: File {name_file_tif} not found, the file will be produced by processing {name_file_nc}" + f"Stage 5 of 5: File {name_file_tif} not found, the file will be produced by processing {name_file_nc}" ) convert_GDP(name_file_nc, year, out_logging) @@ -660,7 +656,7 @@ def add_gdp_data( - Includes a new column ["gdp"] """ if out_logging: - logger.info("Stage 4 of 4: Add gdp data to GADM GeoDataFrame") + logger.info("Stage 5 of 5: Add gdp data to GADM GeoDataFrame") # initialize new gdp column df_gadm["gdp"] = 0.0 @@ -912,7 +908,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False): else: if out_logging: logger.info( - "Stage 3 of 4: compute_population for " + "Stage 4 of 5: compute_population for " + str(country_rows.iloc[0]["country"]) + ": Expected size of file readout was " + str(expected_bytes_input_read // 10**6) @@ -1109,32 +1105,48 @@ def add_population_data( - Includes a new column ["pop"] """ - if out_logging: - logger.info("Stage 3 of 4: Add population data to GADM GeoDataFrame") - - # initialize new population column + # Initialize new population column df_gadm["pop"] = 0.0 - tqdm_kwargs = dict( + # Initialize new dict to hold worldpop_inputfile strings + dict_worldpop_file_locations = {} + + if out_logging: + logger.info( + "Stage 3 of 5: Download WorldPop datasets, method: " + str(worldpop_method) + ) + + tqdm_kwargs_download = dict( ascii=False, - desc="Compute population ", + desc="Downloading worldpop file per country", ) - with tqdm(total=len(country_codes), **tqdm_kwargs) as pbar: + with tqdm(total=len(country_codes), **tqdm_kwargs_download) as pbar: for c_code in country_codes: - # get subset by country code - country_rows = df_gadm.loc[df_gadm["country"] == c_code] - # Download worldpop image (if required) and get file location WorldPop_inputfile, WorldPop_filename = download_WorldPop( c_code, worldpop_method, year, update, out_logging ) + dict_worldpop_file_locations[c_code] = WorldPop_inputfile + pbar.update(1) + + if out_logging: + logger.info("Stage 4 of 5: Add population data to GADM GeoDataFrame") + + tqdm_kwargs_compute = dict( + ascii=False, + desc="Compute population per country", + ) + with tqdm(total=len(country_codes), **tqdm_kwargs_compute) as pbar: + for c_code in country_codes: + # get subset by country code + country_rows = df_gadm.loc[df_gadm["country"] == c_code] if out_logging: - logger.info("Stage 3 of 4: Calculating population of " + str(c_code)) + logger.info("Stage 4 of 5: Calculating population of " + str(c_code)) # Calculate the population for each geometry given in country_rows df_pop_count = compute_population( - country_rows, WorldPop_inputfile, out_logging + country_rows, dict_worldpop_file_locations[c_code], out_logging ) # Loop the regions and write population to df_gadm @@ -1160,7 +1172,7 @@ def gadm( nchunks=None, ): if out_logging: - logger.info("Stage 3 of 4: Creation GADM GeoDataFrame") + logger.info("Stage 3 of 5: Creation GADM GeoDataFrame") # download data if needed and get the desired layer_id df_gadm = get_GADM_layer(countries, layer_id, geo_crs, contended_flag, update) From 30315f36dd8d7abe125fd98104a8807f18cb9641 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Sun, 16 Apr 2023 17:49:17 +0200 Subject: [PATCH 14/29] tqdm progress bar shows progress windowed approach --- scripts/build_shapes.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 52486bb93..b38167a6e 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -843,9 +843,9 @@ def compute_geomask_region( temp_mask = rasterio.features.geometry_mask( [cur_geometry], (y_axis_len, x_axis_len), - affine_transform, - invert=True, + transform=affine_transform, all_touched=True, + invert=True, ) # Map the values of counter value to np_map_ID @@ -860,7 +860,7 @@ def compute_geomask_region( return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0) -def compute_population(country_rows, WorldPop_inputfile, out_logging=False): +def compute_population(country_rows, WorldPop_inputfile, pbar, out_logging=False): """ Function computes the population for the given country rows ------- @@ -904,6 +904,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False): df_pop_count = sum_values_using_geomask( np_pop_val, np_pop_xy, country_geomask, id_mapping ) + pbar.update(1) else: if out_logging: @@ -918,13 +919,15 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False): ) # Calculate the population using windows df_pop_count = windowed_compute_population( - country_rows, WorldPop_inputfile, worldpop_byte_limit + country_rows, WorldPop_inputfile, worldpop_byte_limit, pbar ) return df_pop_count -def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_limit): +def windowed_compute_population( + country_rows, WorldPop_inputfile, worldpop_byte_limit, pbar +): """ Function ------- @@ -972,11 +975,12 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ # y_range_start will serve as row offset window_row_off = np.arange(0, worldpop_y_dim, window_y_dim) + # Calculate the percentage each window is of the image for pbar.update + window_percentage_of_image = 1 / len(window_row_off) + for row_off in window_row_off: window_dimensions = [window_col_off, row_off, window_x_dim, window_y_dim] - print("Running window: ", window_dimensions) - np_pop_val, np_pop_xy = get_worldpop_val_xy( WorldPop_inputfile, window_dimensions ) @@ -1001,6 +1005,8 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ # Select the row with the same "GADM_ID" and set the population count df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] += pop_count + pbar.update(window_percentage_of_image) + return df_pop_count @@ -1135,6 +1141,7 @@ def add_population_data( tqdm_kwargs_compute = dict( ascii=False, desc="Compute population per country", + bar_format="{desc}: {percentage:.3f}%|{bar}| {n:.3f}/{total_fmt} [{elapsed}<{remaining}", ) with tqdm(total=len(country_codes), **tqdm_kwargs_compute) as pbar: for c_code in country_codes: @@ -1146,7 +1153,7 @@ def add_population_data( # Calculate the population for each geometry given in country_rows df_pop_count = compute_population( - country_rows, dict_worldpop_file_locations[c_code], out_logging + country_rows, dict_worldpop_file_locations[c_code], pbar, out_logging ) # Loop the regions and write population to df_gadm @@ -1155,8 +1162,6 @@ def add_population_data( # Select the row with the same "GADM_ID" and set the population count df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count - pbar.update(1) - def gadm( worldpop_method, From 5f59ce11097ccbd70cb9ae28bdaddf58290a807e Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Sun, 16 Apr 2023 23:56:22 +0200 Subject: [PATCH 15/29] Multiprocessing approach 0.5 --- scripts/build_shapes.py | 88 +++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index b38167a6e..c42949725 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -676,9 +676,13 @@ def add_gdp_data( return df_gadm -def _init_process_pop(df_gadm_, year_, worldpop_method_): - global df_gadm, year, worldpop_method - df_gadm, year, worldpop_method = df_gadm_, year_, worldpop_method_ +def _init_process_pop(df_gadm_, dict_worldpop_file_locations_, out_logging_): + global df_gadm, dict_worldpop_file_locations, out_logging + df_gadm, dict_worldpop_file_locations, out_logging = ( + df_gadm_, + dict_worldpop_file_locations_, + out_logging_, + ) # Auxiliary function to calculate population data in a parallel way @@ -705,6 +709,20 @@ def _process_func_pop(gadm_idxs): return df_gadm_subset +def _process_function_population(c_code): + # get subset by country code + country_rows = df_gadm.loc[df_gadm["country"] == c_code] + + if out_logging: + logger.info("Stage 4 of 5: Calculating population of " + str(c_code)) + + # Calculate the population for each geometry given in country_rows + df_pop_count = compute_population( + country_rows, dict_worldpop_file_locations[c_code], out_logging + ) + return df_pop_count + + # Auxiliary function to download WorldPop data in a parallel way def _process_func_download_pop(c_code): WorldPop_inputfile, WorldPop_filename = download_WorldPop( @@ -860,7 +878,7 @@ def compute_geomask_region( return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0) -def compute_population(country_rows, WorldPop_inputfile, pbar, out_logging=False): +def compute_population(country_rows, WorldPop_inputfile, out_logging=False): """ Function computes the population for the given country rows ------- @@ -885,7 +903,7 @@ def compute_population(country_rows, WorldPop_inputfile, pbar, out_logging=False # Introduce a max byte size to avoid overfilling RAM # Ensure worldpop_byte_limit > 883 * 10**6 (minimum memory for 'US') - worldpop_byte_limit = 3096 * 10**6 + worldpop_byte_limit = 1024 * 10**6 # If the rasterio read will be within byte limit if expected_bytes_input_read < worldpop_byte_limit: @@ -904,7 +922,6 @@ def compute_population(country_rows, WorldPop_inputfile, pbar, out_logging=False df_pop_count = sum_values_using_geomask( np_pop_val, np_pop_xy, country_geomask, id_mapping ) - pbar.update(1) else: if out_logging: @@ -919,15 +936,13 @@ def compute_population(country_rows, WorldPop_inputfile, pbar, out_logging=False ) # Calculate the population using windows df_pop_count = windowed_compute_population( - country_rows, WorldPop_inputfile, worldpop_byte_limit, pbar + country_rows, WorldPop_inputfile, worldpop_byte_limit ) return df_pop_count -def windowed_compute_population( - country_rows, WorldPop_inputfile, worldpop_byte_limit, pbar -): +def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_limit): """ Function ------- @@ -1005,8 +1020,6 @@ def windowed_compute_population( # Select the row with the same "GADM_ID" and set the population count df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] += pop_count - pbar.update(window_percentage_of_image) - return df_pop_count @@ -1138,29 +1151,34 @@ def add_population_data( if out_logging: logger.info("Stage 4 of 5: Add population data to GADM GeoDataFrame") - tqdm_kwargs_compute = dict( - ascii=False, - desc="Compute population per country", - bar_format="{desc}: {percentage:.3f}%|{bar}| {n:.3f}/{total_fmt} [{elapsed}<{remaining}", - ) - with tqdm(total=len(country_codes), **tqdm_kwargs_compute) as pbar: - for c_code in country_codes: - # get subset by country code - country_rows = df_gadm.loc[df_gadm["country"] == c_code] - - if out_logging: - logger.info("Stage 4 of 5: Calculating population of " + str(c_code)) - - # Calculate the population for each geometry given in country_rows - df_pop_count = compute_population( - country_rows, dict_worldpop_file_locations[c_code], pbar, out_logging - ) - - # Loop the regions and write population to df_gadm - for i in range(len(df_pop_count)): - gadm_id, pop_count = df_pop_count.iloc[i] - # Select the row with the same "GADM_ID" and set the population count - df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count + lock = mp.Lock() + kwargs = { + "initializer": _init_process_pop, + "initargs": ( + df_gadm, + dict_worldpop_file_locations, + out_logging, + ), + "processes": 2, + } + # Spawn processes with the parameters from kwargs + with mp.get_context("spawn").Pool(**kwargs) as pool: + # Create a progress bar + with tqdm( + total=len(country_codes), desc="Compute population per country" + ) as pbar: + # Give the pool a workload + for df_pop_count in pool.imap_unordered( + _process_function_population, country_codes + ): + # Acquire the lock before accessing df_gadm + with lock: + # Loop the regions and write population to df_gadm + for i in range(len(df_pop_count)): + gadm_id, pop_count = df_pop_count.iloc[i] + # Select the row with the same "GADM_ID" and set the population count + df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count + pbar.update(1) def gadm( From 9da3b26e676352ede45c00bd349a58060f63321b Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Mon, 17 Apr 2023 22:51:21 +0200 Subject: [PATCH 16/29] Env update --- envs/environment.fixed.yaml | 3 ++- envs/environment.yaml | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/envs/environment.fixed.yaml b/envs/environment.fixed.yaml index ac532f167..351b16f39 100644 --- a/envs/environment.fixed.yaml +++ b/envs/environment.fixed.yaml @@ -260,7 +260,7 @@ dependencies: - notebook-shim=0.1.0 - nspr=4.32 - nss=3.78 -- numba=0.53.1 +- numba=0.56.4 - numexpr=2.8.3 - numpy=1.23.2 - openjpeg=2.4.0 @@ -306,6 +306,7 @@ dependencies: - pyct=0.4.6 - pyct-core=0.4.6 - pygments=2.13.0 +- pyogrio=0.5.1 - pyomo=6.4.2 - pyopenssl=22.0.0 - pyparsing=3.0.9 diff --git a/envs/environment.yaml b/envs/environment.yaml index 480767d0b..b4755a554 100644 --- a/envs/environment.yaml +++ b/envs/environment.yaml @@ -43,6 +43,8 @@ dependencies: - matplotlib<=3.5.2 - reverse-geocode - country_converter +- pyogyrio +- numba # Keep in conda environment when calling ipython - ipython From 2f3b6a15b08b11b72cc9b0c3fc98a0b401c7858c Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Mon, 17 Apr 2023 22:58:16 +0200 Subject: [PATCH 17/29] Env update - misstyped pyogrio --- envs/environment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/environment.yaml b/envs/environment.yaml index b4755a554..9b8447928 100644 --- a/envs/environment.yaml +++ b/envs/environment.yaml @@ -43,7 +43,7 @@ dependencies: - matplotlib<=3.5.2 - reverse-geocode - country_converter -- pyogyrio +- pyogrio - numba # Keep in conda environment when calling ipython From 418207e28d023382afe3c21460e08c9595e2d822 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Tue, 18 Apr 2023 00:31:32 +0200 Subject: [PATCH 18/29] Adding mem_mb, integrating into compute_population --- Snakefile | 2 +- scripts/build_shapes.py | 46 ++++++++++++++++++++++++++++------------- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/Snakefile b/Snakefile index 9bf6bed82..dfab7f120 100644 --- a/Snakefile +++ b/Snakefile @@ -221,7 +221,7 @@ rule build_shapes: "benchmarks/" + RDIR + "build_shapes" threads: 1 resources: - mem_mb=500, + mem_mb=2048, script: "scripts/build_shapes.py" diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 113df61e7..7ea579546 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -678,11 +678,14 @@ def add_gdp_data( return df_gadm -def _init_process_pop(df_gadm_, dict_worldpop_file_locations_, out_logging_): - global df_gadm, dict_worldpop_file_locations, out_logging - df_gadm, dict_worldpop_file_locations, out_logging = ( +def _init_process_pop( + df_gadm_, dict_worldpop_file_locations_, mem_read_limit_per_process_, out_logging_ +): + global df_gadm, dict_worldpop_file_locations, mem_read_limit_per_process, out_logging + df_gadm, dict_worldpop_file_locations, mem_read_limit_per_process, out_logging = ( df_gadm_, dict_worldpop_file_locations_, + mem_read_limit_per_process_, out_logging_, ) @@ -720,7 +723,10 @@ def _process_function_population(c_code): # Calculate the population for each geometry given in country_rows df_pop_count = compute_population( - country_rows, dict_worldpop_file_locations[c_code], out_logging + country_rows, + dict_worldpop_file_locations[c_code], + mem_read_limit_per_process, + out_logging, ) return df_pop_count @@ -880,7 +886,9 @@ def compute_geomask_region( return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0) -def compute_population(country_rows, WorldPop_inputfile, out_logging=False): +def compute_population( + country_rows, WorldPop_inputfile, mem_read_limit_per_process, out_logging=False +): """ Function computes the population for the given country rows ------- @@ -905,7 +913,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False): # Introduce a max byte size to avoid overfilling RAM # Ensure worldpop_byte_limit > 883 * 10**6 (minimum memory for 'US') - worldpop_byte_limit = 1024 * 10**6 + worldpop_byte_limit = max(883, mem_read_limit_per_process) * 10**6 # If the rasterio read will be within byte limit if expected_bytes_input_read < worldpop_byte_limit: @@ -1106,6 +1114,7 @@ def add_population_data( year=2020, update=False, out_logging=False, + mem_read_limit_per_process=1024, nprocesses=2, nchunks=2, disable_progressbar=False, @@ -1159,6 +1168,7 @@ def add_population_data( "initargs": ( df_gadm, dict_worldpop_file_locations, + mem_read_limit_per_process, out_logging, ), "processes": 2, @@ -1189,6 +1199,7 @@ def gadm( countries, geo_crs, contended_flag, + mem_mb, layer_id=2, update=False, out_logging=False, @@ -1213,6 +1224,7 @@ def gadm( ) if worldpop_method != False: + mem_read_limit_per_process = mem_mb / nprocesses # add the population data to the dataset add_population_data( df_gadm, @@ -1221,6 +1233,7 @@ def gadm( year, update, out_logging, + mem_read_limit_per_process, nprocesses=nprocesses, nchunks=nchunks, ) @@ -1263,18 +1276,22 @@ def gadm( out = snakemake.output + EEZ_gpkg = snakemake.input["eez"] + mem_mb = snakemake.resources["mem_mb"] + countries_list = snakemake.config["countries"] + geo_crs = snakemake.config["crs"]["geo_crs"] + distance_crs = snakemake.config["crs"]["distance_crs"] + + contended_flag = snakemake.config["build_shape_options"]["contended_flag"] + gdp_method = snakemake.config["build_shape_options"]["gdp_method"] layer_id = snakemake.config["build_shape_options"]["gadm_layer_id"] - update = snakemake.config["build_shape_options"]["update_file"] - out_logging = snakemake.config["build_shape_options"]["out_logging"] - year = snakemake.config["build_shape_options"]["year"] nprocesses = snakemake.config["build_shape_options"]["nprocesses"] - contended_flag = snakemake.config["build_shape_options"]["contended_flag"] - EEZ_gpkg = snakemake.input["eez"] + out_logging = snakemake.config["build_shape_options"]["out_logging"] + update = snakemake.config["build_shape_options"]["update_file"] worldpop_method = snakemake.config["build_shape_options"]["worldpop_method"] - gdp_method = snakemake.config["build_shape_options"]["gdp_method"] - geo_crs = snakemake.config["crs"]["geo_crs"] - distance_crs = snakemake.config["crs"]["distance_crs"] + year = snakemake.config["build_shape_options"]["year"] + nchunks = snakemake.config["build_shape_options"]["nchunks"] if nchunks < nprocesses: logger.info(f"build_shapes data chunks set to nprocesses {nprocesses}") @@ -1306,6 +1323,7 @@ def gadm( countries_list, geo_crs, contended_flag, + mem_mb, layer_id, update, out_logging, From e96f3ac39f1379e78aee9d5ffb679a4abc2dc545 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Fri, 21 Apr 2023 01:01:33 +0200 Subject: [PATCH 19/29] compute_geomask_region adjustment --- scripts/build_shapes.py | 137 ++++++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 62 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 7ea579546..4fa245468 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -813,77 +813,86 @@ def compute_geomask_region( pd.DataFrame(id_to_GADM_ID).set_index(0): DataFrame of the mapping from id (from counter) to GADM_ID """ - col_off, row_off, x_axis_len, y_axis_len = window_dimensions + try: + col_off, row_off, x_axis_len, y_axis_len = [int(i) for i in window_dimensions] - if windowed: - # Declare a transformer with given affine_transform - transformer = rasterio.transform.AffineTransformer(affine_transform) - - # Obtain the coordinates of the upper left corner of window - window_topleft_longitude, window_topleft_latitude = transformer.xy( - row_off, col_off - ) - - # Obtain the coordinates of the bottom right corner of window - window_botright_longitude, window_botright_latitude = transformer.xy( - row_off + y_axis_len, col_off + x_axis_len - ) + if windowed: + # Declare a transformer with given affine_transform + transformer = rasterio.transform.AffineTransformer(affine_transform) - # Set the current transform to the correct lat and long - affine_transform = rasterio.Affine( - affine_transform[0], - affine_transform[1], - window_topleft_longitude, - affine_transform[3], - affine_transform[4], - window_topleft_latitude, - ) + # Obtain the coordinates of the upper left corner of window + window_topleft_longitude, window_topleft_latitude = transformer.xy( + row_off, col_off + ) - # Set an empty numpy array with the dimensions of the country .tif file - # np_map_ID will contain an ID for each location (undefined is 0) - # ID corresponds to a specific geometry in country_rows - np_map_ID = np.zeros((y_axis_len, x_axis_len)) + # Obtain the coordinates of the bottom right corner of window + window_botright_longitude, window_botright_latitude = transformer.xy( + row_off + y_axis_len, col_off + x_axis_len + ) - # List to contain the mappings of id to GADM_ID - id_to_GADM_ID = [] + # Set the current transform to the correct lat and long + affine_transform = rasterio.Affine( + affine_transform[0], + affine_transform[1], + window_topleft_longitude, + affine_transform[3], + affine_transform[4], + window_topleft_latitude, + ) - # Loop the country_rows geoDataFrame - for i in range(len(country_rows)): - # Set the current geometry - cur_geometry = country_rows.iloc[i]["geometry"] + # Set an empty numpy array with the dimensions of the country .tif file + # np_map_ID will contain an ID for each location (undefined is 0) + # ID corresponds to a specific geometry in country_rows + np_map_ID = np.zeros((y_axis_len, x_axis_len)) + + # List to contain the mappings of id to GADM_ID + id_to_GADM_ID = [] + + # Loop the country_rows geoDataFrame + for i in range(len(country_rows)): + # Set the current geometry + cur_geometry = country_rows.iloc[i]["geometry"] + + # In windowed mode we check if bounds of geometry overlap the window + if windowed: + latitude_min = cur_geometry.bounds[1] + latitude_max = cur_geometry.bounds[3] + + # In the following cases we don't have to continue the loop + # If the geometry is above the window + if latitude_min > window_topleft_latitude: + continue + # If the geometry is below the window + if latitude_max < window_botright_latitude: + continue + + # Generate a mask for the specific geometry + temp_mask = rasterio.features.geometry_mask( + [cur_geometry], + (y_axis_len, x_axis_len), + transform=affine_transform, + all_touched=True, + invert=True, + ) - # In windowed mode we check if bounds of geometry overlap the window - if windowed: - latitude_min = cur_geometry.bounds[1] - latitude_max = cur_geometry.bounds[3] - - # In the following cases we don't have to continue the loop - # If the geometry is above the window - if latitude_min > window_topleft_latitude: - continue - # If the geometry is below the window - if latitude_max < window_botright_latitude: - continue - - # Generate a mask for the specific geometry - temp_mask = rasterio.features.geometry_mask( - [cur_geometry], - (y_axis_len, x_axis_len), - transform=affine_transform, - all_touched=True, - invert=True, - ) + # Map the values of counter value to np_map_ID + np_map_ID[temp_mask] = i + 1 - # Map the values of counter value to np_map_ID - np_map_ID[temp_mask] = i + 1 + # Store the id -> GADM_ID mapping + id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]]) - # Store the id -> GADM_ID mapping - id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]]) + if len(id_to_GADM_ID) > 0: + id_result = pd.DataFrame(id_to_GADM_ID).set_index(0) + else: + id_result = pd.DataFrame() - # Return np_map_ID as type 'H' np.ushort - # 'H' -> https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort - # This lowers memory usage, note: ID has to be within the range [0,65535] - return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0) + # Return np_map_ID as type 'H' np.ushort + # 'H' -> https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort + # This lowers memory usage, note: ID has to be within the range [0,65535] + return np_map_ID.astype("H"), id_result + except: + print(country_rows, affine_transform, window_dimensions, windowed) + raise def compute_population( @@ -1019,6 +1028,10 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ country_rows, transform, window_dimensions, windowed=True ) + # If no values are present in the id_mapping skip the remaining steps + if len(id_mapping) == 0: + continue + # Calculate the population for each region windowed_pop_count = sum_values_using_geomask( np_pop_val, np_pop_xy, region_geomask, id_mapping From 6791d65a3355984c9a6e7102e3ddf8928687740e Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Sun, 23 Apr 2023 12:36:10 +0200 Subject: [PATCH 20/29] Cleanup 1 --- scripts/build_shapes.py | 55 +---------------------------------------- 1 file changed, 1 insertion(+), 54 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 4fa245468..ade7399c8 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -690,37 +690,10 @@ def _init_process_pop( ) -# Auxiliary function to calculate population data in a parallel way -def _process_func_pop(gadm_idxs): - # get subset by country code - df_gadm_subset = df_gadm.loc[gadm_idxs].copy() - - country_sublist = df_gadm_subset["country"].unique() - - for c_code in country_sublist: - # get worldpop image - WorldPop_inputfile, WorldPop_filename = download_WorldPop( - c_code, worldpop_method, year, False, False - ) - - idxs_country = df_gadm_subset[df_gadm_subset["country"] == c_code].index - - with rasterio.open(WorldPop_inputfile) as src: - for i in idxs_country: - df_gadm_subset.loc[i, "pop"] = _sum_raster_over_mask( - df_gadm_subset.geometry.loc[i], src - ) - - return df_gadm_subset - - def _process_function_population(c_code): # get subset by country code country_rows = df_gadm.loc[df_gadm["country"] == c_code] - if out_logging: - logger.info("Stage 4 of 5: Calculating population of " + str(c_code)) - # Calculate the population for each geometry given in country_rows df_pop_count = compute_population( country_rows, @@ -731,13 +704,6 @@ def _process_function_population(c_code): return df_pop_count -# Auxiliary function to download WorldPop data in a parallel way -def _process_func_download_pop(c_code): - WorldPop_inputfile, WorldPop_filename = download_WorldPop( - c_code, worldpop_method, year, False, False - ) - - def get_worldpop_features(WorldPop_inputfile): """ Function to extract data from .tif input file @@ -943,16 +909,6 @@ def compute_population( ) else: - if out_logging: - logger.info( - "Stage 4 of 5: compute_population for " - + str(country_rows.iloc[0]["country"]) - + ": Expected size of file readout was " - + str(expected_bytes_input_read // 10**6) - + " Megabytes. As the limit is " - + str(worldpop_byte_limit // 10**6) - + " Megabytes switching to windowed approach" - ) # Calculate the population using windows df_pop_count = windowed_compute_population( country_rows, WorldPop_inputfile, worldpop_byte_limit @@ -1129,7 +1085,6 @@ def add_population_data( out_logging=False, mem_read_limit_per_process=1024, nprocesses=2, - nchunks=2, disable_progressbar=False, ): """ @@ -1184,7 +1139,7 @@ def add_population_data( mem_read_limit_per_process, out_logging, ), - "processes": 2, + "processes": nprocesses, } # Spawn processes with the parameters from kwargs with mp.get_context("spawn").Pool(**kwargs) as pool: @@ -1218,7 +1173,6 @@ def gadm( out_logging=False, year=2020, nprocesses=None, - nchunks=None, ): if out_logging: logger.info("Stage 3 of 5: Creation GADM GeoDataFrame") @@ -1248,7 +1202,6 @@ def gadm( out_logging, mem_read_limit_per_process, nprocesses=nprocesses, - nchunks=nchunks, ) if gdp_method != False: @@ -1305,11 +1258,6 @@ def gadm( worldpop_method = snakemake.config["build_shape_options"]["worldpop_method"] year = snakemake.config["build_shape_options"]["year"] - nchunks = snakemake.config["build_shape_options"]["nchunks"] - if nchunks < nprocesses: - logger.info(f"build_shapes data chunks set to nprocesses {nprocesses}") - nchunks = nprocesses - country_shapes = countries( countries_list, geo_crs, @@ -1342,6 +1290,5 @@ def gadm( out_logging, year, nprocesses=nprocesses, - nchunks=nchunks, ) save_to_geojson(gadm_shapes, out.gadm_shapes) From 41edcbbc85dddec7b1bddf4a2abc7ab4855fb521 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Sun, 23 Apr 2023 13:16:19 +0200 Subject: [PATCH 21/29] Contested ISO codes fix for df_gadm --- scripts/build_shapes.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index ade7399c8..3282a3f6f 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -1215,11 +1215,9 @@ def gadm( ) # renaming 3 letter to 2 letter ISO code before saving GADM file - # solves issue: https://github.com/pypsa-meets-earth/pypsa-earth/issues/671 - df_gadm["GADM_ID"] = ( - df_gadm["GADM_ID"] - .str.split(".") - .apply(lambda id: three_2_two_digits_country(id[0]) + "." + ".".join(id[1:])) + # In the case of a contested territory in the form 'Z00.00_0', save 'AA.Z00.00_0' + df_gadm["GADM_ID"] = df_gadm["country"] + df_gadm["GADM_ID"].apply( + lambda x: "." + x if x[0] == "Z" else x[3:] ) df_gadm.set_index("GADM_ID", inplace=True) df_gadm["geometry"] = df_gadm["geometry"].map(_simplify_polys) From 74177b27a8ea3da2c691b5937f280e03654e0fcc Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Sun, 23 Apr 2023 14:19:37 +0200 Subject: [PATCH 22/29] Readability update --- scripts/build_shapes.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 3282a3f6f..b0e142656 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -731,9 +731,9 @@ def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions): np_pop_valid: array filled with values for each nonzero pixel in the worldpop file np_pop_xy: array with [x,y] coordinates of the corresponding nonzero values in np_pop_valid """ - col_off, row_off, width, height = window_dimensions + col_offset, row_offset, width, height = window_dimensions - current_window = Window(col_off, row_off, width, height) + current_window = Window(col_offset, row_offset, width, height) # Open the file using rasterio with rasterio.open(WorldPop_inputfile) as src: @@ -780,7 +780,9 @@ def compute_geomask_region( DataFrame of the mapping from id (from counter) to GADM_ID """ try: - col_off, row_off, x_axis_len, y_axis_len = [int(i) for i in window_dimensions] + col_offset, row_offset, x_axis_len, y_axis_len = [ + int(i) for i in window_dimensions + ] if windowed: # Declare a transformer with given affine_transform @@ -788,12 +790,12 @@ def compute_geomask_region( # Obtain the coordinates of the upper left corner of window window_topleft_longitude, window_topleft_latitude = transformer.xy( - row_off, col_off + row_offset, col_offset ) # Obtain the coordinates of the bottom right corner of window window_botright_longitude, window_botright_latitude = transformer.xy( - row_off + y_axis_len, col_off + x_axis_len + row_offset + y_axis_len, col_offset + x_axis_len ) # Set the current transform to the correct lat and long @@ -887,7 +889,7 @@ def compute_population( expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim # Introduce a max byte size to avoid overfilling RAM - # Ensure worldpop_byte_limit > 883 * 10**6 (minimum memory for 'US') + # Ensure worldpop_byte_limit >= 883 * 10**6 (minimum memory for 'US') worldpop_byte_limit = max(883, mem_read_limit_per_process) * 10**6 # If the rasterio read will be within byte limit @@ -948,7 +950,7 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ # Hence worldpop_byte_limit has to be greater than 883 MB # As the window spans the x dimension, set column offset to 0 - window_col_off = 0 + window_col_offset = 0 # Calculate the bytes for reading the window using window_x_dim (float32) read_block_size = 4 * block_y_dim * window_x_dim @@ -963,13 +965,11 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_ # Calculate the y ranges of the blocks to scan the image # y_range_start will serve as row offset - window_row_off = np.arange(0, worldpop_y_dim, window_y_dim) - - # Calculate the percentage each window is of the image for pbar.update - window_percentage_of_image = 1 / len(window_row_off) + window_row_offset = np.arange(0, worldpop_y_dim, window_y_dim) - for row_off in window_row_off: - window_dimensions = [window_col_off, row_off, window_x_dim, window_y_dim] + # Loop the windows + for row_offset in window_row_offset: + window_dimensions = [window_col_offset, row_offset, window_x_dim, window_y_dim] np_pop_val, np_pop_xy = get_worldpop_val_xy( WorldPop_inputfile, window_dimensions From 605c6b8d29f29fad88bdcdd1ff7bc596a3ef5e16 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Sun, 23 Apr 2023 22:46:06 +0200 Subject: [PATCH 23/29] Restructuring for adaptability --- Snakefile | 2 +- scripts/build_shapes.py | 473 +++++++++++++++++++--------------------- 2 files changed, 226 insertions(+), 249 deletions(-) diff --git a/Snakefile b/Snakefile index dfab7f120..1a19b56a7 100644 --- a/Snakefile +++ b/Snakefile @@ -221,7 +221,7 @@ rule build_shapes: "benchmarks/" + RDIR + "build_shapes" threads: 1 resources: - mem_mb=2048, + mem_mb=3096, script: "scripts/build_shapes.py" diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index b0e142656..b18e20bcd 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -678,45 +678,52 @@ def add_gdp_data( return df_gadm -def _init_process_pop( - df_gadm_, dict_worldpop_file_locations_, mem_read_limit_per_process_, out_logging_ -): - global df_gadm, dict_worldpop_file_locations, mem_read_limit_per_process, out_logging - df_gadm, dict_worldpop_file_locations, mem_read_limit_per_process, out_logging = ( - df_gadm_, +def _init_process_pop(df_gadm_, df_tasks_, dict_worldpop_file_locations_): + global df_gadm, df_tasks + df_gadm, df_tasks = df_gadm_, df_tasks_ + + global dict_worldpop_file_locations, out_logging + dict_worldpop_file_locations, out_logging = ( dict_worldpop_file_locations_, - mem_read_limit_per_process_, - out_logging_, + False, ) -def _process_function_population(c_code): +def new_process_function_population(row_id): + # Get the current task values + current_row = df_tasks.iloc[row_id] + + c_code = current_row["c_code"] + window_dimensions = current_row["window_dimensions"] + transform = current_row["affine_transform"] + latlong_topleft = current_row["latlong_coordinate_topleft"] + latlong_botright = current_row["latlong_coordinate_botright"] + + # Obtain the inputfile location from dict + WorldPop_inputfile = dict_worldpop_file_locations[c_code] # get subset by country code country_rows = df_gadm.loc[df_gadm["country"] == c_code] - # Calculate the population for each geometry given in country_rows - df_pop_count = compute_population( - country_rows, - dict_worldpop_file_locations[c_code], - mem_read_limit_per_process, - out_logging, + np_pop_val, np_pop_xy = get_worldpop_val_xy(WorldPop_inputfile, window_dimensions) + + # If no values are present in the current window skip the remaining steps + if len(np_pop_val) == 0: + return [] + + # get the geomask with id mappings + region_geomask, id_mapping = compute_geomask_region( + country_rows, transform, window_dimensions, latlong_topleft, latlong_botright ) - return df_pop_count + # If no values are present in the id_mapping skip the remaining steps + if len(id_mapping) == 0: + return [] -def get_worldpop_features(WorldPop_inputfile): - """ - Function to extract data from .tif input file - ------- - Inputs: - WorldPop_inputfile: String pointing to location of file - -------- - Outputs: - src.meta["transform"]: Representation of the affine transform - src.shape: Dimensions of the input image - """ - with rasterio.open(WorldPop_inputfile) as src: - return src.meta["transform"], src.shape + # Calculate the population for each region + windowed_pop_count = sum_values_using_geomask( + np_pop_val, np_pop_xy, region_geomask, id_mapping + ) + return windowed_pop_count def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions): @@ -762,7 +769,7 @@ def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions): def compute_geomask_region( - country_rows, affine_transform, window_dimensions, windowed=False + country_rows, affine_transform, window_dimensions, latlong_topleft, latlong_botright ): """ Function to mask geometries into np_map_ID using an incrementing counter @@ -776,230 +783,60 @@ def compute_geomask_region( Outputs: np_map_ID.astype("H"): np_map_ID contains an ID for each location (undefined is 0) dimensions are taken from window_dimensions, .astype("H") for memory savings - pd.DataFrame(id_to_GADM_ID).set_index(0): + id_result: DataFrame of the mapping from id (from counter) to GADM_ID """ - try: - col_offset, row_offset, x_axis_len, y_axis_len = [ - int(i) for i in window_dimensions - ] - - if windowed: - # Declare a transformer with given affine_transform - transformer = rasterio.transform.AffineTransformer(affine_transform) - - # Obtain the coordinates of the upper left corner of window - window_topleft_longitude, window_topleft_latitude = transformer.xy( - row_offset, col_offset - ) - - # Obtain the coordinates of the bottom right corner of window - window_botright_longitude, window_botright_latitude = transformer.xy( - row_offset + y_axis_len, col_offset + x_axis_len - ) - - # Set the current transform to the correct lat and long - affine_transform = rasterio.Affine( - affine_transform[0], - affine_transform[1], - window_topleft_longitude, - affine_transform[3], - affine_transform[4], - window_topleft_latitude, - ) - - # Set an empty numpy array with the dimensions of the country .tif file - # np_map_ID will contain an ID for each location (undefined is 0) - # ID corresponds to a specific geometry in country_rows - np_map_ID = np.zeros((y_axis_len, x_axis_len)) - - # List to contain the mappings of id to GADM_ID - id_to_GADM_ID = [] - - # Loop the country_rows geoDataFrame - for i in range(len(country_rows)): - # Set the current geometry - cur_geometry = country_rows.iloc[i]["geometry"] - - # In windowed mode we check if bounds of geometry overlap the window - if windowed: - latitude_min = cur_geometry.bounds[1] - latitude_max = cur_geometry.bounds[3] - - # In the following cases we don't have to continue the loop - # If the geometry is above the window - if latitude_min > window_topleft_latitude: - continue - # If the geometry is below the window - if latitude_max < window_botright_latitude: - continue - - # Generate a mask for the specific geometry - temp_mask = rasterio.features.geometry_mask( - [cur_geometry], - (y_axis_len, x_axis_len), - transform=affine_transform, - all_touched=True, - invert=True, - ) - - # Map the values of counter value to np_map_ID - np_map_ID[temp_mask] = i + 1 - - # Store the id -> GADM_ID mapping - id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]]) - - if len(id_to_GADM_ID) > 0: - id_result = pd.DataFrame(id_to_GADM_ID).set_index(0) - else: - id_result = pd.DataFrame() - - # Return np_map_ID as type 'H' np.ushort - # 'H' -> https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort - # This lowers memory usage, note: ID has to be within the range [0,65535] - return np_map_ID.astype("H"), id_result - except: - print(country_rows, affine_transform, window_dimensions, windowed) - raise - - -def compute_population( - country_rows, WorldPop_inputfile, mem_read_limit_per_process, out_logging=False -): - """ - Function computes the population for the given country rows - ------- - Inputs: - country_rows: geoDataFrame filled with geometries and their GADM_ID - WorldPop_inputfile: file location of worldpop file - out_logging: Boolean to enable logging - -------- - Outputs: - df_pop_count: Dataframe with columns - - "pop" containing population of GADM_ID region - - "GADM_ID" - """ - # Get the features of the worldpop input file - transform, worldpop_dim = get_worldpop_features(WorldPop_inputfile) - - worldpop_y_dim, worldpop_x_dim = worldpop_dim - - # Rasterio doesn't support lower than float32 readout - # Hence reading the file will take up: nbytes = 4 * y_dim * x_dim - expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim - - # Introduce a max byte size to avoid overfilling RAM - # Ensure worldpop_byte_limit >= 883 * 10**6 (minimum memory for 'US') - worldpop_byte_limit = max(883, mem_read_limit_per_process) * 10**6 - - # If the rasterio read will be within byte limit - if expected_bytes_input_read < worldpop_byte_limit: - # Call functions with input dimensions for window - window_dim = [0, 0, worldpop_x_dim, worldpop_y_dim] - - # Get population values and corresponding x,y coords - np_pop_val, np_pop_xy = get_worldpop_val_xy(WorldPop_inputfile, window_dim) - - # get the geomask with id mappings - country_geomask, id_mapping = compute_geomask_region( - country_rows, transform, window_dim - ) - - # Calculate the population for each region - df_pop_count = sum_values_using_geomask( - np_pop_val, np_pop_xy, country_geomask, id_mapping - ) - - else: - # Calculate the population using windows - df_pop_count = windowed_compute_population( - country_rows, WorldPop_inputfile, worldpop_byte_limit - ) - - return df_pop_count - - -def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_limit): - """ - Function - ------- - Inputs: - country_rows: geoDataFrame filled with geometries and their GADM_ID - WorldPop_inputfile: file location of worldpop file - worldpop_byte_limit: Memory limit which determines the window size - -------- - Outputs: - df_pop_count: Dataframe with columns - - "pop" containing population of GADM_ID region - - "GADM_ID" - - """ - # Create a dataframe to store the population data - df_pop_count = country_rows.loc[:, ["GADM_ID", "pop"]].copy() - - # Open the file using rasterio - with rasterio.open(WorldPop_inputfile) as src: - transform = src.meta["transform"] - worldpop_y_dim, worldpop_x_dim = src.shape - block_y_dim, block_x_dim = src.block_shapes[0] - - # Set the windows x dimension to the input x dimension (width) - window_x_dim = worldpop_x_dim - # From testing we can assume max x dimension will always fit in memory: - # Largest memory requirement is 'US' at ~882.1 MB = 4 * 512 * window_x_dim - # Hence worldpop_byte_limit has to be greater than 883 MB - - # As the window spans the x dimension, set column offset to 0 - window_col_offset = 0 + col_offset, row_offset, x_axis_len, y_axis_len = window_dimensions - # Calculate the bytes for reading the window using window_x_dim (float32) - read_block_size = 4 * block_y_dim * window_x_dim + # Set an empty numpy array with the dimensions of the country .tif file + # np_map_ID will contain an ID for each location (undefined is 0) + # ID corresponds to a specific geometry in country_rows + np_map_ID = np.zeros((y_axis_len, x_axis_len)) - # Calculate the amount of blocks that fit into the memory budget - # Using the calculated x dimension - window_block_count = worldpop_byte_limit // read_block_size + # List to contain the mappings of id to GADM_ID + id_to_GADM_ID = [] - # Multiply the y_dimension by the amount of blocks in the window - # window_y_dim will be height of the window - window_y_dim = window_block_count * block_y_dim + # Loop the country_rows geoDataFrame + for i in range(len(country_rows)): + # Set the current geometry + cur_geometry = country_rows.iloc[i]["geometry"] - # Calculate the y ranges of the blocks to scan the image - # y_range_start will serve as row offset - window_row_offset = np.arange(0, worldpop_y_dim, window_y_dim) + # Check if bounds of geometry overlap the window + latitude_min = cur_geometry.bounds[1] + latitude_max = cur_geometry.bounds[3] - # Loop the windows - for row_offset in window_row_offset: - window_dimensions = [window_col_offset, row_offset, window_x_dim, window_y_dim] - - np_pop_val, np_pop_xy = get_worldpop_val_xy( - WorldPop_inputfile, window_dimensions - ) - - # If no values are present in the current window skip the remaining steps - if len(np_pop_val) == 0: + # In the following cases we don't have to continue the loop + # If the geometry is above the window + if latitude_min > latlong_topleft[0]: + continue + # If the geometry is below the window + if latitude_max < latlong_botright[0]: continue - # get the geomask with id mappings - region_geomask, id_mapping = compute_geomask_region( - country_rows, transform, window_dimensions, windowed=True + # Generate a mask for the specific geometry + temp_mask = rasterio.features.geometry_mask( + [cur_geometry], + (y_axis_len, x_axis_len), + transform=affine_transform, + all_touched=True, + invert=True, ) - # If no values are present in the id_mapping skip the remaining steps - if len(id_mapping) == 0: - continue + # Map the values of counter value to np_map_ID + np_map_ID[temp_mask] = i + 1 - # Calculate the population for each region - windowed_pop_count = sum_values_using_geomask( - np_pop_val, np_pop_xy, region_geomask, id_mapping - ) + # Store the id -> GADM_ID mapping + id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]]) - # Loop the regions and write population to df_pop_count - for i in range(len(windowed_pop_count)): - gadm_id, pop_count = windowed_pop_count.iloc[i] - # Select the row with the same "GADM_ID" and set the population count - df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] += pop_count + if len(id_to_GADM_ID) > 0: + id_result = pd.DataFrame(id_to_GADM_ID).set_index(0) + else: + id_result = pd.DataFrame() - return df_pop_count + # Return np_map_ID as type 'H' np.ushort + # 'H' -> https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort + # This lowers memory usage, note: ID has to be within the range [0,65535] + return np_map_ID.astype("H"), id_result def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): @@ -1076,6 +913,124 @@ def loop_and_extact_val_x_y( return np_pop_count +def calculate_transform_and_coords_for_window( + current_transform, window_dimensions, original_window=False +): + col_offset, row_offset, x_axis_len, y_axis_len = window_dimensions + + # Declare a transformer with given affine_transform + transformer = rasterio.transform.AffineTransformer(current_transform) + + # Obtain the coordinates of the upper left corner of window + window_topleft_longitude, window_topleft_latitude = transformer.xy( + row_offset, col_offset + ) + + # Obtain the coordinates of the bottom right corner of window + window_botright_longitude, window_botright_latitude = transformer.xy( + row_offset + y_axis_len, col_offset + x_axis_len + ) + + if original_window: + adjusted_transform = current_transform + else: + # Set the adjusted transform to the correct lat and long + adjusted_transform = rasterio.Affine( + current_transform[0], + current_transform[1], + window_topleft_longitude, + current_transform[3], + current_transform[4], + window_topleft_latitude, + ) + + coordinate_topleft = [window_topleft_latitude, window_topleft_longitude] + coordinate_botright = [window_botright_latitude, window_botright_longitude] + + return [adjusted_transform, coordinate_topleft, coordinate_botright] + + +def generate_df_tasks(c_code, mem_read_limit_per_process, WorldPop_inputfile): + """ + Function to generate a list of tasks based on the memory constraints + One task represents a single window of the image + ------- + Inputs: + c_code: country code + mem_read_limit_per_process: memory limit for src.read() operation + -------- + Outputs: + Dataframe of task_list + """ + task_list = [] + + # Read out dimensions and transform of image + with rasterio.open(WorldPop_inputfile) as src: + worldpop_y_dim, worldpop_x_dim = src.shape + transform = src.meta["transform"] + block_y_dim, block_x_dim = [int(i) for i in src.block_shapes[0]] + + # Rasterio doesn't support lower than float32 readout + # Hence reading the file will take up: nbytes = 4 * y_dim * x_dim + expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim + + # Introduce a limit that avoids overfilling RAM during readout + # Ensure worldpop_byte_limit >= 883 * 10**6 (minimum memory for 'US') + worldpop_byte_limit = max(883, mem_read_limit_per_process) * 10**6 + + if expected_bytes_input_read < worldpop_byte_limit: + # If the rasterio read will be within byte limit + bool_original_window = True + # Set the window to entire image dimensions + current_window = [0, 0, worldpop_x_dim, worldpop_y_dim] + # Get latlong coordinates of window + transform_and_coords = calculate_transform_and_coords_for_window( + transform, current_window, bool_original_window + ) + task_list.append( + [c_code, current_window, bool_original_window] + transform_and_coords + ) + else: + # Reading operation has to be split up into multiple windows + bool_original_window = False + # Set the windows x dimension to the input x dimension (width) + window_x_dim = worldpop_x_dim + # From testing we can assume max x dimension will always fit in memory: + # Largest memory requirement is 'US' at ~882.1 MB = 4 * 512 * window_x_dim + # Hence worldpop_byte_limit has to be greater than 883 MB + + # As the window spans the x dimension, set column offset to 0 + window_col_offset = 0 + + # Calculate the bytes for reading the window using window_x_dim (float32) + read_block_size = 4 * block_y_dim * window_x_dim + + # Calculate the amount of blocks that fit into the memory budget + # Using the calculated x dimension + window_block_count = int(worldpop_byte_limit // read_block_size) + + # Multiply the y_dimension by the amount of blocks in the window + # window_y_dim will be height of the window + window_y_dim = window_block_count * block_y_dim + + # Calculate row offsets for all windows + window_row_offset = np.arange(0, worldpop_y_dim, window_y_dim) + + # Loop the windows and add task for each one to task_list + for row_offset in window_row_offset: + current_window = [window_col_offset, row_offset, window_x_dim, window_y_dim] + + transform_and_coords = calculate_transform_and_coords_for_window( + transform, current_window, bool_original_window + ) + + task_list.append( + [c_code, current_window, bool_original_window] + transform_and_coords + ) + + return pd.DataFrame(task_list) + + def add_population_data( df_gadm, country_codes, @@ -1109,6 +1064,9 @@ def add_population_data( # Initialize new dict to hold worldpop_inputfile strings dict_worldpop_file_locations = {} + # Initialize new dataframe to hold tasks for processing + df_tasks = pd.DataFrame() + if out_logging: logger.info( "Stage 3 of 5: Download WorldPop datasets, method: " + str(worldpop_method) @@ -1116,7 +1074,7 @@ def add_population_data( tqdm_kwargs_download = dict( ascii=False, - desc="Downloading worldpop file per country", + desc="Downloading worldpop file per country and assigning tasks", ) with tqdm(total=len(country_codes), **tqdm_kwargs_download) as pbar: for c_code in country_codes: @@ -1125,31 +1083,50 @@ def add_population_data( c_code, worldpop_method, year, update, out_logging ) dict_worldpop_file_locations[c_code] = WorldPop_inputfile + + df_new_tasks = generate_df_tasks( + c_code, mem_read_limit_per_process, WorldPop_inputfile + ) + + df_tasks = pd.concat([df_tasks, df_new_tasks], ignore_index=True) + pbar.update(1) + df_tasks.columns = [ + "c_code", + "window_dimensions", + "is_original_window", + "affine_transform", + "latlong_coordinate_topleft", + "latlong_coordinate_botright", + ] + if out_logging: logger.info("Stage 4 of 5: Add population data to GADM GeoDataFrame") + if out_logging: + logger.info("Stage 4 of 5: Starting multiprocessing") + + tqdm_kwargs_compute = dict( + ascii=False, desc="Compute population per window", unit=" window" + ) lock = mp.Lock() kwargs = { "initializer": _init_process_pop, "initargs": ( df_gadm, + df_tasks, dict_worldpop_file_locations, - mem_read_limit_per_process, - out_logging, ), "processes": nprocesses, } # Spawn processes with the parameters from kwargs with mp.get_context("spawn").Pool(**kwargs) as pool: # Create a progress bar - with tqdm( - total=len(country_codes), desc="Compute population per country" - ) as pbar: + with tqdm(total=len(df_tasks), **tqdm_kwargs_compute) as pbar: # Give the pool a workload for df_pop_count in pool.imap_unordered( - _process_function_population, country_codes + new_process_function_population, range(len(df_tasks)) ): # Acquire the lock before accessing df_gadm with lock: @@ -1157,7 +1134,7 @@ def add_population_data( for i in range(len(df_pop_count)): gadm_id, pop_count = df_pop_count.iloc[i] # Select the row with the same "GADM_ID" and set the population count - df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count + df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] += pop_count pbar.update(1) From 0e168daf4bd1f431175800d2ec20c69eef765507 Mon Sep 17 00:00:00 2001 From: GridGrapher <127969728+GridGrapher@users.noreply.github.com> Date: Wed, 26 Apr 2023 00:53:02 +0200 Subject: [PATCH 24/29] Docstrings and comments --- scripts/build_shapes.py | 79 +++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 22 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 6acf4c169..7537a4a7d 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -682,14 +682,23 @@ def _init_process_pop(df_gadm_, df_tasks_, dict_worldpop_file_locations_): global df_gadm, df_tasks df_gadm, df_tasks = df_gadm_, df_tasks_ - global dict_worldpop_file_locations, out_logging - dict_worldpop_file_locations, out_logging = ( - dict_worldpop_file_locations_, - False, - ) + global dict_worldpop_file_locations + dict_worldpop_file_locations = dict_worldpop_file_locations_ -def new_process_function_population(row_id): +def process_function_population(row_id): + """ + Function that reads the task from df_tasks and executes all the methods + to obtain population values for the specified region + ------- + Inputs: + row_id: integer which indicates a specific row of df_tasks + -------- + Outputs: + windowed_pop_count: Dataframe containing "GADM_ID" and "pop" columns + It represents the amount of population per region (GADM_ID), + for the settings given by the row in df_tasks + """ # Get the current task values current_row = df_tasks.iloc[row_id] @@ -776,9 +785,10 @@ def compute_geomask_region( ------- Inputs: country_rows: geoDataFrame filled with geometries and their GADM_ID - affine_transform: affine transform of current image + affine_transform: affine transform of current window window_dimensions: dimensions of window used when reading file - windowed: True if called during windowed processing (default = False) + latlong_topleft: [latitude, longitude] of top left corner of the window + latlong_botright: [latitude, longitude] of bottom right corner of the window -------- Outputs: np_map_ID.astype("H"): np_map_ID contains an ID for each location (undefined is 0) @@ -801,11 +811,11 @@ def compute_geomask_region( # Set the current geometry cur_geometry = country_rows.iloc[i]["geometry"] - # Check if bounds of geometry overlap the window latitude_min = cur_geometry.bounds[1] latitude_max = cur_geometry.bounds[3] - # In the following cases we don't have to continue the loop + # Check if bounds of geometry overlap the window + # In the following cases we can skip the rest of the loop # If the geometry is above the window if latitude_min > latlong_topleft[0]: continue @@ -841,13 +851,15 @@ def compute_geomask_region( def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): """ - Function + Function that sums all the population values in np_pop_val into the correct GADM_ID + It uses np_pop_xy to access the key stored in region_geomask[x][y] + The relation of this key to GADM_ID is stored in id_mapping ------- Inputs: np_pop_val: array filled with values for each nonzero pixel in the worldpop file np_pop_xy: array with [x,y] coordinates of the corresponding nonzero values in np_pop_valid - region_geomask: - id_mapping: + region_geomask: array with dimensions of window, values are keys that map to GADM_ID using id_mapping + id_mapping: Dataframe that contains mappings of region_geomask values to GADM_IDs -------- Outputs: df_pop_count: Dataframe with columns @@ -870,6 +882,7 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): # Declare an array to contain population counts np_pop_count = np.zeros(len(id_mapping) + 1) + # Calculate population count of region using a numba njit compiled function np_pop_count = loop_and_extact_val_x_y( np_pop_count, np_pop_val, np_pop_xy, region_geomask, dict_id ) @@ -886,14 +899,16 @@ def loop_and_extact_val_x_y( np_pop_count, np_pop_val, np_pop_xy, region_geomask, dict_id ): """ - Function + Function that will be compiled using @njit (numba) + It takes all the population values from np_pop_val and stores them in np_pop_count + where each location in np_pop_count is mapped to a GADM_ID through dict_id (id_mapping by extension) ------- Inputs: - np_pop_count: - np_pop_val: - np_pop_xy: - region_geomask: - dict_id: + np_pop_count: np.zeros array, which will store population counts + np_pop_val: array filled with values for each nonzero pixel in the worldpop file + np_pop_xy: array with [x,y] coordinates of the corresponding nonzero values in np_pop_valid + region_geomask: array with dimensions of window, values are keys that map to GADM_ID using id_mapping + dict_id: numba typed.dict containing id_mapping.index -> location in np_pop_count -------- Outputs: np_pop_count: np.array containing population counts @@ -916,9 +931,26 @@ def loop_and_extact_val_x_y( def calculate_transform_and_coords_for_window( current_transform, window_dimensions, original_window=False ): + """ + Function which calculates the [lat,long] corners of the window given window_dimensions, + if not(original_window) it also changes the affine transform to match the window + ------- + Inputs: + current_transform: affine transform of source image + window_dimensions: dimensions of window used when reading file + original_window: boolean to track if window covers entire country + -------- + Outputs: + A list of: [ + adjusted_transform: affine transform adjusted to window + coordinate_topleft: [latitude, longitude] of top left corner of the window + coordinate_botright: [latitude, longitude] of bottom right corner of the window + ] + + """ col_offset, row_offset, x_axis_len, y_axis_len = window_dimensions - # Declare a transformer with given affine_transform + # Declare a affine transformer with given current_transform transformer = rasterio.transform.AffineTransformer(current_transform) # Obtain the coordinates of the upper left corner of window @@ -932,6 +964,7 @@ def calculate_transform_and_coords_for_window( ) if original_window: + # If the window covers the entire country then use original affine transform adjusted_transform = current_transform else: # Set the adjusted transform to the correct lat and long @@ -958,6 +991,7 @@ def generate_df_tasks(c_code, mem_read_limit_per_process, WorldPop_inputfile): Inputs: c_code: country code mem_read_limit_per_process: memory limit for src.read() operation + WorldPop_inputfile: file location of worldpop file -------- Outputs: Dataframe of task_list @@ -1002,7 +1036,7 @@ def generate_df_tasks(c_code, mem_read_limit_per_process, WorldPop_inputfile): # As the window spans the x dimension, set column offset to 0 window_col_offset = 0 - # Calculate the bytes for reading the window using window_x_dim (float32) + # Calculate the bytes for reading the window using window_x_dim (readout is float32) read_block_size = 4 * block_y_dim * window_x_dim # Calculate the amount of blocks that fit into the memory budget @@ -1084,6 +1118,7 @@ def add_population_data( ) dict_worldpop_file_locations[c_code] = WorldPop_inputfile + # Given the memory constraint, generate a task for all windows to cover the country (c_code) df_new_tasks = generate_df_tasks( c_code, mem_read_limit_per_process, WorldPop_inputfile ) @@ -1126,7 +1161,7 @@ def add_population_data( with tqdm(total=len(df_tasks), **tqdm_kwargs_compute) as pbar: # Give the pool a workload for df_pop_count in pool.imap_unordered( - new_process_function_population, range(len(df_tasks)) + process_function_population, range(len(df_tasks)) ): # Acquire the lock before accessing df_gadm with lock: From a991c7018d5fec08fb42f48452eb33f64fe08f81 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Aug 2023 11:23:14 +0000 Subject: [PATCH 25/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/build_shapes.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 3f20a141d..32bf83e22 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -714,7 +714,8 @@ def _init_process_pop(df_gadm_, df_tasks_, dict_worldpop_file_locations_): def process_function_population(row_id): """ - Function that reads the task from df_tasks and executes all the methods + Function that reads the task from df_tasks and executes all the methods. + to obtain population values for the specified region ------- Inputs: @@ -877,8 +878,9 @@ def compute_geomask_region( def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): """ - Function that sums all the population values in np_pop_val into the correct GADM_ID - It uses np_pop_xy to access the key stored in region_geomask[x][y] + Function that sums all the population values in np_pop_val into the correct + GADM_ID It uses np_pop_xy to access the key stored in region_geomask[x][y] + The relation of this key to GADM_ID is stored in id_mapping ------- Inputs: @@ -891,7 +893,6 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): df_pop_count: Dataframe with columns - "GADM_ID" - "pop" containing population of GADM_ID region - """ # Initialize a dictionary dict_id = Dict.empty( @@ -925,8 +926,9 @@ def loop_and_extact_val_x_y( np_pop_count, np_pop_val, np_pop_xy, region_geomask, dict_id ): """ - Function that will be compiled using @njit (numba) - It takes all the population values from np_pop_val and stores them in np_pop_count + Function that will be compiled using @njit (numba) It takes all the + population values from np_pop_val and stores them in np_pop_count. + where each location in np_pop_count is mapped to a GADM_ID through dict_id (id_mapping by extension) ------- Inputs: @@ -958,7 +960,9 @@ def calculate_transform_and_coords_for_window( current_transform, window_dimensions, original_window=False ): """ - Function which calculates the [lat,long] corners of the window given window_dimensions, + Function which calculates the [lat,long] corners of the window given + window_dimensions, + if not(original_window) it also changes the affine transform to match the window ------- Inputs: @@ -972,7 +976,6 @@ def calculate_transform_and_coords_for_window( coordinate_topleft: [latitude, longitude] of top left corner of the window coordinate_botright: [latitude, longitude] of bottom right corner of the window ] - """ col_offset, row_offset, x_axis_len, y_axis_len = window_dimensions @@ -1011,7 +1014,8 @@ def calculate_transform_and_coords_for_window( def generate_df_tasks(c_code, mem_read_limit_per_process, WorldPop_inputfile): """ - Function to generate a list of tasks based on the memory constraints + Function to generate a list of tasks based on the memory constraints. + One task represents a single window of the image ------- Inputs: From 62364678555d0057e6f3ebd83125a28122a82963 Mon Sep 17 00:00:00 2001 From: davide-f Date: Tue, 15 Aug 2023 18:38:21 +0200 Subject: [PATCH 26/29] Minor improvements to code and documentation --- config.default.yaml | 1 - config.tutorial.yaml | 1 - doc/configtables/build_shape_options.csv | 1 - scripts/build_shapes.py | 20 +++++++++++++++++--- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/config.default.yaml b/config.default.yaml index 40a00662d..08a9ee890 100644 --- a/config.default.yaml +++ b/config.default.yaml @@ -90,7 +90,6 @@ build_shape_options: out_logging: true # When true, logging is printed to console year: 2020 # reference year used to derive shapes, info on population and info on GDP nprocesses: 3 # number of processes to be used in build_shapes - nchunks: 3 # number of data chuncks for build_shapes. If not specified or smaller than nprocesses, specified as nprocesses worldpop_method: "standard" # "standard" pulls from web 1kmx1km raster, "api" pulls from API 100mx100m raster, false (not "false") no pop addition to shape which is useful when generating only cutout gdp_method: "standard" # "standard" pulls from web 1x1km raster, false (not "false") no gdp addition to shape which useful when generating only cutout contended_flag: "set_by_country" # "set_by_country" assigns the contended areas to the countries according to the GADM database, "drop" drops these contended areas from the model diff --git a/config.tutorial.yaml b/config.tutorial.yaml index 2115c00df..20f154200 100644 --- a/config.tutorial.yaml +++ b/config.tutorial.yaml @@ -104,7 +104,6 @@ build_shape_options: out_logging: true # When true, logging is printed to console year: 2020 # reference year used to derive shapes, info on population and info on GDP nprocesses: 2 # number of processes to be used in build_shapes - nchunks: 2 # number of data chuncks for build_shapes. If not specified or smaller than nprocesses, specified as nprocesses worldpop_method: "standard" # "standard" pulls from web 1kmx1km raster, "api" pulls from API 100mx100m raster, false (not "false") no pop addition to shape which is useful when generating only cutout gdp_method: "standard" # "standard" pulls from web 1x1km raster, false (not "false") no gdp addition to shape which useful when generating only cutout contended_flag: "set_by_country" # "set_by_country" assigns the contended areas to the countries according to the GADM database, "drop" drops these contended areas from the model diff --git a/doc/configtables/build_shape_options.csv b/doc/configtables/build_shape_options.csv index 446dab8bd..78df78713 100644 --- a/doc/configtables/build_shape_options.csv +++ b/doc/configtables/build_shape_options.csv @@ -4,7 +4,6 @@ update_file, bool, "{True, False}", "True: all input files are downloaded again out_logging, bool, "{True, False}", "True: Logging is printed in the console." year,, "past year; e.g. YYYY", "Reference year used to derive shapes, info on population and info on GDP." nprocesses, int,, "Number of processes to be used in build_shapes." -nchunks, int,, "Number of data chuncks for build_shapes. If not specified or smaller than nprocesses, specified as nprocesses." worldpop_method,, "{""standard"", ""api"", false}", "Specifies how population is added to every shape: ""standard"" pulls from web 1kmx1km raster; ""api"" pulls from API 100mx100m raster; false (not ""false"") no population addition to shape. This is useful when generating only cutout." gdp_method,, "{""standard"", false}", "Specifies how GDP is added to every shape: ""standard"" pulls from web 1x1km raster; false (not ""false"") no gdp addition to shape. This is useful when generating only cutout." contended_flag,, "{""set_by_country"", ""drop""}", "Specifies what to do with contended countries: ""set_by_country"" assigns the contended areas to the countries according to the GADM database; ""drop"" drops the contended areas from the model." diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 32bf83e22..38b17c661 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -941,7 +941,6 @@ def loop_and_extact_val_x_y( Outputs: np_pop_count: np.array containing population counts """ - # Loop the population data for i in range(len(np_pop_val)): cur_value = np_pop_val[i] @@ -1108,6 +1107,19 @@ def add_population_data( ): """ Function to add population data to arbitrary number of shapes in a country. + It loads data from WorldPop raster files where each pixel represents the population in that square region. + Each square polygon (or pixel) is then mapped into the corresponding GADM shape. + Then the population in a GADM shape is identified by summing over all pixels mapped to that region. + + This is performed with an iterative approach: + 1. All necessary WorldPop data tiff file are downloaded + 2. The so-called windows are created to handle RAM limitations related to large WorldPop files. + Large WorldPop files require significant RAM to handle, which may not be available, + hence, the entire activity is decomposed into multiple windows (or tasks). + Each window represents a subset of a raster file on which the following algorithm is applied. + Note: when enough RAM is available only a window is created for efficiency purposes. + 3. Execute all tasks by summing the values of the pixels mapped into each GADM shape. + Parallelization applies in this task. Inputs: ------- @@ -1193,14 +1205,16 @@ def add_population_data( for df_pop_count in pool.imap_unordered( process_function_population, range(len(df_tasks)) ): - # Acquire the lock before accessing df_gadm + # Acquire the lock before accessing df_gadm and pbar with lock: # Loop the regions and write population to df_gadm for i in range(len(df_pop_count)): gadm_id, pop_count = df_pop_count.iloc[i] # Select the row with the same "GADM_ID" and set the population count df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] += pop_count - pbar.update(1) + + # update bar + pbar.update(1) def gadm( From b3074316e870426ad08c353af26e345552e7840e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Aug 2023 17:07:19 +0000 Subject: [PATCH 27/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/build_shapes.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 38b17c661..52a211a24 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -1107,10 +1107,11 @@ def add_population_data( ): """ Function to add population data to arbitrary number of shapes in a country. - It loads data from WorldPop raster files where each pixel represents the population in that square region. - Each square polygon (or pixel) is then mapped into the corresponding GADM shape. - Then the population in a GADM shape is identified by summing over all pixels mapped to that region. - + It loads data from WorldPop raster files where each pixel represents the + population in that square region. Each square polygon (or pixel) is then + mapped into the corresponding GADM shape. Then the population in a GADM + shape is identified by summing over all pixels mapped to that region. + This is performed with an iterative approach: 1. All necessary WorldPop data tiff file are downloaded 2. The so-called windows are created to handle RAM limitations related to large WorldPop files. @@ -1212,7 +1213,7 @@ def add_population_data( gadm_id, pop_count = df_pop_count.iloc[i] # Select the row with the same "GADM_ID" and set the population count df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] += pop_count - + # update bar pbar.update(1) From b3953fea7b3cf45e7ef509d638304715cd5b89fd Mon Sep 17 00:00:00 2001 From: davide-f Date: Tue, 15 Aug 2023 20:55:04 +0200 Subject: [PATCH 28/29] Fix kosovo and spelling --- scripts/build_shapes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 52a211a24..84963764e 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -461,10 +461,10 @@ def download_WorldPop_standard( """ if country_code == "XK": - WorldPop_filename = f"srb_ppp_{year}_UNadj_constrained.tif" + WorldPop_filename = f"kos_ppp_{year}_constrained.tif" WorldPop_urls = [ - f"https://data.worldpop.org/GIS/Population/Global_2000_2020_Constrained/2020/BSGM/SRB/{WorldPop_filename}", - f"https://data.worldpop.org/GIS/Population/Global_2000_2020_Constrained/2020/maxar_v1/SRB/{WorldPop_filename}", + f"https://data.worldpop.org/GIS/Population/Global_2000_2020_Constrained/2020/BSGM/KOS/{WorldPop_filename}", + f"https://data.worldpop.org/GIS/Population/Global_2000_2020_Constrained/2020/maxar_v1/KOS/{WorldPop_filename}", ] else: WorldPop_filename = f"{two_2_three_digits_country(country_code).lower()}_ppp_{year}_UNadj_constrained.tif" @@ -901,7 +901,7 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping): ) dict_id[0] = 0 counter = 1 - # Loop over ip mapping and add indicies to the dictionary + # Loop over ip mapping and add indices to the dictionary for ID_index in np.array(id_mapping.index): dict_id[ID_index] = counter counter += 1 From 9ddb40dfedd465729f5cb321000157f67e479d9c Mon Sep 17 00:00:00 2001 From: Davide Fioriti Date: Sat, 23 Dec 2023 17:36:06 +0100 Subject: [PATCH 29/29] Fix max snakemake version --- envs/environment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/environment.yaml b/envs/environment.yaml index 491e29052..b65376cb7 100644 --- a/envs/environment.yaml +++ b/envs/environment.yaml @@ -23,7 +23,7 @@ dependencies: - xlrd - openpyxl - seaborn -- snakemake-minimal +- snakemake-minimal<8 - memory_profiler - ruamel.yaml<=0.17.26 - pytables