From e18272d8cfb45e6f66a9e79d532d31824d83cfb6 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Tue, 21 Mar 2023 23:36:32 +0100
Subject: [PATCH 01/29] Adding methods for efficient processing of raster

---
 scripts/build_shapes.py | 228 ++++++++++++++++++++++++++++++----------
 1 file changed, 173 insertions(+), 55 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index ee7d07733..cfd857ddf 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -281,7 +281,7 @@ def load_EEZ(countries_codes, geo_crs, EEZ_gpkg="./data/eez/eez_v11.gpkg"):
             f"File EEZ {EEZ_gpkg} not found, please download it from https://www.marineregions.org/download_file.php?name=World_EEZ_v11_20191118_gpkg.zip and copy it in {os.path.dirname(EEZ_gpkg)}"
         )
 
-    geodf_EEZ = gpd.read_file(EEZ_gpkg).to_crs(geo_crs)
+    geodf_EEZ = gpd.read_file(EEZ_gpkg, engine="pyogrio").to_crs(geo_crs)
     geodf_EEZ.dropna(axis=0, how="any", subset=["ISO_TER1"], inplace=True)
     # [["ISO_TER1", "TERRITORY1", "ISO_SOV1", "ISO_SOV2", "ISO_SOV3", "geometry"]]
     geodf_EEZ = geodf_EEZ[["ISO_TER1", "geometry"]]
@@ -652,7 +652,7 @@ def add_gdp_data(
         logger.info("Stage 4/4: Add gdp data to GADM GeoDataFrame")
 
     # initialize new gdp column
-    df_gadm["gdp"] = 0.0
+    # df_gadm["gdp"] = 0.0
 
     GDP_tif, name_tif = load_GDP(year, update, out_logging, name_file_nc)
 
@@ -705,6 +705,103 @@ def _process_func_download_pop(c_code):
     )
 
 
+def get_worldpop_features(WorldPop_inputfile):
+    """
+    Function
+
+
+
+    """
+    worldpop_features = []
+    # Open the file using rasterio
+    with rasterio.open(WorldPop_inputfile) as src:
+        # Add affine transform
+        worldpop_features.append(src.meta["transform"])
+        # Add dimensions of image
+        worldpop_features.append(src.shape[0])
+        worldpop_features.append(src.shape[1])
+
+        # --- Process the pixels in the image for population data ---
+        # Read the gray layer (1) to get an np.array of this band
+        np_pop_raster = src.read(1)
+
+        # Set np_pop_valid to all pixels in np_pop_raster that aren't 'nodata'
+        np_pop_valid = np_pop_raster[np_pop_raster != src.nodata]
+
+        # Set 'nodata' values to 0
+        np_pop_raster[np_pop_raster == src.nodata] = 0
+
+        # Set np_pop_xy to pixel locations of non zero values (same order as np_pop_valid)
+        np_pop_xy = np_pop_raster.nonzero()
+
+        # Combine arrays to get [ value, x, y ] array
+        np_pop = np.array([np_pop_valid, np_pop_xy[0], np_pop_xy[1]]).T
+
+    return worldpop_features, np_pop
+
+
+def compute_geomask_country(country_rows, worldpop_features):
+    """
+    Function
+
+
+
+    """
+    affine_transform, y_axis_len, x_axis_len = worldpop_features
+
+    # Set an empty numpy array with the dimensions of the country .tif file
+    # np_map_ID will contain a ID for each location (undefined is 0)
+    # ID corresponds to a specific geometry in country_rows
+    np_map_ID = np.zeros((y_axis_len, x_axis_len))
+
+    # List to contain the mappings of id to GADM_ID
+    id_to_GADM_ID = []
+    # Loop the country_rows geoDataFrame
+    for i in range(len(country_rows)):
+        # Set the current geometry
+        cur_geometry = country_rows["geometry"][i]
+        # Generate a mask for the specific geometry
+        temp_mask = rasterio.features.geometry_mask(
+            [cur_geometry], (y_axis_len, x_axis_len), affine_transform, invert=True
+        )
+
+        # Map the values of counter value to np_map_ID
+        np_map_ID[temp_mask] = i + 1
+
+        # Store the id -> GADM_ID mapping
+        id_to_GADM_ID.append([i + 1, country_rows["GADM_ID"][i]])
+
+    return np_map_ID.astype(int), pd.DataFrame(id_to_GADM_ID).set_index(0)
+
+
+def compute_population(np_pop, country_geomask, id_mapping):
+    """
+    Function
+
+
+
+    """
+    # Declare an array to contain population counts
+    np_pop_count = np.zeros(len(id_mapping) + 1)
+
+    # Loop the population data
+    for i in range(len(np_pop)):
+        cur_value, cur_x, cur_y = np_pop[i]
+
+        # Set the current id to the id at the same coordinate of the geomask
+        cur_id = country_geomask[int(cur_x)][int(cur_y)]
+
+        # Add the current value to the population
+        np_pop_count[cur_id] += cur_value
+
+    df_pop_count = pd.DataFrame(np_pop_count)
+    df_pop_count = pd.DataFrame(df_pop_count).join(id_mapping)
+
+    df_pop_count.columns = ["pop", "GADM_ID"]
+
+    return df_pop_count.drop(0)
+
+
 def add_population_data(
     df_gadm,
     country_codes,
@@ -742,62 +839,83 @@ def add_population_data(
         ascii=False,
         desc="Compute population ",
     )
-    if (nprocesses is None) or (nprocesses == 1):
-        with tqdm(total=df_gadm.shape[0], **tqdm_kwargs) as pbar:
-            for c_code in country_codes:
-                # get subset by country code
-                country_rows = df_gadm.loc[df_gadm["country"] == c_code]
-
-                # get worldpop image
-                WorldPop_inputfile, WorldPop_filename = download_WorldPop(
-                    c_code, worldpop_method, year, update, out_logging
-                )
-
-                with rasterio.open(WorldPop_inputfile) as src:
-                    for i, row in country_rows.iterrows():
-                        df_gadm.loc[i, "pop"] = _sum_raster_over_mask(row.geometry, src)
-                        pbar.update(1)
+    with tqdm(total=len(country_codes), **tqdm_kwargs) as pbar:
+        for c_code in country_codes:
+            # get subset by country code
+            country_rows = df_gadm.loc[df_gadm["country"] == c_code]
+
+            # get worldpop image
+            WorldPop_inputfile, WorldPop_filename = download_WorldPop(
+                c_code, worldpop_method, year, update, out_logging
+            )
 
-    else:
-        # generator function to split a list ll into n_objs lists
-        def divide_list(ll, n_objs):
-            for i in range(0, len(ll), n_objs):
-                yield ll[i : i + n_objs]
+            worldpop_features, np_pop = get_worldpop_features(WorldPop_inputfile)
 
-        id_parallel_chunks = list(
-            divide_list(df_gadm.index, ceil(len(df_gadm) / nchunks))
-        )
+            # get the geomask with id mappings
+            country_geomask, id_mapping = compute_geomask_country(
+                country_rows, worldpop_features
+            )
 
-        kwargs = {
-            "initializer": _init_process_pop,
-            "initargs": (df_gadm, year, worldpop_method),
-            "processes": nprocesses,
-        }
-        with mp.get_context("spawn").Pool(**kwargs) as pool:
-            if disable_progressbar:
-                list(pool.map(_process_func_download_pop, country_codes))
-                _ = list(pool.map(_process_func_pop, id_parallel_chunks))
-                for elem in _:
-                    df_gadm.loc[elem.index, "pop"] = elem["pop"]
-            else:
-                list(
-                    tqdm(
-                        pool.imap(_process_func_download_pop, country_codes),
-                        total=len(country_codes),
-                        ascii=False,
-                        desc="Download WorldPop ",
-                        unit=" countries",
-                    )
-                )
-                _ = list(
-                    tqdm(
-                        pool.imap(_process_func_pop, id_parallel_chunks),
-                        total=len(id_parallel_chunks),
-                        **tqdm_kwargs,
-                    )
-                )
-                for elem in _:
-                    df_gadm.loc[elem.index, "pop"] = elem["pop"]
+            df_pop_count = compute_population(np_pop, country_geomask, id_mapping)
+
+            pbar.update(1)
+
+    # if (nprocesses is None) or (nprocesses == 1):
+    #     with tqdm(total=df_gadm.shape[0], **tqdm_kwargs) as pbar:
+    #         for c_code in country_codes:
+    #             # get subset by country code
+    #             country_rows = df_gadm.loc[df_gadm["country"] == c_code]
+
+    #             # get worldpop image
+    #             WorldPop_inputfile, WorldPop_filename = download_WorldPop(
+    #                 c_code, worldpop_method, year, update, out_logging
+    #             )
+
+    #             with rasterio.open(WorldPop_inputfile) as src:
+    #                 for i, row in country_rows.iterrows():
+    #                     df_gadm.loc[i, "pop"] = _sum_raster_over_mask(row.geometry, src)
+    #                     pbar.update(1)
+
+    # else:
+    #     # generator function to split a list ll into n_objs lists
+    #     def divide_list(ll, n_objs):
+    #         for i in range(0, len(ll), n_objs):
+    #             yield ll[i : i + n_objs]
+
+    #     id_parallel_chunks = list(
+    #         divide_list(df_gadm.index, ceil(len(df_gadm) / nchunks))
+    #     )
+
+    #     kwargs = {
+    #         "initializer": _init_process_pop,
+    #         "initargs": (df_gadm, year, worldpop_method),
+    #         "processes": nprocesses,
+    #     }
+    #     with mp.get_context("spawn").Pool(**kwargs) as pool:
+    #         if disable_progressbar:
+    #             list(pool.map(_process_func_download_pop, country_codes))
+    #             _ = list(pool.map(_process_func_pop, id_parallel_chunks))
+    #             for elem in _:
+    #                 df_gadm.loc[elem.index, "pop"] = elem["pop"]
+    #         else:
+    #             list(
+    #                 tqdm(
+    #                     pool.imap(_process_func_download_pop, country_codes),
+    #                     total=len(country_codes),
+    #                     ascii=False,
+    #                     desc="Download WorldPop ",
+    #                     unit=" countries",
+    #                 )
+    #             )
+    #             _ = list(
+    #                 tqdm(
+    #                     pool.imap(_process_func_pop, id_parallel_chunks),
+    #                     total=len(id_parallel_chunks),
+    #                     **tqdm_kwargs,
+    #                 )
+    #             )
+    #             for elem in _:
+    #                 df_gadm.loc[elem.index, "pop"] = elem["pop"]
 
 
 def gadm(

From 802fe65577bcc295a595622bb9b6f3ae8f934d52 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Wed, 22 Mar 2023 12:20:46 +0100
Subject: [PATCH 02/29] Adjust dataframe to iloc and write to df_gadm

---
 scripts/build_shapes.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index cfd857ddf..dd440bbbe 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -756,10 +756,11 @@ def compute_geomask_country(country_rows, worldpop_features):
 
     # List to contain the mappings of id to GADM_ID
     id_to_GADM_ID = []
+
     # Loop the country_rows geoDataFrame
     for i in range(len(country_rows)):
         # Set the current geometry
-        cur_geometry = country_rows["geometry"][i]
+        cur_geometry = country_rows.iloc[i]["geometry"]
         # Generate a mask for the specific geometry
         temp_mask = rasterio.features.geometry_mask(
             [cur_geometry], (y_axis_len, x_axis_len), affine_transform, invert=True
@@ -769,7 +770,7 @@ def compute_geomask_country(country_rows, worldpop_features):
         np_map_ID[temp_mask] = i + 1
 
         # Store the id -> GADM_ID mapping
-        id_to_GADM_ID.append([i + 1, country_rows["GADM_ID"][i]])
+        id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]])
 
     return np_map_ID.astype(int), pd.DataFrame(id_to_GADM_ID).set_index(0)
 
@@ -799,7 +800,7 @@ def compute_population(np_pop, country_geomask, id_mapping):
 
     df_pop_count.columns = ["pop", "GADM_ID"]
 
-    return df_pop_count.drop(0)
+    return df_pop_count
 
 
 def add_population_data(
@@ -856,8 +857,14 @@ def add_population_data(
                 country_rows, worldpop_features
             )
 
+            # Calculate the population for each region
             df_pop_count = compute_population(np_pop, country_geomask, id_mapping)
 
+            # Loop the regions and write population to df_gadm
+            for i in range(len(df_pop_count)):
+                pop_count, gadm_id = df_pop_count.iloc[i]
+                df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count
+
             pbar.update(1)
 
     # if (nprocesses is None) or (nprocesses == 1):

From 8b92c59991f19c7187f0537bf26753e631d287a3 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Wed, 22 Mar 2023 15:58:35 +0100
Subject: [PATCH 03/29] Memory reduction output compute_geomask_country

---
 scripts/build_shapes.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index dd440bbbe..c7875f0cc 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -772,7 +772,10 @@ def compute_geomask_country(country_rows, worldpop_features):
         # Store the id -> GADM_ID mapping
         id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]])
 
-    return np_map_ID.astype(int), pd.DataFrame(id_to_GADM_ID).set_index(0)
+    # Return np_map_ID as type 'H' np.ushort
+    # 'H' -> https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort
+    # This lowers memory usage, note: ID has to be within the range [0,65535]
+    return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0)
 
 
 def compute_population(np_pop, country_geomask, id_mapping):

From fb9359e5026f107064b04ac31105819e42f15f60 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Wed, 22 Mar 2023 16:00:21 +0100
Subject: [PATCH 04/29] df_gadm operation adjustment

---
 scripts/build_shapes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index c7875f0cc..61bebd97a 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -652,7 +652,7 @@ def add_gdp_data(
         logger.info("Stage 4/4: Add gdp data to GADM GeoDataFrame")
 
     # initialize new gdp column
-    # df_gadm["gdp"] = 0.0
+    df_gadm["gdp"] = 0.0
 
     GDP_tif, name_tif = load_GDP(year, update, out_logging, name_file_nc)
 

From e8c9b3b0098ac62202b501bbd53a110a0bea5570 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Fri, 24 Mar 2023 00:41:01 +0100
Subject: [PATCH 05/29] Restructure methods to implement scaling

---
 scripts/build_shapes.py | 305 +++++++++++++++++++++++++++-------------
 1 file changed, 210 insertions(+), 95 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 61bebd97a..a4b693522 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -28,6 +28,7 @@
     two_digits_2_name_country,
 )
 from rasterio.mask import mask
+from rasterio.windows import Window
 from shapely.geometry import LineString, MultiPolygon, Point, Polygon
 from shapely.geometry.base import BaseGeometry
 from shapely.ops import unary_union
@@ -39,6 +40,26 @@
 
 sets_path_to_root("pypsa-earth")
 
+# Imports for profiling [temporary]
+import cProfile
+import os
+
+import psutil
+
+
+# Function for profiling functions [temporary]
+def profile(func):
+    """Decorator for run function profile"""
+
+    def wrapper(*args, **kwargs):
+        profile_filename = func.__name__ + ".prof"
+        profiler = cProfile.Profile()
+        result = profiler.runcall(func, *args, **kwargs)
+        profiler.dump_stats(profile_filename)
+        return result
+
+    return wrapper
+
 
 def download_GADM(country_code, update=False, out_logging=False):
     """
@@ -71,7 +92,7 @@ def download_GADM(country_code, update=False, out_logging=False):
     if not os.path.exists(GADM_inputfile_gpkg) or update is True:
         if out_logging:
             logger.warning(
-                f"Stage 4/4: {GADM_filename} of country {two_digits_2_name_country(country_code)} does not exist, downloading to {GADM_inputfile_gpkg}"
+                f"Stage 4 of 4: {GADM_filename} of country {two_digits_2_name_country(country_code)} does not exist, downloading to {GADM_inputfile_gpkg}"
             )
         #  create data/osm directory
         os.makedirs(os.path.dirname(GADM_inputfile_gpkg), exist_ok=True)
@@ -422,7 +443,7 @@ def download_WorldPop_standard(
         Name of the file
     """
     if out_logging:
-        logger.info("Stage 3/4: Download WorldPop datasets")
+        logger.info("Stage 3 of 4: Download WorldPop datasets (standard)")
 
     if country_code == "XK":
         WorldPop_filename = f"srb_ppp_{year}_UNadj_constrained.tif"
@@ -445,7 +466,7 @@ def download_WorldPop_standard(
     if not os.path.exists(WorldPop_inputfile) or update is True:
         if out_logging:
             logger.warning(
-                f"Stage 4/4: {WorldPop_filename} does not exist, downloading to {WorldPop_inputfile}"
+                f"Stage 3 of 4: {WorldPop_filename} does not exist, downloading to {WorldPop_inputfile}"
             )
         #  create data/osm directory
         os.makedirs(os.path.dirname(WorldPop_inputfile), exist_ok=True)
@@ -459,7 +480,7 @@ def download_WorldPop_standard(
                         loaded = True
                         break
         if not loaded:
-            logger.error(f"Stage 4/4: Impossible to download {WorldPop_filename}")
+            logger.error(f"Stage 3 of 4: Impossible to download {WorldPop_filename}")
 
     return WorldPop_inputfile, WorldPop_filename
 
@@ -489,7 +510,7 @@ def download_WorldPop_API(
         Name of the file
     """
     if out_logging:
-        logger.info("Stage 3/4: Download WorldPop datasets")
+        logger.info("Stage 3 of 4: Download WorldPop datasets (API)")
 
     WorldPop_filename = f"{two_2_three_digits_country(country_code).lower()}_ppp_{year}_UNadj_constrained.tif"
     # Request to get the file
@@ -513,7 +534,7 @@ def download_WorldPop_API(
                     loaded = True
                     break
     if not loaded:
-        logger.error(f"Stage 4/4: Impossible to download {WorldPop_filename}")
+        logger.error(f"Stage 3 of 4: Impossible to download {WorldPop_filename}")
 
     return WorldPop_inputfile, WorldPop_filename
 
@@ -525,7 +546,7 @@ def convert_GDP(name_file_nc, year=2015, out_logging=False):
     """
 
     if out_logging:
-        logger.info("Stage 4/4: Access to GDP raster data")
+        logger.info("Stage 4 of 4: Access to GDP raster data")
 
     # tif namefile
     name_file_tif = name_file_nc[:-2] + "tif"
@@ -552,7 +573,7 @@ def convert_GDP(name_file_nc, year=2015, out_logging=False):
     if year not in list_years:
         if out_logging:
             logger.warning(
-                f"Stage 3/4 GDP data of year {year} not found, selected the most recent data ({int(list_years[-1])})"
+                f"Stage 4 of 4 GDP data of year {year} not found, selected the most recent data ({int(list_years[-1])})"
             )
         year = float(list_years[-1])
 
@@ -576,7 +597,7 @@ def load_GDP(
     """
 
     if out_logging:
-        logger.info("Stage 4/4: Access to GDP raster data")
+        logger.info("Stage 4 of 4: Access to GDP raster data")
 
     # path of the nc file
     name_file_tif = name_file_nc[:-2] + "tif"
@@ -587,7 +608,7 @@ def load_GDP(
     if update | (not os.path.exists(GDP_tif)):
         if out_logging:
             logger.warning(
-                f"Stage 4/4: File {name_file_tif} not found, the file will be produced by processing {name_file_nc}"
+                f"Stage 4 of 4: File {name_file_tif} not found, the file will be produced by processing {name_file_nc}"
             )
         convert_GDP(name_file_nc, year, out_logging)
 
@@ -649,7 +670,7 @@ def add_gdp_data(
         - Includes a new column ["gdp"]
     """
     if out_logging:
-        logger.info("Stage 4/4: Add gdp data to GADM GeoDataFrame")
+        logger.info("Stage 4 of 4: Add gdp data to GADM GeoDataFrame")
 
     # initialize new gdp column
     df_gadm["gdp"] = 0.0
@@ -706,24 +727,40 @@ def _process_func_download_pop(c_code):
 
 
 def get_worldpop_features(WorldPop_inputfile):
+    """
+    Function to extract data from .tif input file
+    -------
+    Inputs:
+        WorldPop_inputfile: String pointing to location of file
+    --------
+    Outputs:
+        src.meta["transform"]: Representation of the affine transform
+        src.shape: Dimensions of the input image
+    """
+    # Open the file using rasterio
+    with rasterio.open(WorldPop_inputfile) as src:
+        return src.meta["transform"], src.shape
+
+
+def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions):
     """
     Function
 
 
 
     """
-    worldpop_features = []
+    col_off, row_off, width, height = window_dimensions
+
+    current_window = Window(col_off, row_off, width, height)
+
     # Open the file using rasterio
     with rasterio.open(WorldPop_inputfile) as src:
-        # Add affine transform
-        worldpop_features.append(src.meta["transform"])
-        # Add dimensions of image
-        worldpop_features.append(src.shape[0])
-        worldpop_features.append(src.shape[1])
-
         # --- Process the pixels in the image for population data ---
         # Read the gray layer (1) to get an np.array of this band
-        np_pop_raster = src.read(1)
+        # Rasterio doesn't support lower than float32 readout
+        # Hence np_pop_raster will have nbytes = 4 * shape[0] * shape[1]
+
+        np_pop_raster = src.read(1, window=current_window)
 
         # Set np_pop_valid to all pixels in np_pop_raster that aren't 'nodata'
         np_pop_valid = np_pop_raster[np_pop_raster != src.nodata]
@@ -734,20 +771,22 @@ def get_worldpop_features(WorldPop_inputfile):
         # Set np_pop_xy to pixel locations of non zero values (same order as np_pop_valid)
         np_pop_xy = np_pop_raster.nonzero()
 
-        # Combine arrays to get [ value, x, y ] array
-        np_pop = np.array([np_pop_valid, np_pop_xy[0], np_pop_xy[1]]).T
+        # Transform to get [ x, y ] array
+        # np_pop_xy as 'I' (uintc), see
+        # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.uintc
+        np_pop_xy = np.array([np_pop_xy[0], np_pop_xy[1]]).T.astype("I")
 
-    return worldpop_features, np_pop
+    return np_pop_valid, np_pop_xy
 
 
-def compute_geomask_country(country_rows, worldpop_features):
+def compute_geomask_region(country_rows, affine_transform, dimensions):
     """
     Function
 
 
 
     """
-    affine_transform, y_axis_len, x_axis_len = worldpop_features
+    y_axis_len, x_axis_len = dimensions
 
     # Set an empty numpy array with the dimensions of the country .tif file
     # np_map_ID will contain a ID for each location (undefined is 0)
@@ -778,7 +817,140 @@ def compute_geomask_country(country_rows, worldpop_features):
     return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0)
 
 
-def compute_population(np_pop, country_geomask, id_mapping):
+def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
+    """
+    Function computes the population for the given country rows
+
+    Inputs:
+        -------
+        country_rows:
+
+        WorldPop_inputfile:
+
+
+    Outputs:
+        --------
+        df_pop_count: Dataframe with columns
+            - "pop" containing population of GADM_ID region
+            - "GADM_ID"
+
+
+    """
+    # Get the features of the worldpop input file
+    transform, worldpop_dim = get_worldpop_features(WorldPop_inputfile)
+
+    worldpop_y_dim, worldpop_x_dim = worldpop_dim
+
+    # Rasterio doesn't support lower than float32 readout
+    # Hence reading the file will take up: nbytes = 4 * y_dim * x_dim
+    expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim
+
+    # Introduce a max byte size to avoid overfilling RAM
+    worldpop_byte_limit = 100 * 10**6
+
+    # If the rasterio read will be within byte limit
+    if expected_bytes_input_read < worldpop_byte_limit:
+        # Call functions with input dimensions for window
+        window_dim = [0, 0, worldpop_x_dim, worldpop_y_dim]
+
+        # Get population values and corresponding x,y coords
+        np_pop_val, np_pop_xy = get_worldpop_val_xy(WorldPop_inputfile, window_dim)
+
+        # get the geomask with id mappings
+        country_geomask, id_mapping = compute_geomask_region(
+            country_rows, transform, [worldpop_y_dim, worldpop_x_dim]
+        )
+
+        # Calculate the population for each region
+        df_pop_count = old_compute_population(
+            np_pop_val, np_pop_xy, country_geomask, id_mapping
+        )
+
+    else:
+        if out_logging:
+            logger.info(
+                "Stage 3 of 4: compute_population for "
+                + str(country_rows["country"][0])
+                + ": Expected size of file readout was "
+                + str(expected_bytes_input_read // 10**6)
+                + " Megabytes. As the limit is "
+                + str(worldpop_byte_limit // 10**6)
+                + " Megabytes switching to windowed approach"
+            )
+        # Calculate the population using windows
+        df_pop_count = windowed_compute_population(
+            country_rows, WorldPop_inputfile, worldpop_byte_limit
+        )
+
+    return df_pop_count
+
+
+def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_limit):
+    """
+    Function
+
+
+
+    """
+    # Open the file using rasterio
+    with rasterio.open(WorldPop_inputfile) as src:
+        transform = src.meta["transform"]
+        worldpop_y_dim, worldpop_x_dim = src.shape
+        block_y_dim, block_x_dim = src.block_shapes[0]
+
+    # Calculate the bytes for reading one block from the image (float32)
+    read_block_size = 4 * block_y_dim * block_x_dim
+
+    # Calculate the amount of blocks that fit into the memory budget
+    window_block_count = worldpop_byte_limit // read_block_size
+
+    # The input files have blocks that vary the x dimension and keep y dimension to 512
+    # Hence set the dimension of the windows to the same x dimension (width)
+    window_x_dim = block_x_dim
+    window_col_off = 0
+
+    # Multiply the y_dimension by the amount of blocks in the window
+    # window_y_dim will be height of the window
+    window_y_dim = window_block_count * block_y_dim
+
+    # Calculate the y ranges of the blocks to scan the image
+    # y_range_start will serve as row offset
+    y_range_start = np.arange(0, worldpop_y_dim, window_y_dim)
+
+    print(y_range_start)
+
+    for row_off in y_range_start:
+        window_dimensions = [window_col_off, row_off, window_x_dim, window_y_dim]
+
+        print("Running window: ", window_dimensions)
+
+        np_pop_val, np_pop_xy = get_worldpop_val_xy(
+            WorldPop_inputfile, window_dimensions
+        )
+
+        # get the geomask with id mappings
+        region_geomask, id_mapping = compute_geomask_region(
+            country_rows, transform, [window_y_dim, window_x_dim]
+        )
+
+        print("geo_mask size is (MB) ", region_geomask.nbytes / 10**6)
+        print(
+            "Current RAM usage: ",
+            psutil.Process(os.getpid()).memory_info().rss / 1024**2,
+        )
+
+        print("Running: old_compute_population")
+        # Calculate the population for each region
+        df_pop_count = old_compute_population(
+            np_pop_val, np_pop_xy, region_geomask, id_mapping
+        )
+
+        print(df_pop_count)
+
+    return df_pop_count
+
+
+def old_compute_population(np_pop_val, np_pop_xy, region_geomask, id_mapping):
     """
     Function
 
@@ -789,11 +961,12 @@ def compute_population(np_pop, country_geomask, id_mapping):
     np_pop_count = np.zeros(len(id_mapping) + 1)
 
     # Loop the population data
-    for i in range(len(np_pop)):
-        cur_value, cur_x, cur_y = np_pop[i]
+    for i in range(len(np_pop_val)):
+        cur_value = np_pop_val[i]
+        cur_x, cur_y = np_pop_xy[i]
 
         # Set the current id to the id at the same coordinate of the geomask
-        cur_id = country_geomask[int(cur_x)][int(cur_y)]
+        cur_id = region_geomask[int(cur_x)][int(cur_y)]
 
         # Add the current value to the population
         np_pop_count[cur_id] += cur_value
@@ -834,7 +1007,7 @@ def add_population_data(
     """
 
     if out_logging:
-        logger.info("Stage 4/4 POP: Add population data to GADM GeoDataFrame")
+        logger.info("Stage 3 of 4: Add population data to GADM GeoDataFrame")
 
     # initialize new population column
     df_gadm["pop"] = 0.0
@@ -848,85 +1021,27 @@ def add_population_data(
             # get subset by country code
             country_rows = df_gadm.loc[df_gadm["country"] == c_code]
 
-            # get worldpop image
+            # Download worldpop image (if required) and get file location
             WorldPop_inputfile, WorldPop_filename = download_WorldPop(
                 c_code, worldpop_method, year, update, out_logging
             )
 
-            worldpop_features, np_pop = get_worldpop_features(WorldPop_inputfile)
+            if out_logging:
+                logger.info("Stage 3 of 4: Calculating population of " + str(c_code))
 
-            # get the geomask with id mappings
-            country_geomask, id_mapping = compute_geomask_country(
-                country_rows, worldpop_features
+            # Calculate the population for each geometry given in country_rows
+            df_pop_count = compute_population(
+                country_rows, WorldPop_inputfile, out_logging
             )
 
-            # Calculate the population for each region
-            df_pop_count = compute_population(np_pop, country_geomask, id_mapping)
-
             # Loop the regions and write population to df_gadm
             for i in range(len(df_pop_count)):
                 pop_count, gadm_id = df_pop_count.iloc[i]
+                # Select the row with the same "GADM_ID" and set the population count
                 df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count
 
             pbar.update(1)
 
-    # if (nprocesses is None) or (nprocesses == 1):
-    #     with tqdm(total=df_gadm.shape[0], **tqdm_kwargs) as pbar:
-    #         for c_code in country_codes:
-    #             # get subset by country code
-    #             country_rows = df_gadm.loc[df_gadm["country"] == c_code]
-
-    #             # get worldpop image
-    #             WorldPop_inputfile, WorldPop_filename = download_WorldPop(
-    #                 c_code, worldpop_method, year, update, out_logging
-    #             )
-
-    #             with rasterio.open(WorldPop_inputfile) as src:
-    #                 for i, row in country_rows.iterrows():
-    #                     df_gadm.loc[i, "pop"] = _sum_raster_over_mask(row.geometry, src)
-    #                     pbar.update(1)
-
-    # else:
-    #     # generator function to split a list ll into n_objs lists
-    #     def divide_list(ll, n_objs):
-    #         for i in range(0, len(ll), n_objs):
-    #             yield ll[i : i + n_objs]
-
-    #     id_parallel_chunks = list(
-    #         divide_list(df_gadm.index, ceil(len(df_gadm) / nchunks))
-    #     )
-
-    #     kwargs = {
-    #         "initializer": _init_process_pop,
-    #         "initargs": (df_gadm, year, worldpop_method),
-    #         "processes": nprocesses,
-    #     }
-    #     with mp.get_context("spawn").Pool(**kwargs) as pool:
-    #         if disable_progressbar:
-    #             list(pool.map(_process_func_download_pop, country_codes))
-    #             _ = list(pool.map(_process_func_pop, id_parallel_chunks))
-    #             for elem in _:
-    #                 df_gadm.loc[elem.index, "pop"] = elem["pop"]
-    #         else:
-    #             list(
-    #                 tqdm(
-    #                     pool.imap(_process_func_download_pop, country_codes),
-    #                     total=len(country_codes),
-    #                     ascii=False,
-    #                     desc="Download WorldPop ",
-    #                     unit=" countries",
-    #                 )
-    #             )
-    #             _ = list(
-    #                 tqdm(
-    #                     pool.imap(_process_func_pop, id_parallel_chunks),
-    #                     total=len(id_parallel_chunks),
-    #                     **tqdm_kwargs,
-    #                 )
-    #             )
-    #             for elem in _:
-    #                 df_gadm.loc[elem.index, "pop"] = elem["pop"]
-
 
 def gadm(
     worldpop_method,
@@ -942,7 +1057,7 @@ def gadm(
     nchunks=None,
 ):
     if out_logging:
-        logger.info("Stage 4/4: Creation GADM GeoDataFrame")
+        logger.info("Stage 3 of 4: Creation GADM GeoDataFrame")
 
     # download data if needed and get the desired layer_id
     df_gadm = get_GADM_layer(countries, layer_id, geo_crs, contended_flag, update)

From 0b85814a62f9501cf4f4fecd4c12868c86998460 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Sun, 26 Mar 2023 21:54:09 +0200
Subject: [PATCH 06/29] Windowed mode operational

---
 scripts/build_shapes.py | 96 ++++++++++++++++++++++++++++++-----------
 1 file changed, 72 insertions(+), 24 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index a4b693522..fd5aeb2d5 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -779,14 +779,40 @@ def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions):
     return np_pop_valid, np_pop_xy
 
 
-def compute_geomask_region(country_rows, affine_transform, dimensions):
+def compute_geomask_region(
+    country_rows, affine_transform, window_dimensions, windowed=False
+):
     """
     Function
 
 
 
     """
-    y_axis_len, x_axis_len = dimensions
+    col_off, row_off, x_axis_len, y_axis_len = window_dimensions
+
+    if windowed:
+        # Declare a transformer with given affine_transform
+        transformer = rasterio.transform.AffineTransformer(affine_transform)
+
+        # Obtain the coordinates of the upper left corner of window
+        window_topleft_longitude, window_topleft_latitude = transformer.xy(
+            row_off, col_off
+        )
+
+        # Obtain the coordinates of the bottom right corner of window
+        window_botright_longitude, window_botright_latitude = transformer.xy(
+            row_off + y_axis_len, col_off + x_axis_len
+        )
+
+        # Set the current transform to the correct lat and long
+        affine_transform = rasterio.Affine(
+            affine_transform[0],
+            affine_transform[1],
+            window_topleft_longitude,
+            affine_transform[3],
+            affine_transform[4],
+            window_topleft_latitude,
+        )
 
     # Set an empty numpy array with the dimensions of the country .tif file
     # np_map_ID will contain a ID for each location (undefined is 0)
@@ -800,9 +826,27 @@ def compute_geomask_region(country_rows, affine_transform, dimensions):
     for i in range(len(country_rows)):
         # Set the current geometry
         cur_geometry = country_rows.iloc[i]["geometry"]
+
+        # In windowed mode we check if bounds of geometry overlap the window
+        if windowed:
+            latitude_min = cur_geometry.bounds[1]
+            latitude_max = cur_geometry.bounds[3]
+
+            # In the following cases we don't have to continue the loop
+            # If the geometry is above the window
+            if latitude_min > window_topleft_latitude:
+                continue
+            # If the geometry is below the window
+            if latitude_max < window_botright_latitude:
+                continue
+
         # Generate a mask for the specific geometry
         temp_mask = rasterio.features.geometry_mask(
-            [cur_geometry], (y_axis_len, x_axis_len), affine_transform, invert=True
+            [cur_geometry],
+            (y_axis_len, x_axis_len),
+            affine_transform,
+            invert=True,
+            all_touched=True,
         )
 
         # Map the values of counter value to np_map_ID
@@ -846,7 +890,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
     expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim
 
     # Introduce a max byte size to avoid overfilling RAM
-    worldpop_byte_limit = 100 * 10**6
+    worldpop_byte_limit = 3096 * 10**6
 
     # If the rasterio read will be within byte limit
     if expected_bytes_input_read < worldpop_byte_limit:
@@ -858,11 +902,11 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
 
         # get the geomask with id mappings
         country_geomask, id_mapping = compute_geomask_region(
-            country_rows, transform, [worldpop_y_dim, worldpop_x_dim]
+            country_rows, transform, window_dim
         )
 
         # Calculate the population for each region
-        df_pop_count = old_compute_population(
+        df_pop_count = sum_values_using_geomask(
             np_pop_val, np_pop_xy, country_geomask, id_mapping
         )
 
@@ -892,6 +936,9 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
 
 
     """
+    # Create a dataframe to store the population data
+    df_pop_count = country_rows.loc[:, ["GADM_ID", "pop"]].copy()
+
     # Open the file using rasterio
     with rasterio.open(WorldPop_inputfile) as src:
         transform = src.meta["transform"]
@@ -917,8 +964,6 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
     # y_range_start will serve as row offset
     y_range_start = np.arange(0, worldpop_y_dim, window_y_dim)
 
-    print(y_range_start)
-
     for row_off in y_range_start:
         window_dimensions = [window_col_off, row_off, window_x_dim, window_y_dim]
 
@@ -930,33 +975,38 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
 
         # get the geomask with id mappings
         region_geomask, id_mapping = compute_geomask_region(
-            country_rows, transform, [window_y_dim, window_x_dim]
+            country_rows, transform, window_dimensions, windowed=True
         )
 
-        print("geo_mask size is (MB) ", region_geomask.nbytes / 10**6)
-        print(
-            "Current RAM usage: ",
-            psutil.Process(os.getpid()).memory_info().rss / 1024**2,
-        )
-
-        print("Running: old_compute_population")
         # Calculate the population for each region
-        df_pop_count = old_compute_population(
+        windowed_pop_count = sum_values_using_geomask(
             np_pop_val, np_pop_xy, region_geomask, id_mapping
         )
 
-        print(df_pop_count)
+        # Loop the regions and write population to df_pop_count
+        for i in range(len(windowed_pop_count)):
+            pop_count, gadm_id = windowed_pop_count.iloc[i]
+            # Select the row with the same "GADM_ID" and set the population count
+            df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] = pop_count
 
     return df_pop_count
 
 
-def old_compute_population(np_pop_val, np_pop_xy, region_geomask, id_mapping):
+def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
     """
     Function
 
 
 
     """
+    # Initialize a dictionary
+    dict_id = {0: 0}
+    counter = 1
+    # Loop over ip mapping and add indicies to the dictionary
+    for ID_index in np.array(id_mapping.index):
+        dict_id[ID_index] = counter
+        counter += 1
+
     # Declare an array to contain population counts
     np_pop_count = np.zeros(len(id_mapping) + 1)
 
@@ -969,12 +1019,10 @@ def old_compute_population(np_pop_val, np_pop_xy, region_geomask, id_mapping):
         cur_id = region_geomask[int(cur_x)][int(cur_y)]
 
         # Add the current value to the population
-        np_pop_count[cur_id] += cur_value
-
-    df_pop_count = pd.DataFrame(np_pop_count)
-    df_pop_count = pd.DataFrame(df_pop_count).join(id_mapping)
+        np_pop_count[dict_id[cur_id]] += cur_value
 
-    df_pop_count.columns = ["pop", "GADM_ID"]
+    df_pop_count = pd.DataFrame(np_pop_count, columns=["pop"])
+    df_pop_count["GADM_ID"] = np.append(np.array("NaN"), id_mapping.values)
 
     return df_pop_count
 

From 2c9833810d0fc8ad5c39ef665acc4ce658323252 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Wed, 29 Mar 2023 23:40:44 +0200
Subject: [PATCH 07/29] Refine due to bugs with 'US', now fixed

---
 scripts/build_shapes.py | 49 +++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index fd5aeb2d5..d9c58dc05 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -756,19 +756,16 @@ def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions):
     # Open the file using rasterio
     with rasterio.open(WorldPop_inputfile) as src:
         # --- Process the pixels in the image for population data ---
+
         # Read the gray layer (1) to get an np.array of this band
         # Rasterio doesn't support lower than float32 readout
-        # Hence np_pop_raster will have nbytes = 4 * shape[0] * shape[1]
-
+        # Hence np_pop_raster will have nbytes = 4 * width * height
         np_pop_raster = src.read(1, window=current_window)
 
-        # Set np_pop_valid to all pixels in np_pop_raster that aren't 'nodata'
-        np_pop_valid = np_pop_raster[np_pop_raster != src.nodata]
-
         # Set 'nodata' values to 0
         np_pop_raster[np_pop_raster == src.nodata] = 0
 
-        # Set np_pop_xy to pixel locations of non zero values (same order as np_pop_valid)
+        # Set np_pop_xy to pixel locations of non zero values
         np_pop_xy = np_pop_raster.nonzero()
 
         # Transform to get [ x, y ] array
@@ -776,6 +773,9 @@ def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions):
         # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.uintc
         np_pop_xy = np.array([np_pop_xy[0], np_pop_xy[1]]).T.astype("I")
 
+        # Extract the values from the locations of non zero pixels
+        np_pop_valid = np_pop_raster[np_pop_xy.T[0], np_pop_xy.T[1]]
+
     return np_pop_valid, np_pop_xy
 
 
@@ -890,6 +890,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
     expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim
 
     # Introduce a max byte size to avoid overfilling RAM
+    # Ensure worldpop_byte_limit > 883 * 10**6 (minimum memory for 'US')
     worldpop_byte_limit = 3096 * 10**6
 
     # If the rasterio read will be within byte limit
@@ -914,7 +915,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
         if out_logging:
             logger.info(
                 "Stage 3 of 4: compute_population for "
-                + str(country_rows["country"][0])
+                + str(country_rows.iloc[0]["country"])
                 + ": Expected size of file readout was "
                 + str(expected_bytes_input_read // 10**6)
                 + " Megabytes. As the limit is "
@@ -945,26 +946,31 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
         worldpop_y_dim, worldpop_x_dim = src.shape
         block_y_dim, block_x_dim = src.block_shapes[0]
 
-    # Calculate the bytes for reading one block from the image (float32)
-    read_block_size = 4 * block_y_dim * block_x_dim
+    # Set the windows x dimension to the input x dimension (width)
+    window_x_dim = worldpop_x_dim
+    # From testing we can assume max x dimension will always fit in memory:
+    #   Largest memory requirement is 'US' at ~882.1 MB = 4 * 512 * window_x_dim
+    #   Hence worldpop_byte_limit has to be greater than 883 MB
+
+    # As the window spans the x dimension, set column offset to 0
+    window_col_off = 0
+
+    # Calculate the bytes for reading the window using window_x_dim (float32)
+    read_block_size = 4 * block_y_dim * window_x_dim
 
     # Calculate the amount of blocks that fit into the memory budget
+    # Using the calculated x dimension
     window_block_count = worldpop_byte_limit // read_block_size
 
-    # The input files have blocks that vary the x dimension and keep y dimension to 512
-    # Hence set the dimension of the windows to the same x dimension (width)
-    window_x_dim = block_x_dim
-    window_col_off = 0
-
     # Multiply the y_dimension by the amount of blocks in the window
     # window_y_dim will be height of the window
     window_y_dim = window_block_count * block_y_dim
 
     # Calculate the y ranges of the blocks to scan the image
     # y_range_start will serve as row offset
-    y_range_start = np.arange(0, worldpop_y_dim, window_y_dim)
+    window_row_off = np.arange(0, worldpop_y_dim, window_y_dim)
 
-    for row_off in y_range_start:
+    for row_off in window_row_off:
         window_dimensions = [window_col_off, row_off, window_x_dim, window_y_dim]
 
         print("Running window: ", window_dimensions)
@@ -973,6 +979,10 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
             WorldPop_inputfile, window_dimensions
         )
 
+        # If no values are present in the current window skip the remaining steps
+        if len(np_pop_val) == 0:
+            continue
+
         # get the geomask with id mappings
         region_geomask, id_mapping = compute_geomask_region(
             country_rows, transform, window_dimensions, windowed=True
@@ -985,9 +995,9 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
 
         # Loop the regions and write population to df_pop_count
         for i in range(len(windowed_pop_count)):
-            pop_count, gadm_id = windowed_pop_count.iloc[i]
+            gadm_id, pop_count = windowed_pop_count.iloc[i]
             # Select the row with the same "GADM_ID" and set the population count
-            df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] = pop_count
+            df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] += pop_count
 
     return df_pop_count
 
@@ -1023,6 +1033,7 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
 
     df_pop_count = pd.DataFrame(np_pop_count, columns=["pop"])
     df_pop_count["GADM_ID"] = np.append(np.array("NaN"), id_mapping.values)
+    df_pop_count = df_pop_count[["GADM_ID", "pop"]]
 
     return df_pop_count
 
@@ -1084,7 +1095,7 @@ def add_population_data(
 
             # Loop the regions and write population to df_gadm
             for i in range(len(df_pop_count)):
-                pop_count, gadm_id = df_pop_count.iloc[i]
+                gadm_id, pop_count = df_pop_count.iloc[i]
                 # Select the row with the same "GADM_ID" and set the population count
                 df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count
 

From 369084a9f5abce1a40044baa0ececf6a8550b403 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Sat, 1 Apr 2023 00:19:08 +0200
Subject: [PATCH 08/29] Kosovo naming difference GADM

---
 scripts/build_shapes.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index d9c58dc05..e20a7de48 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -77,8 +77,11 @@ def download_GADM(country_code, update=False, out_logging=False):
     gpkg file per country
 
     """
+    if country_code == "XK":
+        GADM_filename = f"gadm41_XKO"
+    else:
+        GADM_filename = f"gadm41_{two_2_three_digits_country(country_code)}"
 
-    GADM_filename = f"gadm41_{two_2_three_digits_country(country_code)}"
     GADM_url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/{GADM_filename}.gpkg"
 
     GADM_inputfile_gpkg = os.path.join(

From ad38dae436eb62b636c33a44fcafab9e4d964521 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Sun, 2 Apr 2023 21:07:32 +0200
Subject: [PATCH 09/29] gadm_layer_id bug fix

---
 scripts/build_shapes.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index e20a7de48..47b6d9127 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -177,6 +177,9 @@ def get_GADM_layer(
     geodf_list = []
 
     for country_code in country_list:
+        # Set the current layer id (cur_layer_id) to global layer_id
+        cur_layer_id = layer_id
+
         # download file gpkg
         file_gpkg, name_file = download_GADM(country_code, update, outlogging)
 
@@ -184,18 +187,18 @@ def get_GADM_layer(
         list_layers = fiona.listlayers(file_gpkg)
 
         # get layer name
-        if (layer_id < 0) or (layer_id >= len(list_layers)):
+        if (cur_layer_id < 0) or (cur_layer_id >= len(list_layers)):
             # when layer id is negative or larger than the number of layers, select the last layer
-            layer_id = len(list_layers) - 1
+            cur_layer_id = len(list_layers) - 1
 
         # read gpkg file
-        geodf_temp = gpd.read_file(file_gpkg, layer="ADM_ADM_" + str(layer_id)).to_crs(
-            geo_crs
-        )
+        geodf_temp = gpd.read_file(
+            file_gpkg, layer="ADM_ADM_" + str(cur_layer_id), engine="pyogrio"
+        ).to_crs(geo_crs)
 
         geodf_temp = filter_gadm(
             geodf=geodf_temp,
-            layer=layer_id,
+            layer=cur_layer_id,
             cc=country_code,
             contended_flag=contended_flag,
             output_nonstd_to_csv=False,
@@ -203,7 +206,7 @@ def get_GADM_layer(
 
         # create a subindex column that is useful
         # in the GADM processing of sub-national zones
-        geodf_temp["GADM_ID"] = geodf_temp[f"GID_{layer_id}"]
+        geodf_temp["GADM_ID"] = geodf_temp[f"GID_{cur_layer_id}"]
 
         # append geodataframes
         geodf_list.append(geodf_temp)

From 7524b98f8fe662af90325b063b154fa6112cff39 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Sun, 2 Apr 2023 22:44:19 +0200
Subject: [PATCH 10/29] intro numba ~14x speedup sum_values_using_geomask

---
 scripts/build_shapes.py | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 47b6d9127..741763ef9 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -44,7 +44,9 @@
 import cProfile
 import os
 
-import psutil
+from numba import njit
+from numba.core import types
+from numba.typed import Dict
 
 
 # Function for profiling functions [temporary]
@@ -1016,7 +1018,11 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
 
     """
     # Initialize a dictionary
-    dict_id = {0: 0}
+    dict_id = Dict.empty(
+        key_type=types.int64,
+        value_type=types.int64,
+    )
+    dict_id[0] = 0
     counter = 1
     # Loop over ip mapping and add indicies to the dictionary
     for ID_index in np.array(id_mapping.index):
@@ -1026,6 +1032,28 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
     # Declare an array to contain population counts
     np_pop_count = np.zeros(len(id_mapping) + 1)
 
+    np_pop_count = loop_and_extact_val_x_y(
+        np_pop_count, np_pop_val, np_pop_xy, region_geomask, dict_id
+    )
+
+    df_pop_count = pd.DataFrame(np_pop_count, columns=["pop"])
+    df_pop_count["GADM_ID"] = np.append(np.array("NaN"), id_mapping.values)
+    df_pop_count = df_pop_count[["GADM_ID", "pop"]]
+
+    return df_pop_count
+
+
+@njit
+def loop_and_extact_val_x_y(
+    np_pop_count, np_pop_val, np_pop_xy, region_geomask, dict_id
+):
+    """
+    Function
+
+
+
+    """
+
     # Loop the population data
     for i in range(len(np_pop_val)):
         cur_value = np_pop_val[i]
@@ -1037,11 +1065,7 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
         # Add the current value to the population
         np_pop_count[dict_id[cur_id]] += cur_value
 
-    df_pop_count = pd.DataFrame(np_pop_count, columns=["pop"])
-    df_pop_count["GADM_ID"] = np.append(np.array("NaN"), id_mapping.values)
-    df_pop_count = df_pop_count[["GADM_ID", "pop"]]
-
-    return df_pop_count
+    return np_pop_count
 
 
 def add_population_data(

From 2ba62a60a6121074b95d42b2d9f995a640cc505b Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Mon, 3 Apr 2023 23:49:13 +0200
Subject: [PATCH 11/29] Documentation update 1

---
 scripts/build_shapes.py | 86 ++++++++++++++++++++++++++++-------------
 1 file changed, 59 insertions(+), 27 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 741763ef9..02d8e8f4a 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -745,17 +745,21 @@ def get_worldpop_features(WorldPop_inputfile):
         src.meta["transform"]: Representation of the affine transform
         src.shape: Dimensions of the input image
     """
-    # Open the file using rasterio
     with rasterio.open(WorldPop_inputfile) as src:
         return src.meta["transform"], src.shape
 
 
 def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions):
     """
-    Function
-
-
-
+    Function to extract data from .tif input file
+    -------
+    Inputs:
+        WorldPop_inputfile: file location of worldpop file
+        window_dimensions: dimensions of window used when reading file
+    --------
+    Outputs:
+        np_pop_valid: array filled with values for each nonzero pixel in the worldpop file
+        np_pop_xy: array with [x,y] coordinates of the corresponding nonzero values in np_pop_valid
     """
     col_off, row_off, width, height = window_dimensions
 
@@ -791,10 +795,19 @@ def compute_geomask_region(
     country_rows, affine_transform, window_dimensions, windowed=False
 ):
     """
-    Function
-
-
-
+    Function to mask geometries into np_map_ID using an incrementing counter
+    -------
+    Inputs:
+        country_rows: geoDataFrame filled with geometries and their GADM_ID
+        affine_transform: affine transform of current image
+        window_dimensions: dimensions of window used when reading file
+        windowed: True if called during windowed processing (default = False)
+    --------
+    Outputs:
+        np_map_ID.astype("H"): np_map_ID contains an ID for each location (undefined is 0)
+            dimensions are taken from window_dimensions, .astype("H") for memory savings
+        pd.DataFrame(id_to_GADM_ID).set_index(0):
+            DataFrame of the mapping from id (from counter) to GADM_ID
     """
     col_off, row_off, x_axis_len, y_axis_len = window_dimensions
 
@@ -823,7 +836,7 @@ def compute_geomask_region(
         )
 
     # Set an empty numpy array with the dimensions of the country .tif file
-    # np_map_ID will contain a ID for each location (undefined is 0)
+    # np_map_ID will contain an ID for each location (undefined is 0)
     # ID corresponds to a specific geometry in country_rows
     np_map_ID = np.zeros((y_axis_len, x_axis_len))
 
@@ -872,21 +885,16 @@ def compute_geomask_region(
 def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
     """
     Function computes the population for the given country rows
-
+    -------
     Inputs:
-        -------
-        country_rows:
-
-        WorldPop_inputfile:
-
-
+        country_rows: geoDataFrame filled with geometries and their GADM_ID
+        WorldPop_inputfile: file location of worldpop file
+        out_logging: Boolean to enable logging
+    --------
     Outputs:
-        --------
         df_pop_count: Dataframe with columns
             - "pop" containing population of GADM_ID region
             - "GADM_ID"
-
-
     """
     # Get the features of the worldpop input file
     transform, worldpop_dim = get_worldpop_features(WorldPop_inputfile)
@@ -941,8 +949,16 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
 def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_limit):
     """
     Function
-
-
+    -------
+    Inputs:
+        country_rows: geoDataFrame filled with geometries and their GADM_ID
+        WorldPop_inputfile: file location of worldpop file
+        worldpop_byte_limit: Memory limit which determines the window size
+    --------
+    Outputs:
+        df_pop_count: Dataframe with columns
+            - "pop" containing population of GADM_ID region
+            - "GADM_ID"
 
     """
     # Create a dataframe to store the population data
@@ -1013,8 +1029,17 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
 def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
     """
     Function
-
-
+    -------
+    Inputs:
+        np_pop_val: array filled with values for each nonzero pixel in the worldpop file
+        np_pop_xy: array with [x,y] coordinates of the corresponding nonzero values in np_pop_valid
+        region_geomask:
+        id_mapping:
+    --------
+    Outputs:
+        df_pop_count: Dataframe with columns
+            - "GADM_ID"
+            - "pop" containing population of GADM_ID region
 
     """
     # Initialize a dictionary
@@ -1049,9 +1074,16 @@ def loop_and_extact_val_x_y(
 ):
     """
     Function
-
-
-
+    -------
+    Inputs:
+        np_pop_count:
+        np_pop_val:
+        np_pop_xy:
+        region_geomask:
+        dict_id:
+    --------
+    Outputs:
+        np_pop_count: np.array containing population counts
     """
 
     # Loop the population data

From 5517827f712505342cb5f565844fe58de73f75e1 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Thu, 13 Apr 2023 18:22:41 +0200
Subject: [PATCH 12/29] Removal of profile function

---
 scripts/build_shapes.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 02d8e8f4a..14114d92f 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -40,29 +40,11 @@
 
 sets_path_to_root("pypsa-earth")
 
-# Imports for profiling [temporary]
-import cProfile
-import os
-
 from numba import njit
 from numba.core import types
 from numba.typed import Dict
 
 
-# Function for profiling functions [temporary]
-def profile(func):
-    """Decorator for run function profile"""
-
-    def wrapper(*args, **kwargs):
-        profile_filename = func.__name__ + ".prof"
-        profiler = cProfile.Profile()
-        result = profiler.runcall(func, *args, **kwargs)
-        profiler.dump_stats(profile_filename)
-        return result
-
-    return wrapper
-
-
 def download_GADM(country_code, update=False, out_logging=False):
     """
     Download gpkg file from GADM for a given country code

From b41bdc8f773b0e5c555c9b5393e1ed2eccaf20d4 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Sun, 16 Apr 2023 14:53:28 +0200
Subject: [PATCH 13/29] Split download worldpop, logging stages update

---
 scripts/build_shapes.py | 72 ++++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 30 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 14114d92f..52486bb93 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -79,7 +79,7 @@ def download_GADM(country_code, update=False, out_logging=False):
     if not os.path.exists(GADM_inputfile_gpkg) or update is True:
         if out_logging:
             logger.warning(
-                f"Stage 4 of 4: {GADM_filename} of country {two_digits_2_name_country(country_code)} does not exist, downloading to {GADM_inputfile_gpkg}"
+                f"Stage 5 of 5: {GADM_filename} of country {two_digits_2_name_country(country_code)} does not exist, downloading to {GADM_inputfile_gpkg}"
             )
         #  create data/osm directory
         os.makedirs(os.path.dirname(GADM_inputfile_gpkg), exist_ok=True)
@@ -224,7 +224,7 @@ def countries(countries, geo_crs, contended_flag, update=False, out_logging=Fals
     "Create country shapes"
 
     if out_logging:
-        logger.info("Stage 1 of 4: Create country shapes")
+        logger.info("Stage 1 of 5: Create country shapes")
 
     # download data if needed and get the layer id 0, corresponding to the countries
     df_countries = get_GADM_layer(
@@ -252,7 +252,7 @@ def countries(countries, geo_crs, contended_flag, update=False, out_logging=Fals
 
 def country_cover(country_shapes, eez_shapes=None, out_logging=False, distance=0.1):
     if out_logging:
-        logger.info("Stage 3 of 4: Merge country shapes to create continent shape")
+        logger.info("Stage 3 of 5: Merge country shapes to create continent shape")
 
     shapes = country_shapes.apply(lambda x: x.buffer(distance))
     shapes_list = list(shapes)
@@ -331,7 +331,7 @@ def eez(
     """
 
     if out_logging:
-        logger.info("Stage 2 of 4: Create offshore shapes")
+        logger.info("Stage 2 of 5: Create offshore shapes")
 
     # load data
     df_eez = load_EEZ(countries, geo_crs, EEZ_gpkg)
@@ -432,8 +432,6 @@ def download_WorldPop_standard(
     WorldPop_filename : str
         Name of the file
     """
-    if out_logging:
-        logger.info("Stage 3 of 4: Download WorldPop datasets (standard)")
 
     if country_code == "XK":
         WorldPop_filename = f"srb_ppp_{year}_UNadj_constrained.tif"
@@ -456,7 +454,7 @@ def download_WorldPop_standard(
     if not os.path.exists(WorldPop_inputfile) or update is True:
         if out_logging:
             logger.warning(
-                f"Stage 3 of 4: {WorldPop_filename} does not exist, downloading to {WorldPop_inputfile}"
+                f"Stage 3 of 5: {WorldPop_filename} does not exist, downloading to {WorldPop_inputfile}"
             )
         #  create data/osm directory
         os.makedirs(os.path.dirname(WorldPop_inputfile), exist_ok=True)
@@ -470,7 +468,7 @@ def download_WorldPop_standard(
                         loaded = True
                         break
         if not loaded:
-            logger.error(f"Stage 3 of 4: Impossible to download {WorldPop_filename}")
+            logger.error(f"Stage 3 of 5: Impossible to download {WorldPop_filename}")
 
     return WorldPop_inputfile, WorldPop_filename
 
@@ -499,8 +497,6 @@ def download_WorldPop_API(
     WorldPop_filename : str
         Name of the file
     """
-    if out_logging:
-        logger.info("Stage 3 of 4: Download WorldPop datasets (API)")
 
     WorldPop_filename = f"{two_2_three_digits_country(country_code).lower()}_ppp_{year}_UNadj_constrained.tif"
     # Request to get the file
@@ -524,7 +520,7 @@ def download_WorldPop_API(
                     loaded = True
                     break
     if not loaded:
-        logger.error(f"Stage 3 of 4: Impossible to download {WorldPop_filename}")
+        logger.error(f"Stage 3 of 5: Impossible to download {WorldPop_filename}")
 
     return WorldPop_inputfile, WorldPop_filename
 
@@ -536,7 +532,7 @@ def convert_GDP(name_file_nc, year=2015, out_logging=False):
     """
 
     if out_logging:
-        logger.info("Stage 4 of 4: Access to GDP raster data")
+        logger.info("Stage 5 of 5: Access to GDP raster data")
 
     # tif namefile
     name_file_tif = name_file_nc[:-2] + "tif"
@@ -563,7 +559,7 @@ def convert_GDP(name_file_nc, year=2015, out_logging=False):
     if year not in list_years:
         if out_logging:
             logger.warning(
-                f"Stage 4 of 4 GDP data of year {year} not found, selected the most recent data ({int(list_years[-1])})"
+                f"Stage 5 of 5 GDP data of year {year} not found, selected the most recent data ({int(list_years[-1])})"
             )
         year = float(list_years[-1])
 
@@ -587,7 +583,7 @@ def load_GDP(
     """
 
     if out_logging:
-        logger.info("Stage 4 of 4: Access to GDP raster data")
+        logger.info("Stage 5 of 5: Access to GDP raster data")
 
     # path of the nc file
     name_file_tif = name_file_nc[:-2] + "tif"
@@ -598,7 +594,7 @@ def load_GDP(
     if update | (not os.path.exists(GDP_tif)):
         if out_logging:
             logger.warning(
-                f"Stage 4 of 4: File {name_file_tif} not found, the file will be produced by processing {name_file_nc}"
+                f"Stage 5 of 5: File {name_file_tif} not found, the file will be produced by processing {name_file_nc}"
             )
         convert_GDP(name_file_nc, year, out_logging)
 
@@ -660,7 +656,7 @@ def add_gdp_data(
         - Includes a new column ["gdp"]
     """
     if out_logging:
-        logger.info("Stage 4 of 4: Add gdp data to GADM GeoDataFrame")
+        logger.info("Stage 5 of 5: Add gdp data to GADM GeoDataFrame")
 
     # initialize new gdp column
     df_gadm["gdp"] = 0.0
@@ -912,7 +908,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
     else:
         if out_logging:
             logger.info(
-                "Stage 3 of 4: compute_population for "
+                "Stage 4 of 5: compute_population for "
                 + str(country_rows.iloc[0]["country"])
                 + ": Expected size of file readout was "
                 + str(expected_bytes_input_read // 10**6)
@@ -1109,32 +1105,48 @@ def add_population_data(
         - Includes a new column ["pop"]
     """
 
-    if out_logging:
-        logger.info("Stage 3 of 4: Add population data to GADM GeoDataFrame")
-
-    # initialize new population column
+    # Initialize new population column
     df_gadm["pop"] = 0.0
 
-    tqdm_kwargs = dict(
+    # Initialize new dict to hold worldpop_inputfile strings
+    dict_worldpop_file_locations = {}
+
+    if out_logging:
+        logger.info(
+            "Stage 3 of 5: Download WorldPop datasets, method: " + str(worldpop_method)
+        )
+
+    tqdm_kwargs_download = dict(
         ascii=False,
-        desc="Compute population ",
+        desc="Downloading worldpop file per country",
     )
-    with tqdm(total=len(country_codes), **tqdm_kwargs) as pbar:
+    with tqdm(total=len(country_codes), **tqdm_kwargs_download) as pbar:
         for c_code in country_codes:
-            # get subset by country code
-            country_rows = df_gadm.loc[df_gadm["country"] == c_code]
-
             # Download worldpop image (if required) and get file location
             WorldPop_inputfile, WorldPop_filename = download_WorldPop(
                 c_code, worldpop_method, year, update, out_logging
             )
+            dict_worldpop_file_locations[c_code] = WorldPop_inputfile
+            pbar.update(1)
+
+    if out_logging:
+        logger.info("Stage 4 of 5: Add population data to GADM GeoDataFrame")
+
+    tqdm_kwargs_compute = dict(
+        ascii=False,
+        desc="Compute population per country",
+    )
+    with tqdm(total=len(country_codes), **tqdm_kwargs_compute) as pbar:
+        for c_code in country_codes:
+            # get subset by country code
+            country_rows = df_gadm.loc[df_gadm["country"] == c_code]
 
             if out_logging:
-                logger.info("Stage 3 of 4: Calculating population of " + str(c_code))
+                logger.info("Stage 4 of 5: Calculating population of " + str(c_code))
 
             # Calculate the population for each geometry given in country_rows
             df_pop_count = compute_population(
-                country_rows, WorldPop_inputfile, out_logging
+                country_rows, dict_worldpop_file_locations[c_code], out_logging
             )
 
             # Loop the regions and write population to df_gadm
@@ -1160,7 +1172,7 @@ def gadm(
     nchunks=None,
 ):
     if out_logging:
-        logger.info("Stage 3 of 4: Creation GADM GeoDataFrame")
+        logger.info("Stage 3 of 5: Creation GADM GeoDataFrame")
 
     # download data if needed and get the desired layer_id
     df_gadm = get_GADM_layer(countries, layer_id, geo_crs, contended_flag, update)

From 30315f36dd8d7abe125fd98104a8807f18cb9641 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Sun, 16 Apr 2023 17:49:17 +0200
Subject: [PATCH 14/29] tqdm progress bar shows progress windowed approach

---
 scripts/build_shapes.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 52486bb93..b38167a6e 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -843,9 +843,9 @@ def compute_geomask_region(
         temp_mask = rasterio.features.geometry_mask(
             [cur_geometry],
             (y_axis_len, x_axis_len),
-            affine_transform,
-            invert=True,
+            transform=affine_transform,
             all_touched=True,
+            invert=True,
         )
 
         # Map the values of counter value to np_map_ID
@@ -860,7 +860,7 @@ def compute_geomask_region(
     return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0)
 
 
-def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
+def compute_population(country_rows, WorldPop_inputfile, pbar, out_logging=False):
     """
     Function computes the population for the given country rows
     -------
@@ -904,6 +904,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
         df_pop_count = sum_values_using_geomask(
             np_pop_val, np_pop_xy, country_geomask, id_mapping
         )
+        pbar.update(1)
 
     else:
         if out_logging:
@@ -918,13 +919,15 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
             )
         # Calculate the population using windows
         df_pop_count = windowed_compute_population(
-            country_rows, WorldPop_inputfile, worldpop_byte_limit
+            country_rows, WorldPop_inputfile, worldpop_byte_limit, pbar
         )
 
     return df_pop_count
 
 
-def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_limit):
+def windowed_compute_population(
+    country_rows, WorldPop_inputfile, worldpop_byte_limit, pbar
+):
     """
     Function
     -------
@@ -972,11 +975,12 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
     # y_range_start will serve as row offset
     window_row_off = np.arange(0, worldpop_y_dim, window_y_dim)
 
+    # Calculate the percentage each window is of the image for pbar.update
+    window_percentage_of_image = 1 / len(window_row_off)
+
     for row_off in window_row_off:
         window_dimensions = [window_col_off, row_off, window_x_dim, window_y_dim]
 
-        print("Running window: ", window_dimensions)
-
         np_pop_val, np_pop_xy = get_worldpop_val_xy(
             WorldPop_inputfile, window_dimensions
         )
@@ -1001,6 +1005,8 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
             # Select the row with the same "GADM_ID" and set the population count
             df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] += pop_count
 
+        pbar.update(window_percentage_of_image)
+
     return df_pop_count
 
 
@@ -1135,6 +1141,7 @@ def add_population_data(
     tqdm_kwargs_compute = dict(
         ascii=False,
         desc="Compute population per country",
+        bar_format="{desc}: {percentage:.3f}%|{bar}| {n:.3f}/{total_fmt} [{elapsed}<{remaining}",
     )
     with tqdm(total=len(country_codes), **tqdm_kwargs_compute) as pbar:
         for c_code in country_codes:
@@ -1146,7 +1153,7 @@ def add_population_data(
 
             # Calculate the population for each geometry given in country_rows
             df_pop_count = compute_population(
-                country_rows, dict_worldpop_file_locations[c_code], out_logging
+                country_rows, dict_worldpop_file_locations[c_code], pbar, out_logging
             )
 
             # Loop the regions and write population to df_gadm
@@ -1155,8 +1162,6 @@ def add_population_data(
                 # Select the row with the same "GADM_ID" and set the population count
                 df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count
 
-            pbar.update(1)
-
 
 def gadm(
     worldpop_method,

From 5f59ce11097ccbd70cb9ae28bdaddf58290a807e Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Sun, 16 Apr 2023 23:56:22 +0200
Subject: [PATCH 15/29] Multiprocessing approach 0.5

---
 scripts/build_shapes.py | 88 +++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 35 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index b38167a6e..c42949725 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -676,9 +676,13 @@ def add_gdp_data(
     return df_gadm
 
 
-def _init_process_pop(df_gadm_, year_, worldpop_method_):
-    global df_gadm, year, worldpop_method
-    df_gadm, year, worldpop_method = df_gadm_, year_, worldpop_method_
+def _init_process_pop(df_gadm_, dict_worldpop_file_locations_, out_logging_):
+    global df_gadm, dict_worldpop_file_locations, out_logging
+    df_gadm, dict_worldpop_file_locations, out_logging = (
+        df_gadm_,
+        dict_worldpop_file_locations_,
+        out_logging_,
+    )
 
 
 # Auxiliary function to calculate population data in a parallel way
@@ -705,6 +709,20 @@ def _process_func_pop(gadm_idxs):
     return df_gadm_subset
 
 
+def _process_function_population(c_code):
+    # get subset by country code
+    country_rows = df_gadm.loc[df_gadm["country"] == c_code]
+
+    if out_logging:
+        logger.info("Stage 4 of 5: Calculating population of " + str(c_code))
+
+    # Calculate the population for each geometry given in country_rows
+    df_pop_count = compute_population(
+        country_rows, dict_worldpop_file_locations[c_code], out_logging
+    )
+    return df_pop_count
+
+
 # Auxiliary function to download WorldPop data in a parallel way
 def _process_func_download_pop(c_code):
     WorldPop_inputfile, WorldPop_filename = download_WorldPop(
@@ -860,7 +878,7 @@ def compute_geomask_region(
     return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0)
 
 
-def compute_population(country_rows, WorldPop_inputfile, pbar, out_logging=False):
+def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
     """
     Function computes the population for the given country rows
     -------
@@ -885,7 +903,7 @@ def compute_population(country_rows, WorldPop_inputfile, pbar, out_logging=False
 
     # Introduce a max byte size to avoid overfilling RAM
     # Ensure worldpop_byte_limit > 883 * 10**6 (minimum memory for 'US')
-    worldpop_byte_limit = 3096 * 10**6
+    worldpop_byte_limit = 1024 * 10**6
 
     # If the rasterio read will be within byte limit
     if expected_bytes_input_read < worldpop_byte_limit:
@@ -904,7 +922,6 @@ def compute_population(country_rows, WorldPop_inputfile, pbar, out_logging=False
         df_pop_count = sum_values_using_geomask(
             np_pop_val, np_pop_xy, country_geomask, id_mapping
         )
-        pbar.update(1)
 
     else:
         if out_logging:
@@ -919,15 +936,13 @@ def compute_population(country_rows, WorldPop_inputfile, pbar, out_logging=False
             )
         # Calculate the population using windows
         df_pop_count = windowed_compute_population(
-            country_rows, WorldPop_inputfile, worldpop_byte_limit, pbar
+            country_rows, WorldPop_inputfile, worldpop_byte_limit
         )
 
     return df_pop_count
 
 
-def windowed_compute_population(
-    country_rows, WorldPop_inputfile, worldpop_byte_limit, pbar
-):
+def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_limit):
     """
     Function
     -------
@@ -1005,8 +1020,6 @@ def windowed_compute_population(
             # Select the row with the same "GADM_ID" and set the population count
             df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] += pop_count
 
-        pbar.update(window_percentage_of_image)
-
     return df_pop_count
 
 
@@ -1138,29 +1151,34 @@ def add_population_data(
     if out_logging:
         logger.info("Stage 4 of 5: Add population data to GADM GeoDataFrame")
 
-    tqdm_kwargs_compute = dict(
-        ascii=False,
-        desc="Compute population per country",
-        bar_format="{desc}: {percentage:.3f}%|{bar}| {n:.3f}/{total_fmt} [{elapsed}<{remaining}",
-    )
-    with tqdm(total=len(country_codes), **tqdm_kwargs_compute) as pbar:
-        for c_code in country_codes:
-            # get subset by country code
-            country_rows = df_gadm.loc[df_gadm["country"] == c_code]
-
-            if out_logging:
-                logger.info("Stage 4 of 5: Calculating population of " + str(c_code))
-
-            # Calculate the population for each geometry given in country_rows
-            df_pop_count = compute_population(
-                country_rows, dict_worldpop_file_locations[c_code], pbar, out_logging
-            )
-
-            # Loop the regions and write population to df_gadm
-            for i in range(len(df_pop_count)):
-                gadm_id, pop_count = df_pop_count.iloc[i]
-                # Select the row with the same "GADM_ID" and set the population count
-                df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count
+    lock = mp.Lock()
+    kwargs = {
+        "initializer": _init_process_pop,
+        "initargs": (
+            df_gadm,
+            dict_worldpop_file_locations,
+            out_logging,
+        ),
+        "processes": 2,
+    }
+    # Spawn processes with the parameters from kwargs
+    with mp.get_context("spawn").Pool(**kwargs) as pool:
+        # Create a progress bar
+        with tqdm(
+            total=len(country_codes), desc="Compute population per country"
+        ) as pbar:
+            # Give the pool a workload
+            for df_pop_count in pool.imap_unordered(
+                _process_function_population, country_codes
+            ):
+                # Acquire the lock before accessing df_gadm
+                with lock:
+                    # Loop the regions and write population to df_gadm
+                    for i in range(len(df_pop_count)):
+                        gadm_id, pop_count = df_pop_count.iloc[i]
+                        # Select the row with the same "GADM_ID" and set the population count
+                        df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count
+                pbar.update(1)
 
 
 def gadm(

From 9da3b26e676352ede45c00bd349a58060f63321b Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Mon, 17 Apr 2023 22:51:21 +0200
Subject: [PATCH 16/29] Env update

---
 envs/environment.fixed.yaml | 3 ++-
 envs/environment.yaml       | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/envs/environment.fixed.yaml b/envs/environment.fixed.yaml
index ac532f167..351b16f39 100644
--- a/envs/environment.fixed.yaml
+++ b/envs/environment.fixed.yaml
@@ -260,7 +260,7 @@ dependencies:
 - notebook-shim=0.1.0
 - nspr=4.32
 - nss=3.78
-- numba=0.53.1
+- numba=0.56.4
 - numexpr=2.8.3
 - numpy=1.23.2
 - openjpeg=2.4.0
@@ -306,6 +306,7 @@ dependencies:
 - pyct=0.4.6
 - pyct-core=0.4.6
 - pygments=2.13.0
+- pyogrio=0.5.1
 - pyomo=6.4.2
 - pyopenssl=22.0.0
 - pyparsing=3.0.9
diff --git a/envs/environment.yaml b/envs/environment.yaml
index 480767d0b..b4755a554 100644
--- a/envs/environment.yaml
+++ b/envs/environment.yaml
@@ -43,6 +43,8 @@ dependencies:
 - matplotlib<=3.5.2
 - reverse-geocode
 - country_converter
+- pyogyrio
+- numba
 
   # Keep in conda environment when calling ipython
 - ipython

From 2f3b6a15b08b11b72cc9b0c3fc98a0b401c7858c Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Mon, 17 Apr 2023 22:58:16 +0200
Subject: [PATCH 17/29] Env update - misstyped pyogrio

---
 envs/environment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/envs/environment.yaml b/envs/environment.yaml
index b4755a554..9b8447928 100644
--- a/envs/environment.yaml
+++ b/envs/environment.yaml
@@ -43,7 +43,7 @@ dependencies:
 - matplotlib<=3.5.2
 - reverse-geocode
 - country_converter
-- pyogyrio
+- pyogrio
 - numba
 
   # Keep in conda environment when calling ipython

From 418207e28d023382afe3c21460e08c9595e2d822 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Tue, 18 Apr 2023 00:31:32 +0200
Subject: [PATCH 18/29] Adding mem_mb, integrating into compute_population

---
 Snakefile               |  2 +-
 scripts/build_shapes.py | 46 ++++++++++++++++++++++++++++-------------
 2 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/Snakefile b/Snakefile
index 9bf6bed82..dfab7f120 100644
--- a/Snakefile
+++ b/Snakefile
@@ -221,7 +221,7 @@ rule build_shapes:
         "benchmarks/" + RDIR + "build_shapes"
     threads: 1
     resources:
-        mem_mb=500,
+        mem_mb=2048,
     script:
         "scripts/build_shapes.py"
 
diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 113df61e7..7ea579546 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -678,11 +678,14 @@ def add_gdp_data(
     return df_gadm
 
 
-def _init_process_pop(df_gadm_, dict_worldpop_file_locations_, out_logging_):
-    global df_gadm, dict_worldpop_file_locations, out_logging
-    df_gadm, dict_worldpop_file_locations, out_logging = (
+def _init_process_pop(
+    df_gadm_, dict_worldpop_file_locations_, mem_read_limit_per_process_, out_logging_
+):
+    global df_gadm, dict_worldpop_file_locations, mem_read_limit_per_process, out_logging
+    df_gadm, dict_worldpop_file_locations, mem_read_limit_per_process, out_logging = (
         df_gadm_,
         dict_worldpop_file_locations_,
+        mem_read_limit_per_process_,
         out_logging_,
     )
 
@@ -720,7 +723,10 @@ def _process_function_population(c_code):
 
     # Calculate the population for each geometry given in country_rows
     df_pop_count = compute_population(
-        country_rows, dict_worldpop_file_locations[c_code], out_logging
+        country_rows,
+        dict_worldpop_file_locations[c_code],
+        mem_read_limit_per_process,
+        out_logging,
     )
     return df_pop_count
 
@@ -880,7 +886,9 @@ def compute_geomask_region(
     return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0)
 
 
-def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
+def compute_population(
+    country_rows, WorldPop_inputfile, mem_read_limit_per_process, out_logging=False
+):
     """
     Function computes the population for the given country rows
     -------
@@ -905,7 +913,7 @@ def compute_population(country_rows, WorldPop_inputfile, out_logging=False):
 
     # Introduce a max byte size to avoid overfilling RAM
     # Ensure worldpop_byte_limit > 883 * 10**6 (minimum memory for 'US')
-    worldpop_byte_limit = 1024 * 10**6
+    worldpop_byte_limit = max(883, mem_read_limit_per_process) * 10**6
 
     # If the rasterio read will be within byte limit
     if expected_bytes_input_read < worldpop_byte_limit:
@@ -1106,6 +1114,7 @@ def add_population_data(
     year=2020,
     update=False,
     out_logging=False,
+    mem_read_limit_per_process=1024,
     nprocesses=2,
     nchunks=2,
     disable_progressbar=False,
@@ -1159,6 +1168,7 @@ def add_population_data(
         "initargs": (
             df_gadm,
             dict_worldpop_file_locations,
+            mem_read_limit_per_process,
             out_logging,
         ),
         "processes": 2,
@@ -1189,6 +1199,7 @@ def gadm(
     countries,
     geo_crs,
     contended_flag,
+    mem_mb,
     layer_id=2,
     update=False,
     out_logging=False,
@@ -1213,6 +1224,7 @@ def gadm(
     )
 
     if worldpop_method != False:
+        mem_read_limit_per_process = mem_mb / nprocesses
         # add the population data to the dataset
         add_population_data(
             df_gadm,
@@ -1221,6 +1233,7 @@ def gadm(
             year,
             update,
             out_logging,
+            mem_read_limit_per_process,
             nprocesses=nprocesses,
             nchunks=nchunks,
         )
@@ -1263,18 +1276,22 @@ def gadm(
 
     out = snakemake.output
 
+    EEZ_gpkg = snakemake.input["eez"]
+    mem_mb = snakemake.resources["mem_mb"]
+
     countries_list = snakemake.config["countries"]
+    geo_crs = snakemake.config["crs"]["geo_crs"]
+    distance_crs = snakemake.config["crs"]["distance_crs"]
+
+    contended_flag = snakemake.config["build_shape_options"]["contended_flag"]
+    gdp_method = snakemake.config["build_shape_options"]["gdp_method"]
     layer_id = snakemake.config["build_shape_options"]["gadm_layer_id"]
-    update = snakemake.config["build_shape_options"]["update_file"]
-    out_logging = snakemake.config["build_shape_options"]["out_logging"]
-    year = snakemake.config["build_shape_options"]["year"]
     nprocesses = snakemake.config["build_shape_options"]["nprocesses"]
-    contended_flag = snakemake.config["build_shape_options"]["contended_flag"]
-    EEZ_gpkg = snakemake.input["eez"]
+    out_logging = snakemake.config["build_shape_options"]["out_logging"]
+    update = snakemake.config["build_shape_options"]["update_file"]
     worldpop_method = snakemake.config["build_shape_options"]["worldpop_method"]
-    gdp_method = snakemake.config["build_shape_options"]["gdp_method"]
-    geo_crs = snakemake.config["crs"]["geo_crs"]
-    distance_crs = snakemake.config["crs"]["distance_crs"]
+    year = snakemake.config["build_shape_options"]["year"]
+
     nchunks = snakemake.config["build_shape_options"]["nchunks"]
     if nchunks < nprocesses:
         logger.info(f"build_shapes data chunks set to nprocesses {nprocesses}")
@@ -1306,6 +1323,7 @@ def gadm(
         countries_list,
         geo_crs,
         contended_flag,
+        mem_mb,
         layer_id,
         update,
         out_logging,

From e96f3ac39f1379e78aee9d5ffb679a4abc2dc545 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Fri, 21 Apr 2023 01:01:33 +0200
Subject: [PATCH 19/29] compute_geomask_region adjustment

---
 scripts/build_shapes.py | 137 ++++++++++++++++++++++------------------
 1 file changed, 75 insertions(+), 62 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 7ea579546..4fa245468 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -813,77 +813,86 @@ def compute_geomask_region(
         pd.DataFrame(id_to_GADM_ID).set_index(0):
             DataFrame of the mapping from id (from counter) to GADM_ID
     """
-    col_off, row_off, x_axis_len, y_axis_len = window_dimensions
+    try:
+        col_off, row_off, x_axis_len, y_axis_len = [int(i) for i in window_dimensions]
 
-    if windowed:
-        # Declare a transformer with given affine_transform
-        transformer = rasterio.transform.AffineTransformer(affine_transform)
-
-        # Obtain the coordinates of the upper left corner of window
-        window_topleft_longitude, window_topleft_latitude = transformer.xy(
-            row_off, col_off
-        )
-
-        # Obtain the coordinates of the bottom right corner of window
-        window_botright_longitude, window_botright_latitude = transformer.xy(
-            row_off + y_axis_len, col_off + x_axis_len
-        )
+        if windowed:
+            # Declare a transformer with given affine_transform
+            transformer = rasterio.transform.AffineTransformer(affine_transform)
 
-        # Set the current transform to the correct lat and long
-        affine_transform = rasterio.Affine(
-            affine_transform[0],
-            affine_transform[1],
-            window_topleft_longitude,
-            affine_transform[3],
-            affine_transform[4],
-            window_topleft_latitude,
-        )
+            # Obtain the coordinates of the upper left corner of window
+            window_topleft_longitude, window_topleft_latitude = transformer.xy(
+                row_off, col_off
+            )
 
-    # Set an empty numpy array with the dimensions of the country .tif file
-    # np_map_ID will contain an ID for each location (undefined is 0)
-    # ID corresponds to a specific geometry in country_rows
-    np_map_ID = np.zeros((y_axis_len, x_axis_len))
+            # Obtain the coordinates of the bottom right corner of window
+            window_botright_longitude, window_botright_latitude = transformer.xy(
+                row_off + y_axis_len, col_off + x_axis_len
+            )
 
-    # List to contain the mappings of id to GADM_ID
-    id_to_GADM_ID = []
+            # Set the current transform to the correct lat and long
+            affine_transform = rasterio.Affine(
+                affine_transform[0],
+                affine_transform[1],
+                window_topleft_longitude,
+                affine_transform[3],
+                affine_transform[4],
+                window_topleft_latitude,
+            )
 
-    # Loop the country_rows geoDataFrame
-    for i in range(len(country_rows)):
-        # Set the current geometry
-        cur_geometry = country_rows.iloc[i]["geometry"]
+        # Set an empty numpy array with the dimensions of the country .tif file
+        # np_map_ID will contain an ID for each location (undefined is 0)
+        # ID corresponds to a specific geometry in country_rows
+        np_map_ID = np.zeros((y_axis_len, x_axis_len))
+
+        # List to contain the mappings of id to GADM_ID
+        id_to_GADM_ID = []
+
+        # Loop the country_rows geoDataFrame
+        for i in range(len(country_rows)):
+            # Set the current geometry
+            cur_geometry = country_rows.iloc[i]["geometry"]
+
+            # In windowed mode we check if bounds of geometry overlap the window
+            if windowed:
+                latitude_min = cur_geometry.bounds[1]
+                latitude_max = cur_geometry.bounds[3]
+
+                # In the following cases we don't have to continue the loop
+                # If the geometry is above the window
+                if latitude_min > window_topleft_latitude:
+                    continue
+                # If the geometry is below the window
+                if latitude_max < window_botright_latitude:
+                    continue
+
+            # Generate a mask for the specific geometry
+            temp_mask = rasterio.features.geometry_mask(
+                [cur_geometry],
+                (y_axis_len, x_axis_len),
+                transform=affine_transform,
+                all_touched=True,
+                invert=True,
+            )
 
-        # In windowed mode we check if bounds of geometry overlap the window
-        if windowed:
-            latitude_min = cur_geometry.bounds[1]
-            latitude_max = cur_geometry.bounds[3]
-
-            # In the following cases we don't have to continue the loop
-            # If the geometry is above the window
-            if latitude_min > window_topleft_latitude:
-                continue
-            # If the geometry is below the window
-            if latitude_max < window_botright_latitude:
-                continue
-
-        # Generate a mask for the specific geometry
-        temp_mask = rasterio.features.geometry_mask(
-            [cur_geometry],
-            (y_axis_len, x_axis_len),
-            transform=affine_transform,
-            all_touched=True,
-            invert=True,
-        )
+            # Map the values of counter value to np_map_ID
+            np_map_ID[temp_mask] = i + 1
 
-        # Map the values of counter value to np_map_ID
-        np_map_ID[temp_mask] = i + 1
+            # Store the id -> GADM_ID mapping
+            id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]])
 
-        # Store the id -> GADM_ID mapping
-        id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]])
+        if len(id_to_GADM_ID) > 0:
+            id_result = pd.DataFrame(id_to_GADM_ID).set_index(0)
+        else:
+            id_result = pd.DataFrame()
 
-    # Return np_map_ID as type 'H' np.ushort
-    # 'H' -> https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort
-    # This lowers memory usage, note: ID has to be within the range [0,65535]
-    return np_map_ID.astype("H"), pd.DataFrame(id_to_GADM_ID).set_index(0)
+        # Return np_map_ID as type 'H' np.ushort
+        # 'H' -> https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort
+        # This lowers memory usage, note: ID has to be within the range [0,65535]
+        return np_map_ID.astype("H"), id_result
+    except:
+        print(country_rows, affine_transform, window_dimensions, windowed)
+        raise
 
 
 def compute_population(
@@ -1019,6 +1028,10 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
             country_rows, transform, window_dimensions, windowed=True
         )
 
+        # If no values are present in the id_mapping skip the remaining steps
+        if len(id_mapping) == 0:
+            continue
+
         # Calculate the population for each region
         windowed_pop_count = sum_values_using_geomask(
             np_pop_val, np_pop_xy, region_geomask, id_mapping

From 6791d65a3355984c9a6e7102e3ddf8928687740e Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Sun, 23 Apr 2023 12:36:10 +0200
Subject: [PATCH 20/29] Cleanup 1

---
 scripts/build_shapes.py | 55 +----------------------------------------
 1 file changed, 1 insertion(+), 54 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 4fa245468..ade7399c8 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -690,37 +690,10 @@ def _init_process_pop(
     )
 
 
-# Auxiliary function to calculate population data in a parallel way
-def _process_func_pop(gadm_idxs):
-    # get subset by country code
-    df_gadm_subset = df_gadm.loc[gadm_idxs].copy()
-
-    country_sublist = df_gadm_subset["country"].unique()
-
-    for c_code in country_sublist:
-        # get worldpop image
-        WorldPop_inputfile, WorldPop_filename = download_WorldPop(
-            c_code, worldpop_method, year, False, False
-        )
-
-        idxs_country = df_gadm_subset[df_gadm_subset["country"] == c_code].index
-
-        with rasterio.open(WorldPop_inputfile) as src:
-            for i in idxs_country:
-                df_gadm_subset.loc[i, "pop"] = _sum_raster_over_mask(
-                    df_gadm_subset.geometry.loc[i], src
-                )
-
-    return df_gadm_subset
-
-
 def _process_function_population(c_code):
     # get subset by country code
     country_rows = df_gadm.loc[df_gadm["country"] == c_code]
 
-    if out_logging:
-        logger.info("Stage 4 of 5: Calculating population of " + str(c_code))
-
     # Calculate the population for each geometry given in country_rows
     df_pop_count = compute_population(
         country_rows,
@@ -731,13 +704,6 @@ def _process_function_population(c_code):
     return df_pop_count
 
 
-# Auxiliary function to download WorldPop data in a parallel way
-def _process_func_download_pop(c_code):
-    WorldPop_inputfile, WorldPop_filename = download_WorldPop(
-        c_code, worldpop_method, year, False, False
-    )
-
-
 def get_worldpop_features(WorldPop_inputfile):
     """
     Function to extract data from .tif input file
@@ -943,16 +909,6 @@ def compute_population(
         )
 
     else:
-        if out_logging:
-            logger.info(
-                "Stage 4 of 5: compute_population for "
-                + str(country_rows.iloc[0]["country"])
-                + ": Expected size of file readout was "
-                + str(expected_bytes_input_read // 10**6)
-                + " Megabytes. As the limit is "
-                + str(worldpop_byte_limit // 10**6)
-                + " Megabytes switching to windowed approach"
-            )
         # Calculate the population using windows
         df_pop_count = windowed_compute_population(
             country_rows, WorldPop_inputfile, worldpop_byte_limit
@@ -1129,7 +1085,6 @@ def add_population_data(
     out_logging=False,
     mem_read_limit_per_process=1024,
     nprocesses=2,
-    nchunks=2,
     disable_progressbar=False,
 ):
     """
@@ -1184,7 +1139,7 @@ def add_population_data(
             mem_read_limit_per_process,
             out_logging,
         ),
-        "processes": 2,
+        "processes": nprocesses,
     }
     # Spawn processes with the parameters from kwargs
     with mp.get_context("spawn").Pool(**kwargs) as pool:
@@ -1218,7 +1173,6 @@ def gadm(
     out_logging=False,
     year=2020,
     nprocesses=None,
-    nchunks=None,
 ):
     if out_logging:
         logger.info("Stage 3 of 5: Creation GADM GeoDataFrame")
@@ -1248,7 +1202,6 @@ def gadm(
             out_logging,
             mem_read_limit_per_process,
             nprocesses=nprocesses,
-            nchunks=nchunks,
         )
 
     if gdp_method != False:
@@ -1305,11 +1258,6 @@ def gadm(
     worldpop_method = snakemake.config["build_shape_options"]["worldpop_method"]
     year = snakemake.config["build_shape_options"]["year"]
 
-    nchunks = snakemake.config["build_shape_options"]["nchunks"]
-    if nchunks < nprocesses:
-        logger.info(f"build_shapes data chunks set to nprocesses {nprocesses}")
-        nchunks = nprocesses
-
     country_shapes = countries(
         countries_list,
         geo_crs,
@@ -1342,6 +1290,5 @@ def gadm(
         out_logging,
         year,
         nprocesses=nprocesses,
-        nchunks=nchunks,
     )
     save_to_geojson(gadm_shapes, out.gadm_shapes)

From 41edcbbc85dddec7b1bddf4a2abc7ab4855fb521 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Sun, 23 Apr 2023 13:16:19 +0200
Subject: [PATCH 21/29] Contested ISO codes fix for df_gadm

---
 scripts/build_shapes.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index ade7399c8..3282a3f6f 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -1215,11 +1215,9 @@ def gadm(
         )
 
     # renaming 3 letter to 2 letter ISO code before saving GADM file
-    # solves issue: https://github.com/pypsa-meets-earth/pypsa-earth/issues/671
-    df_gadm["GADM_ID"] = (
-        df_gadm["GADM_ID"]
-        .str.split(".")
-        .apply(lambda id: three_2_two_digits_country(id[0]) + "." + ".".join(id[1:]))
+    # In the case of a contested territory in the form 'Z00.00_0', save 'AA.Z00.00_0'
+    df_gadm["GADM_ID"] = df_gadm["country"] + df_gadm["GADM_ID"].apply(
+        lambda x: "." + x if x[0] == "Z" else x[3:]
     )
     df_gadm.set_index("GADM_ID", inplace=True)
     df_gadm["geometry"] = df_gadm["geometry"].map(_simplify_polys)

From 74177b27a8ea3da2c691b5937f280e03654e0fcc Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Sun, 23 Apr 2023 14:19:37 +0200
Subject: [PATCH 22/29] Readability update

---
 scripts/build_shapes.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 3282a3f6f..b0e142656 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -731,9 +731,9 @@ def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions):
         np_pop_valid: array filled with values for each nonzero pixel in the worldpop file
         np_pop_xy: array with [x,y] coordinates of the corresponding nonzero values in np_pop_valid
     """
-    col_off, row_off, width, height = window_dimensions
+    col_offset, row_offset, width, height = window_dimensions
 
-    current_window = Window(col_off, row_off, width, height)
+    current_window = Window(col_offset, row_offset, width, height)
 
     # Open the file using rasterio
     with rasterio.open(WorldPop_inputfile) as src:
@@ -780,7 +780,9 @@ def compute_geomask_region(
             DataFrame of the mapping from id (from counter) to GADM_ID
     """
     try:
-        col_off, row_off, x_axis_len, y_axis_len = [int(i) for i in window_dimensions]
+        col_offset, row_offset, x_axis_len, y_axis_len = [
+            int(i) for i in window_dimensions
+        ]
 
         if windowed:
             # Declare a transformer with given affine_transform
@@ -788,12 +790,12 @@ def compute_geomask_region(
 
             # Obtain the coordinates of the upper left corner of window
             window_topleft_longitude, window_topleft_latitude = transformer.xy(
-                row_off, col_off
+                row_offset, col_offset
             )
 
             # Obtain the coordinates of the bottom right corner of window
             window_botright_longitude, window_botright_latitude = transformer.xy(
-                row_off + y_axis_len, col_off + x_axis_len
+                row_offset + y_axis_len, col_offset + x_axis_len
             )
 
             # Set the current transform to the correct lat and long
@@ -887,7 +889,7 @@ def compute_population(
     expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim
 
     # Introduce a max byte size to avoid overfilling RAM
-    # Ensure worldpop_byte_limit > 883 * 10**6 (minimum memory for 'US')
+    # Ensure worldpop_byte_limit >= 883 * 10**6 (minimum memory for 'US')
     worldpop_byte_limit = max(883, mem_read_limit_per_process) * 10**6
 
     # If the rasterio read will be within byte limit
@@ -948,7 +950,7 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
     #   Hence worldpop_byte_limit has to be greater than 883 MB
 
     # As the window spans the x dimension, set column offset to 0
-    window_col_off = 0
+    window_col_offset = 0
 
     # Calculate the bytes for reading the window using window_x_dim (float32)
     read_block_size = 4 * block_y_dim * window_x_dim
@@ -963,13 +965,11 @@ def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_
 
     # Calculate the y ranges of the blocks to scan the image
     # y_range_start will serve as row offset
-    window_row_off = np.arange(0, worldpop_y_dim, window_y_dim)
-
-    # Calculate the percentage each window is of the image for pbar.update
-    window_percentage_of_image = 1 / len(window_row_off)
+    window_row_offset = np.arange(0, worldpop_y_dim, window_y_dim)
 
-    for row_off in window_row_off:
-        window_dimensions = [window_col_off, row_off, window_x_dim, window_y_dim]
+    # Loop the windows
+    for row_offset in window_row_offset:
+        window_dimensions = [window_col_offset, row_offset, window_x_dim, window_y_dim]
 
         np_pop_val, np_pop_xy = get_worldpop_val_xy(
             WorldPop_inputfile, window_dimensions

From 605c6b8d29f29fad88bdcdd1ff7bc596a3ef5e16 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Sun, 23 Apr 2023 22:46:06 +0200
Subject: [PATCH 23/29] Restructuring for adaptability

---
 Snakefile               |   2 +-
 scripts/build_shapes.py | 473 +++++++++++++++++++---------------------
 2 files changed, 226 insertions(+), 249 deletions(-)

diff --git a/Snakefile b/Snakefile
index dfab7f120..1a19b56a7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -221,7 +221,7 @@ rule build_shapes:
         "benchmarks/" + RDIR + "build_shapes"
     threads: 1
     resources:
-        mem_mb=2048,
+        mem_mb=3096,
     script:
         "scripts/build_shapes.py"
 
diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index b0e142656..b18e20bcd 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -678,45 +678,52 @@ def add_gdp_data(
     return df_gadm
 
 
-def _init_process_pop(
-    df_gadm_, dict_worldpop_file_locations_, mem_read_limit_per_process_, out_logging_
-):
-    global df_gadm, dict_worldpop_file_locations, mem_read_limit_per_process, out_logging
-    df_gadm, dict_worldpop_file_locations, mem_read_limit_per_process, out_logging = (
-        df_gadm_,
+def _init_process_pop(df_gadm_, df_tasks_, dict_worldpop_file_locations_):
+    global df_gadm, df_tasks
+    df_gadm, df_tasks = df_gadm_, df_tasks_
+
+    global dict_worldpop_file_locations, out_logging
+    dict_worldpop_file_locations, out_logging = (
         dict_worldpop_file_locations_,
-        mem_read_limit_per_process_,
-        out_logging_,
+        False,
     )
 
 
-def _process_function_population(c_code):
+def new_process_function_population(row_id):
+    # Get the current task values
+    current_row = df_tasks.iloc[row_id]
+
+    c_code = current_row["c_code"]
+    window_dimensions = current_row["window_dimensions"]
+    transform = current_row["affine_transform"]
+    latlong_topleft = current_row["latlong_coordinate_topleft"]
+    latlong_botright = current_row["latlong_coordinate_botright"]
+
+    # Obtain the inputfile location from dict
+    WorldPop_inputfile = dict_worldpop_file_locations[c_code]
     # get subset by country code
     country_rows = df_gadm.loc[df_gadm["country"] == c_code]
 
-    # Calculate the population for each geometry given in country_rows
-    df_pop_count = compute_population(
-        country_rows,
-        dict_worldpop_file_locations[c_code],
-        mem_read_limit_per_process,
-        out_logging,
+    np_pop_val, np_pop_xy = get_worldpop_val_xy(WorldPop_inputfile, window_dimensions)
+
+    # If no values are present in the current window skip the remaining steps
+    if len(np_pop_val) == 0:
+        return []
+
+    # get the geomask with id mappings
+    region_geomask, id_mapping = compute_geomask_region(
+        country_rows, transform, window_dimensions, latlong_topleft, latlong_botright
     )
-    return df_pop_count
 
+    # If no values are present in the id_mapping skip the remaining steps
+    if len(id_mapping) == 0:
+        return []
 
-def get_worldpop_features(WorldPop_inputfile):
-    """
-    Function to extract data from .tif input file
-    -------
-    Inputs:
-        WorldPop_inputfile: String pointing to location of file
-    --------
-    Outputs:
-        src.meta["transform"]: Representation of the affine transform
-        src.shape: Dimensions of the input image
-    """
-    with rasterio.open(WorldPop_inputfile) as src:
-        return src.meta["transform"], src.shape
+    # Calculate the population for each region
+    windowed_pop_count = sum_values_using_geomask(
+        np_pop_val, np_pop_xy, region_geomask, id_mapping
+    )
+    return windowed_pop_count
 
 
 def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions):
@@ -762,7 +769,7 @@ def get_worldpop_val_xy(WorldPop_inputfile, window_dimensions):
 
 
 def compute_geomask_region(
-    country_rows, affine_transform, window_dimensions, windowed=False
+    country_rows, affine_transform, window_dimensions, latlong_topleft, latlong_botright
 ):
     """
     Function to mask geometries into np_map_ID using an incrementing counter
@@ -776,230 +783,60 @@ def compute_geomask_region(
     Outputs:
         np_map_ID.astype("H"): np_map_ID contains an ID for each location (undefined is 0)
             dimensions are taken from window_dimensions, .astype("H") for memory savings
-        pd.DataFrame(id_to_GADM_ID).set_index(0):
+        id_result:
             DataFrame of the mapping from id (from counter) to GADM_ID
     """
-    try:
-        col_offset, row_offset, x_axis_len, y_axis_len = [
-            int(i) for i in window_dimensions
-        ]
-
-        if windowed:
-            # Declare a transformer with given affine_transform
-            transformer = rasterio.transform.AffineTransformer(affine_transform)
-
-            # Obtain the coordinates of the upper left corner of window
-            window_topleft_longitude, window_topleft_latitude = transformer.xy(
-                row_offset, col_offset
-            )
-
-            # Obtain the coordinates of the bottom right corner of window
-            window_botright_longitude, window_botright_latitude = transformer.xy(
-                row_offset + y_axis_len, col_offset + x_axis_len
-            )
-
-            # Set the current transform to the correct lat and long
-            affine_transform = rasterio.Affine(
-                affine_transform[0],
-                affine_transform[1],
-                window_topleft_longitude,
-                affine_transform[3],
-                affine_transform[4],
-                window_topleft_latitude,
-            )
-
-        # Set an empty numpy array with the dimensions of the country .tif file
-        # np_map_ID will contain an ID for each location (undefined is 0)
-        # ID corresponds to a specific geometry in country_rows
-        np_map_ID = np.zeros((y_axis_len, x_axis_len))
-
-        # List to contain the mappings of id to GADM_ID
-        id_to_GADM_ID = []
-
-        # Loop the country_rows geoDataFrame
-        for i in range(len(country_rows)):
-            # Set the current geometry
-            cur_geometry = country_rows.iloc[i]["geometry"]
-
-            # In windowed mode we check if bounds of geometry overlap the window
-            if windowed:
-                latitude_min = cur_geometry.bounds[1]
-                latitude_max = cur_geometry.bounds[3]
-
-                # In the following cases we don't have to continue the loop
-                # If the geometry is above the window
-                if latitude_min > window_topleft_latitude:
-                    continue
-                # If the geometry is below the window
-                if latitude_max < window_botright_latitude:
-                    continue
-
-            # Generate a mask for the specific geometry
-            temp_mask = rasterio.features.geometry_mask(
-                [cur_geometry],
-                (y_axis_len, x_axis_len),
-                transform=affine_transform,
-                all_touched=True,
-                invert=True,
-            )
-
-            # Map the values of counter value to np_map_ID
-            np_map_ID[temp_mask] = i + 1
-
-            # Store the id -> GADM_ID mapping
-            id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]])
-
-        if len(id_to_GADM_ID) > 0:
-            id_result = pd.DataFrame(id_to_GADM_ID).set_index(0)
-        else:
-            id_result = pd.DataFrame()
-
-        # Return np_map_ID as type 'H' np.ushort
-        # 'H' -> https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort
-        # This lowers memory usage, note: ID has to be within the range [0,65535]
-        return np_map_ID.astype("H"), id_result
-    except:
-        print(country_rows, affine_transform, window_dimensions, windowed)
-        raise
-
-
-def compute_population(
-    country_rows, WorldPop_inputfile, mem_read_limit_per_process, out_logging=False
-):
-    """
-    Function computes the population for the given country rows
-    -------
-    Inputs:
-        country_rows: geoDataFrame filled with geometries and their GADM_ID
-        WorldPop_inputfile: file location of worldpop file
-        out_logging: Boolean to enable logging
-    --------
-    Outputs:
-        df_pop_count: Dataframe with columns
-            - "pop" containing population of GADM_ID region
-            - "GADM_ID"
-    """
-    # Get the features of the worldpop input file
-    transform, worldpop_dim = get_worldpop_features(WorldPop_inputfile)
-
-    worldpop_y_dim, worldpop_x_dim = worldpop_dim
-
-    # Rasterio doesn't support lower than float32 readout
-    # Hence reading the file will take up: nbytes = 4 * y_dim * x_dim
-    expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim
-
-    # Introduce a max byte size to avoid overfilling RAM
-    # Ensure worldpop_byte_limit >= 883 * 10**6 (minimum memory for 'US')
-    worldpop_byte_limit = max(883, mem_read_limit_per_process) * 10**6
-
-    # If the rasterio read will be within byte limit
-    if expected_bytes_input_read < worldpop_byte_limit:
-        # Call functions with input dimensions for window
-        window_dim = [0, 0, worldpop_x_dim, worldpop_y_dim]
-
-        # Get population values and corresponding x,y coords
-        np_pop_val, np_pop_xy = get_worldpop_val_xy(WorldPop_inputfile, window_dim)
-
-        # get the geomask with id mappings
-        country_geomask, id_mapping = compute_geomask_region(
-            country_rows, transform, window_dim
-        )
-
-        # Calculate the population for each region
-        df_pop_count = sum_values_using_geomask(
-            np_pop_val, np_pop_xy, country_geomask, id_mapping
-        )
-
-    else:
-        # Calculate the population using windows
-        df_pop_count = windowed_compute_population(
-            country_rows, WorldPop_inputfile, worldpop_byte_limit
-        )
-
-    return df_pop_count
-
-
-def windowed_compute_population(country_rows, WorldPop_inputfile, worldpop_byte_limit):
-    """
-    Function
-    -------
-    Inputs:
-        country_rows: geoDataFrame filled with geometries and their GADM_ID
-        WorldPop_inputfile: file location of worldpop file
-        worldpop_byte_limit: Memory limit which determines the window size
-    --------
-    Outputs:
-        df_pop_count: Dataframe with columns
-            - "pop" containing population of GADM_ID region
-            - "GADM_ID"
-
-    """
-    # Create a dataframe to store the population data
-    df_pop_count = country_rows.loc[:, ["GADM_ID", "pop"]].copy()
-
-    # Open the file using rasterio
-    with rasterio.open(WorldPop_inputfile) as src:
-        transform = src.meta["transform"]
-        worldpop_y_dim, worldpop_x_dim = src.shape
-        block_y_dim, block_x_dim = src.block_shapes[0]
-
-    # Set the windows x dimension to the input x dimension (width)
-    window_x_dim = worldpop_x_dim
-    # From testing we can assume max x dimension will always fit in memory:
-    #   Largest memory requirement is 'US' at ~882.1 MB = 4 * 512 * window_x_dim
-    #   Hence worldpop_byte_limit has to be greater than 883 MB
-
-    # As the window spans the x dimension, set column offset to 0
-    window_col_offset = 0
+    col_offset, row_offset, x_axis_len, y_axis_len = window_dimensions
 
-    # Calculate the bytes for reading the window using window_x_dim (float32)
-    read_block_size = 4 * block_y_dim * window_x_dim
+    # Set an empty numpy array with the dimensions of the country .tif file
+    # np_map_ID will contain an ID for each location (undefined is 0)
+    # ID corresponds to a specific geometry in country_rows
+    np_map_ID = np.zeros((y_axis_len, x_axis_len))
 
-    # Calculate the amount of blocks that fit into the memory budget
-    # Using the calculated x dimension
-    window_block_count = worldpop_byte_limit // read_block_size
+    # List to contain the mappings of id to GADM_ID
+    id_to_GADM_ID = []
 
-    # Multiply the y_dimension by the amount of blocks in the window
-    # window_y_dim will be height of the window
-    window_y_dim = window_block_count * block_y_dim
+    # Loop the country_rows geoDataFrame
+    for i in range(len(country_rows)):
+        # Set the current geometry
+        cur_geometry = country_rows.iloc[i]["geometry"]
 
-    # Calculate the y ranges of the blocks to scan the image
-    # y_range_start will serve as row offset
-    window_row_offset = np.arange(0, worldpop_y_dim, window_y_dim)
+        # Check if bounds of geometry overlap the window
+        latitude_min = cur_geometry.bounds[1]
+        latitude_max = cur_geometry.bounds[3]
 
-    # Loop the windows
-    for row_offset in window_row_offset:
-        window_dimensions = [window_col_offset, row_offset, window_x_dim, window_y_dim]
-
-        np_pop_val, np_pop_xy = get_worldpop_val_xy(
-            WorldPop_inputfile, window_dimensions
-        )
-
-        # If no values are present in the current window skip the remaining steps
-        if len(np_pop_val) == 0:
+        # In the following cases we don't have to continue the loop
+        # If the geometry is above the window
+        if latitude_min > latlong_topleft[0]:
+            continue
+        # If the geometry is below the window
+        if latitude_max < latlong_botright[0]:
             continue
 
-        # get the geomask with id mappings
-        region_geomask, id_mapping = compute_geomask_region(
-            country_rows, transform, window_dimensions, windowed=True
+        # Generate a mask for the specific geometry
+        temp_mask = rasterio.features.geometry_mask(
+            [cur_geometry],
+            (y_axis_len, x_axis_len),
+            transform=affine_transform,
+            all_touched=True,
+            invert=True,
         )
 
-        # If no values are present in the id_mapping skip the remaining steps
-        if len(id_mapping) == 0:
-            continue
+        # Map the values of counter value to np_map_ID
+        np_map_ID[temp_mask] = i + 1
 
-        # Calculate the population for each region
-        windowed_pop_count = sum_values_using_geomask(
-            np_pop_val, np_pop_xy, region_geomask, id_mapping
-        )
+        # Store the id -> GADM_ID mapping
+        id_to_GADM_ID.append([i + 1, country_rows.iloc[i]["GADM_ID"]])
 
-        # Loop the regions and write population to df_pop_count
-        for i in range(len(windowed_pop_count)):
-            gadm_id, pop_count = windowed_pop_count.iloc[i]
-            # Select the row with the same "GADM_ID" and set the population count
-            df_pop_count.loc[df_pop_count["GADM_ID"] == gadm_id, "pop"] += pop_count
+    if len(id_to_GADM_ID) > 0:
+        id_result = pd.DataFrame(id_to_GADM_ID).set_index(0)
+    else:
+        id_result = pd.DataFrame()
 
-    return df_pop_count
+    # Return np_map_ID as type 'H' np.ushort
+    # 'H' -> https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort
+    # This lowers memory usage, note: ID has to be within the range [0,65535]
+    return np_map_ID.astype("H"), id_result
 
 
 def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
@@ -1076,6 +913,124 @@ def loop_and_extact_val_x_y(
     return np_pop_count
 
 
+def calculate_transform_and_coords_for_window(
+    current_transform, window_dimensions, original_window=False
+):
+    col_offset, row_offset, x_axis_len, y_axis_len = window_dimensions
+
+    # Declare a transformer with given affine_transform
+    transformer = rasterio.transform.AffineTransformer(current_transform)
+
+    # Obtain the coordinates of the upper left corner of window
+    window_topleft_longitude, window_topleft_latitude = transformer.xy(
+        row_offset, col_offset
+    )
+
+    # Obtain the coordinates of the bottom right corner of window
+    window_botright_longitude, window_botright_latitude = transformer.xy(
+        row_offset + y_axis_len, col_offset + x_axis_len
+    )
+
+    if original_window:
+        adjusted_transform = current_transform
+    else:
+        # Set the adjusted transform to the correct lat and long
+        adjusted_transform = rasterio.Affine(
+            current_transform[0],
+            current_transform[1],
+            window_topleft_longitude,
+            current_transform[3],
+            current_transform[4],
+            window_topleft_latitude,
+        )
+
+    coordinate_topleft = [window_topleft_latitude, window_topleft_longitude]
+    coordinate_botright = [window_botright_latitude, window_botright_longitude]
+
+    return [adjusted_transform, coordinate_topleft, coordinate_botright]
+
+
+def generate_df_tasks(c_code, mem_read_limit_per_process, WorldPop_inputfile):
+    """
+    Function to generate a list of tasks based on the memory constraints
+    One task represents a single window of the image
+    -------
+    Inputs:
+        c_code: country code
+        mem_read_limit_per_process: memory limit for src.read() operation
+    --------
+    Outputs:
+        Dataframe of task_list
+    """
+    task_list = []
+
+    # Read out dimensions and transform of image
+    with rasterio.open(WorldPop_inputfile) as src:
+        worldpop_y_dim, worldpop_x_dim = src.shape
+        transform = src.meta["transform"]
+        block_y_dim, block_x_dim = [int(i) for i in src.block_shapes[0]]
+
+    # Rasterio doesn't support lower than float32 readout
+    # Hence reading the file will take up: nbytes = 4 * y_dim * x_dim
+    expected_bytes_input_read = 4 * worldpop_y_dim * worldpop_x_dim
+
+    # Introduce a limit that avoids overfilling RAM during readout
+    # Ensure worldpop_byte_limit >= 883 * 10**6 (minimum memory for 'US')
+    worldpop_byte_limit = max(883, mem_read_limit_per_process) * 10**6
+
+    if expected_bytes_input_read < worldpop_byte_limit:
+        # If the rasterio read will be within byte limit
+        bool_original_window = True
+        # Set the window to entire image dimensions
+        current_window = [0, 0, worldpop_x_dim, worldpop_y_dim]
+        # Get latlong coordinates of window
+        transform_and_coords = calculate_transform_and_coords_for_window(
+            transform, current_window, bool_original_window
+        )
+        task_list.append(
+            [c_code, current_window, bool_original_window] + transform_and_coords
+        )
+    else:
+        # Reading operation has to be split up into multiple windows
+        bool_original_window = False
+        # Set the windows x dimension to the input x dimension (width)
+        window_x_dim = worldpop_x_dim
+        # From testing we can assume max x dimension will always fit in memory:
+        #   Largest memory requirement is 'US' at ~882.1 MB = 4 * 512 * window_x_dim
+        #   Hence worldpop_byte_limit has to be greater than 883 MB
+
+        # As the window spans the x dimension, set column offset to 0
+        window_col_offset = 0
+
+        # Calculate the bytes for reading the window using window_x_dim (float32)
+        read_block_size = 4 * block_y_dim * window_x_dim
+
+        # Calculate the amount of blocks that fit into the memory budget
+        # Using the calculated x dimension
+        window_block_count = int(worldpop_byte_limit // read_block_size)
+
+        # Multiply the y_dimension by the amount of blocks in the window
+        # window_y_dim will be height of the window
+        window_y_dim = window_block_count * block_y_dim
+
+        # Calculate row offsets for all windows
+        window_row_offset = np.arange(0, worldpop_y_dim, window_y_dim)
+
+        # Loop the windows and add task for each one to task_list
+        for row_offset in window_row_offset:
+            current_window = [window_col_offset, row_offset, window_x_dim, window_y_dim]
+
+            transform_and_coords = calculate_transform_and_coords_for_window(
+                transform, current_window, bool_original_window
+            )
+
+            task_list.append(
+                [c_code, current_window, bool_original_window] + transform_and_coords
+            )
+
+    return pd.DataFrame(task_list)
+
+
 def add_population_data(
     df_gadm,
     country_codes,
@@ -1109,6 +1064,9 @@ def add_population_data(
     # Initialize new dict to hold worldpop_inputfile strings
     dict_worldpop_file_locations = {}
 
+    # Initialize new dataframe to hold tasks for processing
+    df_tasks = pd.DataFrame()
+
     if out_logging:
         logger.info(
             "Stage 3 of 5: Download WorldPop datasets, method: " + str(worldpop_method)
@@ -1116,7 +1074,7 @@ def add_population_data(
 
     tqdm_kwargs_download = dict(
         ascii=False,
-        desc="Downloading worldpop file per country",
+        desc="Downloading worldpop file per country and assigning tasks",
     )
     with tqdm(total=len(country_codes), **tqdm_kwargs_download) as pbar:
         for c_code in country_codes:
@@ -1125,31 +1083,50 @@ def add_population_data(
                 c_code, worldpop_method, year, update, out_logging
             )
             dict_worldpop_file_locations[c_code] = WorldPop_inputfile
+
+            df_new_tasks = generate_df_tasks(
+                c_code, mem_read_limit_per_process, WorldPop_inputfile
+            )
+
+            df_tasks = pd.concat([df_tasks, df_new_tasks], ignore_index=True)
+
             pbar.update(1)
 
+    df_tasks.columns = [
+        "c_code",
+        "window_dimensions",
+        "is_original_window",
+        "affine_transform",
+        "latlong_coordinate_topleft",
+        "latlong_coordinate_botright",
+    ]
+
     if out_logging:
         logger.info("Stage 4 of 5: Add population data to GADM GeoDataFrame")
 
+    if out_logging:
+        logger.info("Stage 4 of 5: Starting multiprocessing")
+
+    tqdm_kwargs_compute = dict(
+        ascii=False, desc="Compute population per window", unit=" window"
+    )
     lock = mp.Lock()
     kwargs = {
         "initializer": _init_process_pop,
         "initargs": (
             df_gadm,
+            df_tasks,
             dict_worldpop_file_locations,
-            mem_read_limit_per_process,
-            out_logging,
         ),
         "processes": nprocesses,
     }
     # Spawn processes with the parameters from kwargs
     with mp.get_context("spawn").Pool(**kwargs) as pool:
         # Create a progress bar
-        with tqdm(
-            total=len(country_codes), desc="Compute population per country"
-        ) as pbar:
+        with tqdm(total=len(df_tasks), **tqdm_kwargs_compute) as pbar:
             # Give the pool a workload
             for df_pop_count in pool.imap_unordered(
-                _process_function_population, country_codes
+                new_process_function_population, range(len(df_tasks))
             ):
                 # Acquire the lock before accessing df_gadm
                 with lock:
@@ -1157,7 +1134,7 @@ def add_population_data(
                     for i in range(len(df_pop_count)):
                         gadm_id, pop_count = df_pop_count.iloc[i]
                         # Select the row with the same "GADM_ID" and set the population count
-                        df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] = pop_count
+                        df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] += pop_count
                 pbar.update(1)
 
 

From 0e168daf4bd1f431175800d2ec20c69eef765507 Mon Sep 17 00:00:00 2001
From: GridGrapher <127969728+GridGrapher@users.noreply.github.com>
Date: Wed, 26 Apr 2023 00:53:02 +0200
Subject: [PATCH 24/29] Docstrings and comments

---
 scripts/build_shapes.py | 79 +++++++++++++++++++++++++++++------------
 1 file changed, 57 insertions(+), 22 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 6acf4c169..7537a4a7d 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -682,14 +682,23 @@ def _init_process_pop(df_gadm_, df_tasks_, dict_worldpop_file_locations_):
     global df_gadm, df_tasks
     df_gadm, df_tasks = df_gadm_, df_tasks_
 
-    global dict_worldpop_file_locations, out_logging
-    dict_worldpop_file_locations, out_logging = (
-        dict_worldpop_file_locations_,
-        False,
-    )
+    global dict_worldpop_file_locations
+    dict_worldpop_file_locations = dict_worldpop_file_locations_
 
 
-def new_process_function_population(row_id):
+def process_function_population(row_id):
+    """
+    Function that reads the task from df_tasks and executes all the methods
+    to obtain population values for the specified region
+    -------
+    Inputs:
+        row_id: integer which indicates a specific row of df_tasks
+    --------
+    Outputs:
+        windowed_pop_count: Dataframe containing "GADM_ID" and "pop" columns
+        It represents the amount of population per region (GADM_ID),
+        for the settings given by the row in df_tasks
+    """
     # Get the current task values
     current_row = df_tasks.iloc[row_id]
 
@@ -776,9 +785,10 @@ def compute_geomask_region(
     -------
     Inputs:
         country_rows: geoDataFrame filled with geometries and their GADM_ID
-        affine_transform: affine transform of current image
+        affine_transform: affine transform of current window
         window_dimensions: dimensions of window used when reading file
-        windowed: True if called during windowed processing (default = False)
+        latlong_topleft: [latitude, longitude] of top left corner of the window
+        latlong_botright: [latitude, longitude] of bottom right corner of the window
     --------
     Outputs:
         np_map_ID.astype("H"): np_map_ID contains an ID for each location (undefined is 0)
@@ -801,11 +811,11 @@ def compute_geomask_region(
         # Set the current geometry
         cur_geometry = country_rows.iloc[i]["geometry"]
 
-        # Check if bounds of geometry overlap the window
         latitude_min = cur_geometry.bounds[1]
         latitude_max = cur_geometry.bounds[3]
 
-        # In the following cases we don't have to continue the loop
+        # Check if bounds of geometry overlap the window
+        # In the following cases we can skip the rest of the loop
         # If the geometry is above the window
         if latitude_min > latlong_topleft[0]:
             continue
@@ -841,13 +851,15 @@ def compute_geomask_region(
 
 def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
     """
-    Function
+    Function that sums all the population values in np_pop_val into the correct GADM_ID
+    It uses np_pop_xy to access the key stored in region_geomask[x][y]
+    The relation of this key to GADM_ID is stored in id_mapping
     -------
     Inputs:
         np_pop_val: array filled with values for each nonzero pixel in the worldpop file
         np_pop_xy: array with [x,y] coordinates of the corresponding nonzero values in np_pop_valid
-        region_geomask:
-        id_mapping:
+        region_geomask: array with dimensions of window, values are keys that map to GADM_ID using id_mapping
+        id_mapping: Dataframe that contains mappings of region_geomask values to GADM_IDs
     --------
     Outputs:
         df_pop_count: Dataframe with columns
@@ -870,6 +882,7 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
     # Declare an array to contain population counts
     np_pop_count = np.zeros(len(id_mapping) + 1)
 
+    # Calculate population count of region using a numba njit compiled function
     np_pop_count = loop_and_extact_val_x_y(
         np_pop_count, np_pop_val, np_pop_xy, region_geomask, dict_id
     )
@@ -886,14 +899,16 @@ def loop_and_extact_val_x_y(
     np_pop_count, np_pop_val, np_pop_xy, region_geomask, dict_id
 ):
     """
-    Function
+    Function that will be compiled using @njit (numba)
+    It takes all the population values from np_pop_val and stores them in np_pop_count
+    where each location in np_pop_count is mapped to a GADM_ID through dict_id (id_mapping by extension)
     -------
     Inputs:
-        np_pop_count:
-        np_pop_val:
-        np_pop_xy:
-        region_geomask:
-        dict_id:
+        np_pop_count: np.zeros array, which will store population counts
+        np_pop_val: array filled with values for each nonzero pixel in the worldpop file
+        np_pop_xy: array with [x,y] coordinates of the corresponding nonzero values in np_pop_valid
+        region_geomask: array with dimensions of window, values are keys that map to GADM_ID using id_mapping
+        dict_id: numba typed.dict containing id_mapping.index -> location in np_pop_count
     --------
     Outputs:
         np_pop_count: np.array containing population counts
@@ -916,9 +931,26 @@ def loop_and_extact_val_x_y(
 def calculate_transform_and_coords_for_window(
     current_transform, window_dimensions, original_window=False
 ):
+    """
+    Function which calculates the [lat,long] corners of the window given window_dimensions,
+    if not(original_window) it also changes the affine transform to match the window
+    -------
+    Inputs:
+        current_transform: affine transform of source image
+        window_dimensions: dimensions of window used when reading file
+        original_window: boolean to track if window covers entire country
+    --------
+    Outputs:
+        A list of: [
+            adjusted_transform: affine transform adjusted to window
+            coordinate_topleft: [latitude, longitude] of top left corner of the window
+            coordinate_botright: [latitude, longitude] of bottom right corner of the window
+        ]
+
+    """
     col_offset, row_offset, x_axis_len, y_axis_len = window_dimensions
 
-    # Declare a transformer with given affine_transform
+    # Declare a affine transformer with given current_transform
     transformer = rasterio.transform.AffineTransformer(current_transform)
 
     # Obtain the coordinates of the upper left corner of window
@@ -932,6 +964,7 @@ def calculate_transform_and_coords_for_window(
     )
 
     if original_window:
+        # If the window covers the entire country then use original affine transform
         adjusted_transform = current_transform
     else:
         # Set the adjusted transform to the correct lat and long
@@ -958,6 +991,7 @@ def generate_df_tasks(c_code, mem_read_limit_per_process, WorldPop_inputfile):
     Inputs:
         c_code: country code
         mem_read_limit_per_process: memory limit for src.read() operation
+        WorldPop_inputfile: file location of worldpop file
     --------
     Outputs:
         Dataframe of task_list
@@ -1002,7 +1036,7 @@ def generate_df_tasks(c_code, mem_read_limit_per_process, WorldPop_inputfile):
         # As the window spans the x dimension, set column offset to 0
         window_col_offset = 0
 
-        # Calculate the bytes for reading the window using window_x_dim (float32)
+        # Calculate the bytes for reading the window using window_x_dim (readout is float32)
         read_block_size = 4 * block_y_dim * window_x_dim
 
         # Calculate the amount of blocks that fit into the memory budget
@@ -1084,6 +1118,7 @@ def add_population_data(
             )
             dict_worldpop_file_locations[c_code] = WorldPop_inputfile
 
+            # Given the memory constraint, generate a task for all windows to cover the country (c_code)
             df_new_tasks = generate_df_tasks(
                 c_code, mem_read_limit_per_process, WorldPop_inputfile
             )
@@ -1126,7 +1161,7 @@ def add_population_data(
         with tqdm(total=len(df_tasks), **tqdm_kwargs_compute) as pbar:
             # Give the pool a workload
             for df_pop_count in pool.imap_unordered(
-                new_process_function_population, range(len(df_tasks))
+                process_function_population, range(len(df_tasks))
             ):
                 # Acquire the lock before accessing df_gadm
                 with lock:

From a991c7018d5fec08fb42f48452eb33f64fe08f81 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 14 Aug 2023 11:23:14 +0000
Subject: [PATCH 25/29] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 scripts/build_shapes.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 3f20a141d..32bf83e22 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -714,7 +714,8 @@ def _init_process_pop(df_gadm_, df_tasks_, dict_worldpop_file_locations_):
 
 def process_function_population(row_id):
     """
-    Function that reads the task from df_tasks and executes all the methods
+    Function that reads the task from df_tasks and executes all the methods.
+
     to obtain population values for the specified region
     -------
     Inputs:
@@ -877,8 +878,9 @@ def compute_geomask_region(
 
 def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
     """
-    Function that sums all the population values in np_pop_val into the correct GADM_ID
-    It uses np_pop_xy to access the key stored in region_geomask[x][y]
+    Function that sums all the population values in np_pop_val into the correct
+    GADM_ID It uses np_pop_xy to access the key stored in region_geomask[x][y]
+
     The relation of this key to GADM_ID is stored in id_mapping
     -------
     Inputs:
@@ -891,7 +893,6 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
         df_pop_count: Dataframe with columns
             - "GADM_ID"
             - "pop" containing population of GADM_ID region
-
     """
     # Initialize a dictionary
     dict_id = Dict.empty(
@@ -925,8 +926,9 @@ def loop_and_extact_val_x_y(
     np_pop_count, np_pop_val, np_pop_xy, region_geomask, dict_id
 ):
     """
-    Function that will be compiled using @njit (numba)
-    It takes all the population values from np_pop_val and stores them in np_pop_count
+    Function that will be compiled using @njit (numba) It takes all the
+    population values from np_pop_val and stores them in np_pop_count.
+
     where each location in np_pop_count is mapped to a GADM_ID through dict_id (id_mapping by extension)
     -------
     Inputs:
@@ -958,7 +960,9 @@ def calculate_transform_and_coords_for_window(
     current_transform, window_dimensions, original_window=False
 ):
     """
-    Function which calculates the [lat,long] corners of the window given window_dimensions,
+    Function which calculates the [lat,long] corners of the window given
+    window_dimensions,
+
     if not(original_window) it also changes the affine transform to match the window
     -------
     Inputs:
@@ -972,7 +976,6 @@ def calculate_transform_and_coords_for_window(
             coordinate_topleft: [latitude, longitude] of top left corner of the window
             coordinate_botright: [latitude, longitude] of bottom right corner of the window
         ]
-
     """
     col_offset, row_offset, x_axis_len, y_axis_len = window_dimensions
 
@@ -1011,7 +1014,8 @@ def calculate_transform_and_coords_for_window(
 
 def generate_df_tasks(c_code, mem_read_limit_per_process, WorldPop_inputfile):
     """
-    Function to generate a list of tasks based on the memory constraints
+    Function to generate a list of tasks based on the memory constraints.
+
     One task represents a single window of the image
     -------
     Inputs:

From 62364678555d0057e6f3ebd83125a28122a82963 Mon Sep 17 00:00:00 2001
From: davide-f <fioritidavidesubs@gmail.com>
Date: Tue, 15 Aug 2023 18:38:21 +0200
Subject: [PATCH 26/29] Minor improvements to code and documentation

---
 config.default.yaml                      |  1 -
 config.tutorial.yaml                     |  1 -
 doc/configtables/build_shape_options.csv |  1 -
 scripts/build_shapes.py                  | 20 +++++++++++++++++---
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/config.default.yaml b/config.default.yaml
index 40a00662d..08a9ee890 100644
--- a/config.default.yaml
+++ b/config.default.yaml
@@ -90,7 +90,6 @@ build_shape_options:
   out_logging: true  # When true, logging is printed to console
   year: 2020  # reference year used to derive shapes, info on population and info on GDP
   nprocesses: 3  # number of processes to be used in build_shapes
-  nchunks: 3  # number of data chuncks for build_shapes. If not specified or smaller than nprocesses, specified as nprocesses
   worldpop_method: "standard"  # "standard" pulls from web 1kmx1km raster, "api" pulls from API 100mx100m raster, false (not "false") no pop addition to shape which is useful when generating only cutout
   gdp_method: "standard"  # "standard" pulls from web 1x1km raster, false (not "false") no gdp addition to shape which useful when generating only cutout
   contended_flag: "set_by_country" # "set_by_country" assigns the contended areas to the countries according to the GADM database, "drop" drops these contended areas from the model
diff --git a/config.tutorial.yaml b/config.tutorial.yaml
index 2115c00df..20f154200 100644
--- a/config.tutorial.yaml
+++ b/config.tutorial.yaml
@@ -104,7 +104,6 @@ build_shape_options:
   out_logging: true  # When true, logging is printed to console
   year: 2020  # reference year used to derive shapes, info on population and info on GDP
   nprocesses: 2  # number of processes to be used in build_shapes
-  nchunks: 2  # number of data chuncks for build_shapes. If not specified or smaller than nprocesses, specified as nprocesses
   worldpop_method: "standard"  # "standard" pulls from web 1kmx1km raster, "api" pulls from API 100mx100m raster, false (not "false") no pop addition to shape which is useful when generating only cutout
   gdp_method: "standard"  # "standard" pulls from web 1x1km raster, false (not "false") no gdp addition to shape which useful when generating only cutout
   contended_flag: "set_by_country" # "set_by_country" assigns the contended areas to the countries according to the GADM database, "drop" drops these contended areas from the model
diff --git a/doc/configtables/build_shape_options.csv b/doc/configtables/build_shape_options.csv
index 446dab8bd..78df78713 100644
--- a/doc/configtables/build_shape_options.csv
+++ b/doc/configtables/build_shape_options.csv
@@ -4,7 +4,6 @@ update_file, bool, "{True, False}", "True: all input files are downloaded again
 out_logging, bool, "{True, False}", "True: Logging is printed in the console."
 year,, "past year; e.g. YYYY", "Reference year used to derive shapes, info on population and info on GDP."
 nprocesses, int,, "Number of processes to be used in build_shapes."
-nchunks, int,, "Number of data chuncks for build_shapes. If not specified or smaller than nprocesses, specified as nprocesses."
 worldpop_method,, "{""standard"", ""api"", false}", "Specifies how population is added to every shape: ""standard"" pulls from web 1kmx1km raster; ""api"" pulls from API 100mx100m raster; false (not ""false"") no population addition to shape. This is useful when generating only cutout."
 gdp_method,, "{""standard"", false}", "Specifies how GDP is added to every shape: ""standard"" pulls from web 1x1km raster; false (not ""false"") no gdp addition to shape. This is useful when generating only cutout."
 contended_flag,, "{""set_by_country"", ""drop""}", "Specifies what to do with contended countries: ""set_by_country"" assigns the contended areas to the countries according to the GADM database; ""drop"" drops the contended areas from the model."
diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 32bf83e22..38b17c661 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -941,7 +941,6 @@ def loop_and_extact_val_x_y(
     Outputs:
         np_pop_count: np.array containing population counts
     """
-
     # Loop the population data
     for i in range(len(np_pop_val)):
         cur_value = np_pop_val[i]
@@ -1108,6 +1107,19 @@ def add_population_data(
 ):
     """
     Function to add population data to arbitrary number of shapes in a country.
+    It loads data from WorldPop raster files where each pixel represents the population in that square region.
+    Each square polygon (or pixel) is then mapped into the corresponding GADM shape.
+    Then the population in a GADM shape is identified by summing over all pixels mapped to that region.
+    
+    This is performed with an iterative approach:
+    1. All necessary WorldPop data tiff file are downloaded
+    2. The so-called windows are created to handle RAM limitations related to large WorldPop files.
+       Large WorldPop files require significant RAM to handle, which may not be available,
+       hence, the entire activity is decomposed into multiple windows (or tasks).
+       Each window represents a subset of a raster file on which the following algorithm is applied.
+       Note: when enough RAM is available only a window is created for efficiency purposes.
+    3. Execute all tasks by summing the values of the pixels mapped into each GADM shape.
+       Parallelization applies in this task.
 
     Inputs:
     -------
@@ -1193,14 +1205,16 @@ def add_population_data(
             for df_pop_count in pool.imap_unordered(
                 process_function_population, range(len(df_tasks))
             ):
-                # Acquire the lock before accessing df_gadm
+                # Acquire the lock before accessing df_gadm and pbar
                 with lock:
                     # Loop the regions and write population to df_gadm
                     for i in range(len(df_pop_count)):
                         gadm_id, pop_count = df_pop_count.iloc[i]
                         # Select the row with the same "GADM_ID" and set the population count
                         df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] += pop_count
-                pbar.update(1)
+                    
+                    # update bar
+                    pbar.update(1)
 
 
 def gadm(

From b3074316e870426ad08c353af26e345552e7840e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 15 Aug 2023 17:07:19 +0000
Subject: [PATCH 27/29] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 scripts/build_shapes.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 38b17c661..52a211a24 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -1107,10 +1107,11 @@ def add_population_data(
 ):
     """
     Function to add population data to arbitrary number of shapes in a country.
-    It loads data from WorldPop raster files where each pixel represents the population in that square region.
-    Each square polygon (or pixel) is then mapped into the corresponding GADM shape.
-    Then the population in a GADM shape is identified by summing over all pixels mapped to that region.
-    
+    It loads data from WorldPop raster files where each pixel represents the
+    population in that square region. Each square polygon (or pixel) is then
+    mapped into the corresponding GADM shape. Then the population in a GADM
+    shape is identified by summing over all pixels mapped to that region.
+
     This is performed with an iterative approach:
     1. All necessary WorldPop data tiff file are downloaded
     2. The so-called windows are created to handle RAM limitations related to large WorldPop files.
@@ -1212,7 +1213,7 @@ def add_population_data(
                         gadm_id, pop_count = df_pop_count.iloc[i]
                         # Select the row with the same "GADM_ID" and set the population count
                         df_gadm.loc[df_gadm["GADM_ID"] == gadm_id, "pop"] += pop_count
-                    
+
                     # update bar
                     pbar.update(1)
 

From b3953fea7b3cf45e7ef509d638304715cd5b89fd Mon Sep 17 00:00:00 2001
From: davide-f <fioritidavidesubs@gmail.com>
Date: Tue, 15 Aug 2023 20:55:04 +0200
Subject: [PATCH 28/29] Fix kosovo and spelling

---
 scripts/build_shapes.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py
index 52a211a24..84963764e 100644
--- a/scripts/build_shapes.py
+++ b/scripts/build_shapes.py
@@ -461,10 +461,10 @@ def download_WorldPop_standard(
     """
 
     if country_code == "XK":
-        WorldPop_filename = f"srb_ppp_{year}_UNadj_constrained.tif"
+        WorldPop_filename = f"kos_ppp_{year}_constrained.tif"
         WorldPop_urls = [
-            f"https://data.worldpop.org/GIS/Population/Global_2000_2020_Constrained/2020/BSGM/SRB/{WorldPop_filename}",
-            f"https://data.worldpop.org/GIS/Population/Global_2000_2020_Constrained/2020/maxar_v1/SRB/{WorldPop_filename}",
+            f"https://data.worldpop.org/GIS/Population/Global_2000_2020_Constrained/2020/BSGM/KOS/{WorldPop_filename}",
+            f"https://data.worldpop.org/GIS/Population/Global_2000_2020_Constrained/2020/maxar_v1/KOS/{WorldPop_filename}",
         ]
     else:
         WorldPop_filename = f"{two_2_three_digits_country(country_code).lower()}_ppp_{year}_UNadj_constrained.tif"
@@ -901,7 +901,7 @@ def sum_values_using_geomask(np_pop_val, np_pop_xy, region_geomask, id_mapping):
     )
     dict_id[0] = 0
     counter = 1
-    # Loop over ip mapping and add indicies to the dictionary
+    # Loop over ip mapping and add indices to the dictionary
     for ID_index in np.array(id_mapping.index):
         dict_id[ID_index] = counter
         counter += 1

From 9ddb40dfedd465729f5cb321000157f67e479d9c Mon Sep 17 00:00:00 2001
From: Davide Fioriti <fioritidavidesubs@gmail.com>
Date: Sat, 23 Dec 2023 17:36:06 +0100
Subject: [PATCH 29/29] Fix max snakemake version

---
 envs/environment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/envs/environment.yaml b/envs/environment.yaml
index 491e29052..b65376cb7 100644
--- a/envs/environment.yaml
+++ b/envs/environment.yaml
@@ -23,7 +23,7 @@ dependencies:
 - xlrd
 - openpyxl
 - seaborn
-- snakemake-minimal
+- snakemake-minimal<8
 - memory_profiler
 - ruamel.yaml<=0.17.26
 - pytables