Aggregate statistics by province

burggraaff · Feb 20, 2024 · 4121831 · 4121831
1 parent 2ca8c69
commit 4121831
Show file tree

Hide file tree

Showing 4 changed files with 86 additions and 9 deletions.
diff --git a/fpcup/__init__.py b/fpcup/__init__.py
@@ -1,3 +1,3 @@
-from . import constants, io, plotting, settings
+from . import analysis, constants, io, plotting, settings
 from . import agro, crop, site, soil, weather
 from .model import run_pcse_single, run_pcse_ensemble, run_pcse_ensemble_parallel
diff --git a/fpcup/analysis.py b/fpcup/analysis.py
@@ -0,0 +1,20 @@
+"""
+Functions for analysis
+"""
+import numpy as np
+import pandas as pd
+import geopandas as gpd
+
+from ._typing import Callable
+
+def weighted_mean_for_DF(data: pd.DataFrame, *, weightby: str="area") -> Callable:
+    """
+    Generate a weighted mean function for the given dataframe, to be used in .agg.
+    Example:
+        weight_by_area = weighted_mean_for_DF(summary)
+        weighted_average_yield = summary.agg(wm_twso=("TWSO", weight_by_area))
+    """
+    def weighted_mean(x):
+        return np.average(x, weights=data.loc[x.index, weightby])
+
+    return weighted_mean
diff --git a/fpcup/province.py b/fpcup/province.py
@@ -4,10 +4,10 @@
 
 import geopandas as gpd
 import numpy as np
-from pandas import Series
+from pandas import DataFrame, Series
 from tqdm import tqdm
 
-from ._typing import Iterable
+from ._typing import Iterable, Optional
 from .constants import CRS_AMERSFOORT
 from .settings import DEFAULT_DATA
 
@@ -16,14 +16,14 @@
 nl_boundary = nl.boundary
 
 # Load the provinces
-provinces = gpd.read_file(DEFAULT_DATA/"NL_provinces.geojson")
-provinces_coarse = gpd.read_file(DEFAULT_DATA/"NL_provinces_coarse.geojson")
-province_names = list(provinces["naamOfficieel"]) + ["Friesland"]
+_provinces = gpd.read_file(DEFAULT_DATA/"NL_provinces.geojson")
+_provinces_coarse = gpd.read_file(DEFAULT_DATA/"NL_provinces_coarse.geojson")
+province_names = list(_provinces["naamOfficieel"]) + ["Friesland"]
 
 # Access individual provinces using a dictionary, e.g. province_boundary["Zuid-Holland"]
-province_area = {name: poly for name, poly in zip(provinces["naamOfficieel"], provinces["geometry"])}
-province_boundary = {name: gpd.GeoSeries(outline) for name, outline in zip(provinces["naamOfficieel"], provinces.boundary)}
-province_coarse = {name: poly for name, poly in zip(provinces_coarse["naamOfficieel"], provinces_coarse["geometry"])}
+province_area = {name: poly for name, poly in zip(_provinces["naamOfficieel"], _provinces["geometry"])}
+province_boundary = {name: gpd.GeoSeries(outline) for name, outline in zip(_provinces["naamOfficieel"], _provinces.boundary)}
+province_coarse = {name: poly for name, poly in zip(_provinces_coarse["naamOfficieel"], _provinces_coarse["geometry"])}
 
 # Add an alias for Friesland/Fryslân
 province_area["Friesland"] = province_area["Fryslân"]
@@ -92,3 +92,31 @@ def add_provinces(data: gpd.GeoDataFrame, *, new_column: str="province", provinc
 
     # Add the series to the dataframe
     data[new_column] = province_list
+
+def add_province_geometry(data: DataFrame, which: str="area", *, column_name: Optional[str]=None, crs: str=CRS_AMERSFOORT) -> gpd.GeoDataFrame:
+    """
+    Add a column with province geometry to a DataFrame with a province name column/index.
+    """
+    # Remove entries that are not in the province list
+    pass
+
+    # Use the index if no column was provided
+    if column_name is None:
+        column = data.index.to_series()
+    else:
+        column = data[column_name]
+
+    # Apply the dictionary mapping
+    geometry = column.map(province_area)
+    data_new = gpd.GeoDataFrame(data, geometry=geometry, crs=crs)
+
+    # Change to outline if desired
+    # (Doing this at the start gives an error)
+    if which.lower() == "area":
+        pass
+    elif which.lower() == "outline":
+        data_new["geometry"] = data_new.boundary
+    else:
+        raise ValueError(f"Cannot add geometries of type `{which}`")
+
+    return data_new
diff --git a/plot_results.py b/plot_results.py
@@ -61,6 +61,35 @@
 if args.verbose:
     print(f"Saved batch results plot to {filename_summary.absolute()}")
 
+# Aggregate results by province
+weighted_mean = fpcup.analysis.weighted_mean_for_DF(summary)
+byprovince = summary.groupby("province")
+
+keys_to_aggregate = ["DVS", "LAIMAX", "TWSO", "CTRAT", "CEVST", "RD", "DOS", "DOE", "DOM"]
+
+# Calculate the mean per province of several variables, weighted by plot area if possible
+keys_to_average = ["DVS", "LAIMAX", "TWSO", "CTRAT", "CEVST", "RD"]
+try:
+    byprovince_mean = byprovince[keys_to_average].agg(weighted_mean)
+# Use a normal mean if there is no area information available
+except KeyError:
+    byprovince_mean = byprovince[keys_to_average].mean()
+    filename_means = results_dir / f"WOFOST_{tag}-mean.csv"
+    if args.verbose:
+        print("Could not calculate weighted means because there is no 'area' column -- defaulting to a regular mean")
+else:  # This runs if the original `try` block succeeded
+    filename_means = results_dir / f"WOFOST_{tag}-weighted-mean.csv"
+    if args.verbose:
+        print("Calculated weighted means")
+
+byprovince_mean.to_csv(filename_means)
+if args.verbose:
+    print(f"Saved aggregate (weighted) mean columns to {filename_means.absolute()}")
+
+# Add geometry and plot the results
+byprovince_mean = fpcup.province.add_province_geometry(byprovince_mean, "area")
+byprovince_mean.plot("LAIMAX")
+
 # Space between summary and outputs sections
 if args.verbose:
     print()