Commit 4e73a20
manandraj20 committed Nov 12, 2024
2 parents d86472b + cf53f4c
Showing 17 changed files with 597 additions and 23 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,31 @@
name: Python CI

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install .[dev]

      - name: Run tests
        run: |
          pytest tests/test.py
Binary file removed dist/dp_epidemiology-0.0.7-py3-none-any.whl
Binary file removed dist/dp_epidemiology-0.0.7.tar.gz
Binary file added dist/dp_epidemiology-0.0.8-py3-none-any.whl
Binary file added dist/dp_epidemiology-0.0.8.tar.gz
Binary file modified docs/requirements.txt
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "DP_epidemiology"
version = "0.0.7"
version = "0.0.8"

dependencies = [
"pandas>=2.1.4",
Expand All @@ -15,6 +15,8 @@ dependencies = [
"plotly",
"dash",
"nbformat",
"scipy",
"matplotlib"
]

authors = [
Binary file modified src/DP_epidemiology/__pycache__/contact_matrix.cpython-310.pyc
(two additional modified binary files not shown)
Binary file modified src/DP_epidemiology/__pycache__/utilities.cpython-310.pyc
Binary file modified src/DP_epidemiology/__pycache__/viz.cpython-310.pyc
49 changes: 47 additions & 2 deletions src/DP_epidemiology/mobility_analyzer.py
@@ -3,14 +3,15 @@
import sys
import os
from datetime import datetime
import scipy.stats as stats
import opendp.prelude as dp

dp.enable_features("contrib", "floating-point", "honest-but-curious")

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from DP_epidemiology.utilities import *

-def mobility_analyzer(df: pd.DataFrame, start_date: datetime, end_date: datetime, city: str, epsilon: float):
+def mobility_analyzer_airline(df: pd.DataFrame, start_date: datetime, end_date: datetime, city: str, epsilon: float):
    """Compute a differentially private weekly series of airline transactions for one city."""
    bounds = (0, 600)
    upper_bound = 600
@@ -40,4 +41,48 @@ def mobility_analyzer(df:pd.DataFrame,start_date:datetime,end_date:datetime,city
        >> make_private_sum_by(transaction_data_col, groupby_col, bounds, scale)
    )

-    return analyzer(new_df)
\ No newline at end of file
+    return analyzer(new_df)

def mobility_analyzer(df: pd.DataFrame, start_date: datetime, end_date: datetime, city: str, category: str, epsilon: float):
    """Compute a differentially private weekly transaction series for one city and merchant super-category."""
    bounds = (0, 600)
    upper_bound = 600
    transaction_data_col = "nb_transactions"
    groupby_col = "date"

    city_col = "city"
    time_col = "date"
    merch_category_col = "merch_super_category"

    # number of weekly time steps in the analysis window
    nb_timesteps = (end_date - start_date).days // 7

    # noise scale: grows with the number of time steps and the per-record
    # upper bound, shrinks as the privacy budget epsilon grows
    scale = (np.sqrt(3.0) * nb_timesteps * upper_bound) / epsilon

    new_df = df.copy()

    analyzer = (
        make_preprocess_location()
        >> make_preprocess_merchant_mobility()
        >> make_filter(city_col, city)
        >> make_filter(merch_category_col, category)
        >> make_truncate_time(start_date, end_date, time_col)
        >> make_private_sum_by(transaction_data_col, groupby_col, bounds, scale)
    )

    return analyzer(new_df)

def mobility_validation_with_google_mobility(df_transactional_data: pd.DataFrame, df_google_mobility_data: pd.DataFrame, start_date: datetime, end_date: datetime, city: str, category: str, epsilon: float):
    """Correlate the DP transaction series with Google mobility data for the same city and category."""
    df_transactional_mobility = mobility_analyzer(df_transactional_data, start_date, end_date, city, category, epsilon)
    offset = df_transactional_mobility["date"][0]
    df_google_mobility = preprocess_google_mobility(df_google_mobility_data, start_date, end_date, city, category, offset)

    # Truncate both series to the shorter length before correlating
    length = min(len(df_transactional_mobility), len(df_google_mobility))
    r, p = stats.pearsonr(df_transactional_mobility['nb_transactions'][:length], df_google_mobility[category][:length])
    print(f"Scipy computed Pearson r: {r} and p-value: {p}")
92 changes: 91 additions & 1 deletion src/DP_epidemiology/utilities.py
@@ -307,4 +307,94 @@ def private_count(df):
        output_measure=dp.max_divergence(T=int),
        function=private_count,
        privacy_map=lambda d_in: d_in * epsilon,
-    )
\ No newline at end of file
+    )

def make_preprocess_merchant_mobility():
    """Create a 1-stable transformation that bins `merch_category` into Google-mobility-style super-categories"""

    def categorize_merchant(merch):
        if merch in ['General Retail Stores', 'Restaurants', 'Bars/Discotheques']:
            return "retail_and_recreation"
        elif merch in ['Grocery Stores/Supermarkets', 'Drug Stores/Pharmacies']:
            return "grocery_and_pharmacy"
        elif merch in ['Airlines']:
            return "transit_stations"
        else:
            return "other"

    def merchant_preprocess(df):
        loc_df = df.copy()
        # Ensure merch_category is of str type
        loc_df["merch_category"] = loc_df["merch_category"].astype(str)
        # Map each merchant category to its super-category
        loc_df["merch_super_category"] = loc_df["merch_category"].apply(
            categorize_merchant
        )
        return loc_df

    return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=identifier_distance(),
        output_domain=dataframe_domain(),
        output_metric=identifier_distance(),
        function=merchant_preprocess,
        stability_map=lambda d_in: d_in,
    )
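
A quick sketch of how the merchant binning behaves (the sample rows are made up; like the other user transformations here, it relies on the module's dataframe_domain/identifier_distance helpers and on the "honest-but-curious" OpenDP feature flag being enabled, as in mobility_analyzer.py):

import pandas as pd

binning = make_preprocess_merchant_mobility()
sample = pd.DataFrame({"merch_category": ["Airlines", "Restaurants", "Hotels"]})
print(binning(sample)["merch_super_category"].tolist())
# expected: ['transit_stations', 'retail_and_recreation', 'other']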

def preprocess_google_mobility(df: pd.DataFrame, start_date: datetime, end_date: datetime, city: str, category: str, offset: datetime = None):
    """Filter Google mobility data to one city, resample to weekly sums, and align the dates to `offset`."""

    def region_filter(df, country_code, region_1, region_2=None, all=False):
        df = df.copy()
        mask = (df["country_region_code"].isin(country_code)) & (df["sub_region_1"].isin(region_1))
        if all:
            mask = mask & (df["sub_region_2"].isin(region_2))
        return df[mask]

    def time_preprocess(df):
        df = df.copy()
        # Keep only rows within the requested date window
        return df[(df["date"] >= start_date) & (df["date"] <= end_date)]

    if city == "Bogota":
        df = region_filter(df, ["CO"], ["Bogota"])
    elif city == "Medellin":
        df = region_filter(df, ["CO"], ["Antioquia"], ["Medellin"], all=True)
    elif city == "Santiago":
        df = region_filter(df, ["CL"], ["Santiago Metropolitan Region"], ["Santiago Province"], all=True)
    elif city == "Brasilia":
        df = region_filter(df, ["BR"], ["Federal District"])
    else:
        raise ValueError("Invalid city")

    # Drop identifier columns and the mobility categories this analysis does not use
    df = df.drop(["place_id", "country_region_code", "country_region", "sub_region_1", "sub_region_2", "metro_area", "iso_3166_2_code", "census_fips_code", "parks_percent_change_from_baseline", "workplaces_percent_change_from_baseline", "residential_percent_change_from_baseline"], axis=1)

    # Ensure the date column is in datetime format
    df['date'] = pd.to_datetime(df['date'])

    df = time_preprocess(df)
    # Set the date column as the index
    df.set_index('date', inplace=True)

    # Group by week (right-labelled) and calculate the sum
    df_weekly = df.resample('W', label="right").sum()

    # Shift dates back so the series starts at `offset`
    if offset:
        # Sanity-check that the offset lies near the start of the resampled range
        if not (offset >= (df_weekly.index[0] - pd.DateOffset(days=7)) and offset <= df_weekly.index[1]):
            raise ValueError("Invalid offset date")

        df_weekly.index = df_weekly.index - pd.DateOffset(days=(df_weekly.index[0] - offset).days)

    # Reset the index to keep the first date of each week
    df_weekly.reset_index(inplace=True)

    df_weekly.rename(columns={"retail_and_recreation_percent_change_from_baseline": "retail_and_recreation", "grocery_and_pharmacy_percent_change_from_baseline": "grocery_and_pharmacy", "transit_stations_percent_change_from_baseline": "transit_stations"}, inplace=True)

    # Keep only the date and the requested category
    df_final = df_weekly[["date", category]]

    return df_final
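
A standalone sketch of the Google-mobility preprocessing (the file name and dates are assumptions; the column names follow Google's COVID-19 Community Mobility Reports, which the drop/rename calls above expect):

import pandas as pd
from datetime import datetime
from DP_epidemiology.utilities import preprocess_google_mobility

google = pd.read_csv("Global_Mobility_Report.csv", parse_dates=["date"])
weekly = preprocess_google_mobility(
    google,
    start_date=datetime(2020, 3, 1),
    end_date=datetime(2020, 6, 1),
    city="Santiago",
    category="transit_stations",
)
# With no offset given, no date shifting is applied; the result has two
# columns: date (right-labelled week end) and transit_stations.
print(weekly.head())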
(remaining changed files not loaded)