
Commit

manandraj20 committed Nov 13, 2024
2 parents a19017b + 0220a86 commit ba36b3d
Showing 19 changed files with 843 additions and 118 deletions.
17 changes: 9 additions & 8 deletions README.rst
@@ -35,15 +35,15 @@ Sensitivity and Epsilon Analysis
* Sensitivity: In a single time stamp, a merchant can appear only once in a particular zip code but can appear in up to ``3`` zip codes. So, if we wanted to release measures about a single zip code, the sensitivity would be ``1``; but since we want to release data for all zip codes, the sensitivity used for each zip code is ``3``.
* Scaling with Time: For multiple time stamps, sensitivity is ``3 * no_of_time_stamps``.
* Epsilon Budget: The epsilon spent for each query is ``∈``.
* Scale Calculation: ``Scale = (sqrt(3) * no_of_time_stamps) / ∈``.
* Scale Calculation: ``Scale = (3 * no_of_time_stamps * upper_bound) / ∈`` (see the example below).
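
For instance, with ``12`` time stamps, an assumed clipping bound of ``10``, and ``∈ = 1.0`` (illustrative values only, not from the library):

>>> no_of_time_stamps, upper_bound, epsilon = 12, 10, 1.0
>>> (3 * no_of_time_stamps * upper_bound) / epsilon
360.0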


Mobility Detection (Airline Merch Category)
-------------------------------------------
Mobility Detection
------------------

Description

This analysis tracks mobility by monitoring the differentially private time series release of financial transactions in the "Airlines" category, which reflects the transportation sector.
This analysis tracks mobility by monitoring the differentially private time series release of financial transactions in the ``retail_and_recreation``, ``grocery_and_pharmacy`` and ``transit_stations`` super categories, which match the Google mobility data for easy validation.

Assumptions

@@ -54,17 +54,18 @@ Assumptions
Algorithm

#. Add City Column: A new ``city`` column is added based on postal codes (``make_preprocess_location``).
#. Add Super Category Column: A new ``merch_super_category`` column is added to classify transactions into the ``retail_and_recreation``, ``grocery_and_pharmacy`` and ``transit_stations`` categories (``make_preprocess_merchant_mobility``).
#. Filter for City: Data for the selected city is filtered (``make_filter``).
#. Filter for Airline Category: Only transactions in the ``Airline`` category are considered (``make_filter``).
#. Filter for Super Category: Data is filtered for the ``retail_and_recreation``, ``grocery_and_pharmacy`` and ``transit_stations`` categories (``make_filter``).
#. Filter by Time Frame: Data is filtered for the selected time frame (``make_truncate_time``).
#. Transaction Summing & Noise Addition: Sum the number of transactions by postal code for each timestep and add Laplace noise (``make_private_sum_by``); a sketch of the full pipeline follows this list.
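
The sketch below chains these helpers together. It is illustrative only: ``make_filter`` and ``make_private_sum_by`` follow the signatures in ``utilities.py``, while the arguments assumed for the other ``make_*`` helpers may differ from the library::

    from DP_epidemiology import utilities as ut

    def mobility_pipeline(df, city, category, start_date, end_date, upper_bound, scale):
        """Illustrative composition of the steps above; helper signatures are assumed."""
        df = ut.make_preprocess_location()(df)                     # 1. add city column
        df = ut.make_preprocess_merchant_mobility()(df)            # 2. add merch_super_category
        df = ut.make_filter("city", city)(df)                      # 3. keep the selected city
        df = ut.make_filter("merch_super_category", category)(df)  # 4. keep the super category
        df = ut.make_truncate_time(start_date, end_date)(df)       # 5. clip to the time frame
        # 6. sum nb_transactions per timestep and add Laplace noise
        return ut.make_private_sum_by("nb_transactions", "date", (0, upper_bound), scale)(df)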

Sensitivity and Epsilon Analysis

* Sensitivity per Merchant: Sensitivity is 3 for each merchant in the ``Airline`` category.
* Sensitivity per Merchant: Sensitivity is ``3`` for each merchant.
* Scaling with Time: For multiple timesteps, sensitivity is ``3 * no_of_time_steps``.
* Epsilon Budget: The epsilon spent per timestep is ``∈``.
* Scale Calculation: ``Scale = (3 * no_of_time_steps) / ∈``.
* Scale Calculation: ``Scale = (3 * no_of_time_steps * upper_bound) / ∈``.

Validation

@@ -100,7 +101,7 @@ Sensitivity and Epsilon Analysis
* Sensitivity per Category: Sensitivity is ``3`` for each category (essential or luxurious goods).
* Scaling with Time: For multiple timesteps, sensitivity is ``3 * no_of_time_steps``.
* Epsilon Budget: The epsilon spent per timestep is ``∈``.
* Scale Calculation: ``Scale = (3 * no_of_time_steps) / ∈``.
* Scale Calculation: ``Scale = (3 * no_of_time_steps * upper_bound) / ∈``.



Binary file removed dist/dp_epidemiology-0.0.8-py3-none-any.whl
Binary file removed dist/dp_epidemiology-0.0.8.tar.gz
Binary file added dist/dp_epidemiology-0.0.9-py3-none-any.whl
Binary file added dist/dp_epidemiology-0.0.9.tar.gz
Binary file modified docs/requirements.txt
5 changes: 3 additions & 2 deletions docs/usage.rst
@@ -61,21 +61,22 @@ For example:


To do mobility inference,
you can use the ``mobility_analyzer.mobility_analyzer()`` function to generate a differentially private time series of transactional data in the "Airlines" category:
you can use the ``mobility_analyzer.mobility_analyzer()`` function to generate a differentially private time series of transactional data in the ``retail_and_recreation``, ``grocery_and_pharmacy`` and ``transit_stations`` super categories:

.. autofunction:: mobility_analyzer.mobility_analyzer

The ``df`` parameter takes a pandas dataframe as input with columns ``[ "ID", "date", "merch_category", "merch_postal_code", "transaction_type", "spendamt", "nb_transactions"]``.
The ``start_date`` and ``end_date`` parameters take the start and end date of the time frame for which the analysis is to be done.
The ``city`` parameter takes the name of the city for which the analysis is to be done.
The ``category`` parameter takes one of ``retail_and_recreation``, ``grocery_and_pharmacy`` or ``transit_stations`` as the super category for which the analysis is to be done.
The ``epsilon`` parameter takes the value of epsilon for differential privacy.

For example:

>>> from DP_epidemiology import mobility_analyzer
>>> from datetime import datetime
>>> df = pd.read_csv('data.csv')
>>> mobility_analyzer.mobility_analyzer(df,datetime(2020, 9, 1),datetime(2021, 3, 31),"Medellin",10)
>>> mobility_analyzer.mobility_analyzer(df,datetime(2020, 9, 1),datetime(2021, 3, 31),"Medellin","retail_and_recreation",10)
nb_transactions date
0 1258 2020-09-01
1 1328 2020-09-08
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "DP_epidemiology"
version = "0.0.8"
version = "0.0.9"

dependencies = [
"pandas>=2.1.4",
@@ -16,7 +16,8 @@ dependencies = [
"dash",
"nbformat",
"scipy",
"matplotlib"
"matplotlib",
"dtw",
]

authors = [
Binary file modified src/DP_epidemiology/__pycache__/contact_matrix.cpython-310.pyc
Binary file modified src/DP_epidemiology/__pycache__/hotspot_analyzer.cpython-310.pyc
Binary file modified src/DP_epidemiology/__pycache__/mobility_analyzer.cpython-310.pyc
Binary file modified src/DP_epidemiology/__pycache__/utilities.cpython-310.pyc
2 changes: 1 addition & 1 deletion src/DP_epidemiology/hotspot_analyzer.py
@@ -25,7 +25,7 @@ def hotspot_analyzer(df:pd.DataFrame, start_date:datetime,end_date:datetime,city
nb_timesteps = (end_date - start_date).days // 7

"""scale calculation"""
scale=(np.sqrt(3.0)*nb_timesteps*upper_bound)/epsilon
scale=(3.0*nb_timesteps*upper_bound)/epsilon
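# Laplace scale calibrated to L1 sensitivity: 3 zip codes per merchant,
# times nb_timesteps, times the per-record clipping bound upper_bound.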

new_df=df.copy()

19 changes: 16 additions & 3 deletions src/DP_epidemiology/mobility_analyzer.py
@@ -5,6 +5,8 @@
from datetime import datetime
import scipy.stats as stats
import opendp.prelude as dp
import matplotlib.pyplot as plt
from dtw import dtw,accelerated_dtw

dp.enable_features("contrib", "floating-point", "honest-but-curious")

@@ -28,7 +30,7 @@ def mobility_analyzer_airline(df:pd.DataFrame,start_date:datetime,end_date:datet
nb_timesteps = (end_date - start_date).days // 7

"""scale calculation"""
scale=(np.sqrt(3.0)*nb_timesteps*upper_bound)/epsilon
scale=(3.0*nb_timesteps*upper_bound)/epsilon

new_df=df.copy()

@@ -60,7 +62,7 @@ def mobility_analyzer(df:pd.DataFrame,start_date:datetime,end_date:datetime,city
nb_timesteps = (end_date - start_date).days // 7

"""scale calculation"""
scale=(np.sqrt(3.0)*nb_timesteps*upper_bound)/epsilon
scale=(3.0*nb_timesteps*upper_bound)/epsilon

new_df=df.copy()

@@ -85,4 +87,15 @@ def mobility_validation_with_google_mobility(df_transactional_data:pd.DataFrame,
# print(df_transactional_mobility.head())
# print(df_google_mobility.head())
r, p = stats.pearsonr(df_transactional_mobility['nb_transactions'][:length], df_google_mobility[category][:length])
print(f"Scipy computed Pearson r: {r} and p-value: {p}")
print(f"Scipy computed Pearson r: {r} and p-value: {p}")

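# Beyond the Pearson correlation, compare the curves' shapes with dynamic
# time warping; interpolate() fills missing values before alignment.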
d1 = df_transactional_mobility['nb_transactions'][:length].interpolate().values
d2 = df_google_mobility[category][:length].interpolate().values
d, cost_matrix, acc_cost_matrix, path = accelerated_dtw(d1,d2, dist='euclidean')

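# Plot the accumulated cost matrix and overlay the optimal warping path.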
plt.imshow(acc_cost_matrix.T, origin='lower', cmap='gray', interpolation='nearest')
plt.plot(path[0], path[1], 'w')
plt.xlabel('Subject1')
plt.ylabel('Subject2')
plt.title(f'DTW Minimum Path with minimum distance: {np.round(d,2)}')
plt.show()
2 changes: 1 addition & 1 deletion src/DP_epidemiology/pandemic_adherence_analyzer.py
@@ -26,7 +26,7 @@ def pandemic_adherence_analyzer(df:pd.DataFrame,start_date:datetime,end_date:dat
nb_timesteps = (end_date - start_date).days // 7

"""scale calculation"""
scale=(np.sqrt(3.0)*nb_timesteps*upper_bound)/epsilon
scale=(3.0*nb_timesteps*upper_bound)/epsilon

new_df=df.copy()

Expand Down
8 changes: 4 additions & 4 deletions src/DP_epidemiology/utilities.py
@@ -118,15 +118,15 @@ def function(df):

def make_private_sum_by(column, by, bounds, scale):
"""Create a measurement that computes the grouped bounded sum of `column`"""
space = dp.vector_domain(dp.atom_domain(T=int)), dp.l2_distance(T=float)
m_gauss = space >> dp.m.then_gaussian(scale)
space = dp.vector_domain(dp.atom_domain(T=int)), dp.l1_distance(T=int)
m_lap = space >> dp.m.then_laplace(scale)
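# Laplace noise on an L1-metric space yields pure epsilon-DP,
# with scale = L1 sensitivity / epsilon.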
t_sum = make_sum_by(column, by, bounds)

def function(df):
exact = t_sum(df)
# print(exact)
noisy_sum = pd.Series(
np.maximum(m_gauss(exact.to_numpy().flatten()), 0),
np.maximum(m_lap(exact.to_numpy().flatten()), 0),
)
# print(noisy_sum)
noisy_sum=noisy_sum.to_frame(name=column)
@@ -138,7 +138,7 @@ def function(df):
input_metric=dp.symmetric_distance(),
output_measure=dp.max_divergence(T=float),
function=function,
privacy_map=lambda d_in: m_gauss.map(t_sum.map(d_in)),
privacy_map=lambda d_in: m_lap.map(t_sum.map(d_in)),
)

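# A hedged usage sketch (bounds and scale values are illustrative, not from the
# library): sum nb_transactions per date, clipping each record to (0, 10), with
# the Laplace scale following 3 * no_of_time_steps * upper_bound / epsilon:
#
#   meas = make_private_sum_by("nb_transactions", by="date", bounds=(0, 10),
#                              scale=(3 * 12 * 10) / 1.0)
#   noisy_weekly_sums = meas(df)
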
def make_filter(column,entry):
