Commit 4e73a20
manandraj20 committed Nov 12, 2024
2 parents d86472b + cf53f4c
Showing 17 changed files with 597 additions and 23 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,31 @@
name: Python CI

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install .[dev]

      - name: Run tests
        run: |
          pytest tests/test.py
Binary file removed dist/dp_epidemiology-0.0.7-py3-none-any.whl
Binary file removed dist/dp_epidemiology-0.0.7.tar.gz
Binary file added dist/dp_epidemiology-0.0.8-py3-none-any.whl
Binary file added dist/dp_epidemiology-0.0.8.tar.gz
Binary file modified docs/requirements.txt
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "DP_epidemiology"
version = "0.0.7"
version = "0.0.8"

dependencies = [
"pandas>=2.1.4",
Expand All @@ -15,6 +15,8 @@ dependencies = [
"plotly",
"dash",
"nbformat",
"scipy",
"matplotlib"
]

authors = [
Binary file modified src/DP_epidemiology/__pycache__/contact_matrix.cpython-310.pyc
(two additional modified binary files not shown)
Binary file modified src/DP_epidemiology/__pycache__/utilities.cpython-310.pyc
Binary file modified src/DP_epidemiology/__pycache__/viz.cpython-310.pyc
49 changes: 47 additions & 2 deletions src/DP_epidemiology/mobility_analyzer.py
@@ -3,14 +3,15 @@
import sys
import os
from datetime import datetime
import scipy.stats as stats
import opendp.prelude as dp

dp.enable_features("contrib", "floating-point", "honest-but-curious")

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from DP_epidemiology.utilities import *

-def mobility_analyzer(df: pd.DataFrame, start_date: datetime, end_date: datetime, city: str, epsilon: float):
+def mobility_analyzer_airline(df: pd.DataFrame, start_date: datetime, end_date: datetime, city: str, epsilon: float):
    """Compute a differentially private weekly series of airline transactions for one city."""
    bounds = (0, 600)
    upper_bound = 600
@@ -40,4 +41,48 @@ def mobility_analyzer(df:pd.DataFrame,start_date:datetime,end_date:datetime,city
        >> make_private_sum_by(transaction_data_col, groupby_col, bounds, scale)
    )

-    return analyzer(new_df)
\ No newline at end of file
+    return analyzer(new_df)

def mobility_analyzer(df: pd.DataFrame, start_date: datetime, end_date: datetime, city: str, category: str, epsilon: float):
    """Compute a differentially private weekly transaction series for one city and merchant super-category."""
    bounds = (0, 600)
    upper_bound = 600
    transaction_data_col = "nb_transactions"
    groupby_col = "date"

    city_col = "city"
    time_col = "date"
    merch_category_col = "merch_super_category"

    # number of weekly time steps in the analysis window
    nb_timesteps = (end_date - start_date).days // 7

    # noise scale: grows with the number of time steps and the per-record
    # upper bound, shrinks as the privacy budget epsilon grows
    scale = (np.sqrt(3.0) * nb_timesteps * upper_bound) / epsilon

    new_df = df.copy()

    analyzer = (
        make_preprocess_location()
        >> make_preprocess_merchant_mobility()
        >> make_filter(city_col, city)
        >> make_filter(merch_category_col, category)
        >> make_truncate_time(start_date, end_date, time_col)
        >> make_private_sum_by(transaction_data_col, groupby_col, bounds, scale)
    )

    return analyzer(new_df)

def mobility_validation_with_google_mobility(df_transactional_data: pd.DataFrame, df_google_mobility_data: pd.DataFrame, start_date: datetime, end_date: datetime, city: str, category: str, epsilon: float):
    """Correlate the DP transaction series with Google mobility data for the same city and category."""
    df_transactional_mobility = mobility_analyzer(df_transactional_data, start_date, end_date, city, category, epsilon)
    offset = df_transactional_mobility["date"][0]
    df_google_mobility = preprocess_google_mobility(df_google_mobility_data, start_date, end_date, city, category, offset)

    # Truncate both series to the shorter length before correlating
    length = min(len(df_transactional_mobility), len(df_google_mobility))
    r, p = stats.pearsonr(df_transactional_mobility['nb_transactions'][:length], df_google_mobility[category][:length])
    print(f"Scipy computed Pearson r: {r} and p-value: {p}")
92 changes: 91 additions & 1 deletion src/DP_epidemiology/utilities.py
@@ -307,4 +307,94 @@ def private_count(df):
        output_measure=dp.max_divergence(T=int),
        function=private_count,
        privacy_map=lambda d_in: d_in * epsilon,
-    )
\ No newline at end of file
+    )

def make_preprocess_merchant_mobility():
    """Create a 1-stable transformation that bins `merch_category` into Google-mobility-style super-categories"""

    def categorize_merchant(merch):
        if merch in ['General Retail Stores', 'Restaurants', 'Bars/Discotheques']:
            return "retail_and_recreation"
        elif merch in ['Grocery Stores/Supermarkets', 'Drug Stores/Pharmacies']:
            return "grocery_and_pharmacy"
        elif merch in ['Airlines']:
            return "transit_stations"
        else:
            return "other"

    def merchant_preprocess(df):
        loc_df = df.copy()
        # Ensure merch_category is of str type
        loc_df["merch_category"] = loc_df["merch_category"].astype(str)
        # Map each merchant category to its super-category
        loc_df["merch_super_category"] = loc_df["merch_category"].apply(
            categorize_merchant
        )
        return loc_df

    return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=identifier_distance(),
        output_domain=dataframe_domain(),
        output_metric=identifier_distance(),
        function=merchant_preprocess,
        stability_map=lambda d_in: d_in,
    )
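
A quick sketch of how the merchant binning behaves (the sample rows are made up; like the other user transformations here, it relies on the module's dataframe_domain/identifier_distance helpers and on the "honest-but-curious" OpenDP feature flag being enabled, as in mobility_analyzer.py):

import pandas as pd

binning = make_preprocess_merchant_mobility()
sample = pd.DataFrame({"merch_category": ["Airlines", "Restaurants", "Hotels"]})
print(binning(sample)["merch_super_category"].tolist())
# expected: ['transit_stations', 'retail_and_recreation', 'other']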

def preprocess_google_mobility(df: pd.DataFrame, start_date: datetime, end_date: datetime, city: str, category: str, offset: datetime = None):
    """Filter Google mobility data to one city, resample to weekly sums, and align the dates to `offset`."""

    def region_filter(df, country_code, region_1, region_2=None, all=False):
        df = df.copy()
        mask = (df["country_region_code"].isin(country_code)) & (df["sub_region_1"].isin(region_1))
        if all:
            mask = mask & (df["sub_region_2"].isin(region_2))
        return df[mask]

    def time_preprocess(df):
        df = df.copy()
        # Keep only rows within the requested date window
        return df[(df["date"] >= start_date) & (df["date"] <= end_date)]

    if city == "Bogota":
        df = region_filter(df, ["CO"], ["Bogota"])
    elif city == "Medellin":
        df = region_filter(df, ["CO"], ["Antioquia"], ["Medellin"], all=True)
    elif city == "Santiago":
        df = region_filter(df, ["CL"], ["Santiago Metropolitan Region"], ["Santiago Province"], all=True)
    elif city == "Brasilia":
        df = region_filter(df, ["BR"], ["Federal District"])
    else:
        raise ValueError("Invalid city")

    # Drop identifier columns and the mobility categories this analysis does not use
    df = df.drop(["place_id", "country_region_code", "country_region", "sub_region_1", "sub_region_2", "metro_area", "iso_3166_2_code", "census_fips_code", "parks_percent_change_from_baseline", "workplaces_percent_change_from_baseline", "residential_percent_change_from_baseline"], axis=1)

    # Ensure the date column is in datetime format
    df['date'] = pd.to_datetime(df['date'])

    df = time_preprocess(df)
    # Set the date column as the index
    df.set_index('date', inplace=True)

    # Group by week (right-labelled) and calculate the sum
    df_weekly = df.resample('W', label="right").sum()

    # Shift dates back so the series starts at `offset`
    if offset:
        # Sanity-check that the offset lies near the start of the resampled range
        if not (offset >= (df_weekly.index[0] - pd.DateOffset(days=7)) and offset <= df_weekly.index[1]):
            raise ValueError("Invalid offset date")

        df_weekly.index = df_weekly.index - pd.DateOffset(days=(df_weekly.index[0] - offset).days)

    # Reset the index to keep the first date of each week
    df_weekly.reset_index(inplace=True)

    df_weekly.rename(columns={"retail_and_recreation_percent_change_from_baseline": "retail_and_recreation", "grocery_and_pharmacy_percent_change_from_baseline": "grocery_and_pharmacy", "transit_stations_percent_change_from_baseline": "transit_stations"}, inplace=True)

    # Keep only the date and the requested category
    df_final = df_weekly[["date", category]]

    return df_final
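
A standalone sketch of the Google-mobility preprocessing (the file name and dates are assumptions; the column names follow Google's COVID-19 Community Mobility Reports, which the drop/rename calls above expect):

import pandas as pd
from datetime import datetime
from DP_epidemiology.utilities import preprocess_google_mobility

google = pd.read_csv("Global_Mobility_Report.csv", parse_dates=["date"])
weekly = preprocess_google_mobility(
    google,
    start_date=datetime(2020, 3, 1),
    end_date=datetime(2020, 6, 1),
    city="Santiago",
    category="transit_stations",
)
# With no offset given, no date shifting is applied; the result has two
# columns: date (right-labelled week end) and transit_stations.
print(weekly.head())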
(remaining changed files not loaded)