Skip to content

Commit

Permalink
updated on new data
Browse files Browse the repository at this point in the history
  • Loading branch information
manandraj20 committed Nov 13, 2024
1 parent 4e73a20 commit a19017b
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 47 deletions.
135 changes: 91 additions & 44 deletions learnConsumptionDistribution.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>date</th>\n",
" <th>merch_category</th>\n",
" <th>merch_postal_code</th>\n",
" <th>transaction_type</th>\n",
" <th>date</th>\n",
" <th>spendamt</th>\n",
" <th>nb_transactions</th>\n",
" </tr>\n",
Expand All @@ -63,71 +63,71 @@
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2019-01-01</td>\n",
" <td>Grocery Stores/Supermarkets</td>\n",
" <td>8700000</td>\n",
" <td>Hospitals</td>\n",
" <td>111921</td>\n",
" <td>ONLINE</td>\n",
" <td>11238.128450</td>\n",
" <td>160</td>\n",
" <td>2019-01-01</td>\n",
" <td>80797.323317</td>\n",
" <td>398</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>Bars/Discotheques</td>\n",
" <td>050025</td>\n",
" <td>OFFLINE</td>\n",
" <td>2019-01-01</td>\n",
" <td>Grocery Stores/Supermarkets</td>\n",
" <td>500034</td>\n",
" <td>ONLINE</td>\n",
" <td>12848.165221</td>\n",
" <td>183</td>\n",
" <td>5331.031100</td>\n",
" <td>283</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Bars/Discotheques</td>\n",
" <td>050032</td>\n",
" <td>OFFLINE</td>\n",
" <td>2019-01-01</td>\n",
" <td>Grocery Stores/Supermarkets</td>\n",
" <td>110621</td>\n",
" <td>ONLINE</td>\n",
" <td>12116.165569</td>\n",
" <td>173</td>\n",
" <td>5180.722635</td>\n",
" <td>268</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>2019-01-01</td>\n",
" <td>Hotels/Motels</td>\n",
" <td>8900000</td>\n",
" <td>Drug Stores/Pharmacies</td>\n",
" <td>050012</td>\n",
" <td>OFFLINE</td>\n",
" <td>7745.998879</td>\n",
" <td>38</td>\n",
" <td>2019-01-01</td>\n",
" <td>5032.333763</td>\n",
" <td>177</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>2019-01-01</td>\n",
" <td>Restaurants</td>\n",
" <td>111941</td>\n",
" <td>3</td>\n",
" <td>Drug Stores/Pharmacies</td>\n",
" <td>050031</td>\n",
" <td>OFFLINE</td>\n",
" <td>6927.424754</td>\n",
" <td>173</td>\n",
" <td>2019-01-01</td>\n",
" <td>4899.182326</td>\n",
" <td>150</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID date merch_category merch_postal_code \\\n",
"0 1 2019-01-01 Grocery Stores/Supermarkets 8700000 \n",
"1 1 2019-01-01 Grocery Stores/Supermarkets 500034 \n",
"2 2 2019-01-01 Grocery Stores/Supermarkets 110621 \n",
"3 3 2019-01-01 Hotels/Motels 8900000 \n",
"4 4 2019-01-01 Restaurants 111941 \n",
" ID merch_category merch_postal_code transaction_type date \\\n",
"0 1 Hospitals 111921 ONLINE 2019-01-01 \n",
"1 2 Bars/Discotheques 050025 OFFLINE 2019-01-01 \n",
"2 2 Bars/Discotheques 050032 OFFLINE 2019-01-01 \n",
"3 3 Drug Stores/Pharmacies 050012 OFFLINE 2019-01-01 \n",
"4 3 Drug Stores/Pharmacies 050031 OFFLINE 2019-01-01 \n",
"\n",
" transaction_type spendamt nb_transactions \n",
"0 ONLINE 11238.128450 160 \n",
"1 ONLINE 12848.165221 183 \n",
"2 ONLINE 12116.165569 173 \n",
"3 OFFLINE 7745.998879 38 \n",
"4 OFFLINE 6927.424754 173 "
" spendamt nb_transactions \n",
"0 80797.323317 398 \n",
"1 5331.031100 283 \n",
"2 5180.722635 268 \n",
"3 5032.333763 177 \n",
"4 4899.182326 150 "
]
},
"execution_count": 3,
Expand All @@ -136,7 +136,7 @@
}
],
"source": [
"data = pd.read_csv(r\"C:\\Users\\Milan Anand Raj\\Desktop\\KNOWLEDGEEDGEAI\\PET\\pets_mockdata\\Technical_Phase_Data\\technical_phase_data.csv\")\n",
"data = pd.read_csv(r\"C:\\Users\\Milan Anand Raj\\Desktop\\KNOWLEDGEEDGEAI\\PET\\final_data\\final_technical_data.csv\")\n",
"data.head()"
]
},
Expand Down Expand Up @@ -194,6 +194,36 @@
" return \"Santiago\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data_t = data\n",
"data_t[\"city\"] = data[postal_code_col].astype(str).apply(categorize_city)\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['70640-000', '70000-000'], dtype=object)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_t[data_t[\"city\"]==\"Brasilia\"][\"merch_postal_code\"].unique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -203,18 +233,34 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"City: Santiago\n",
"City: Medellian\n",
"City: Bogota\n",
"{'Airlines': 694.2645988925163, 'Bars/Discotheques': 938.8304299835585, 'Computer Network/Information Services': 138.27595054301423, 'Drug Stores/Pharmacies': 745.0404457585972, 'General Retail Stores': 1098.1383922062796, 'Grocery Stores/Supermarkets': 4105.163373337119, 'Hospitals': 1536.995204166018, 'Hotels/Motels': 211.33068313976673, 'Restaurants': 5750.492793907174, 'Utilities: Electric, Gas, Water': 1164.8615359779465}\n",
"City: Santiago\n",
"{'Airlines': 775.690312082757, 'Bars/Discotheques': 1192.1697652298033, 'Computer Network/Information Services': 130.0848846505698, 'Drug Stores/Pharmacies': 1080.6051216491312, 'General Retail Stores': 1432.3301065873393, 'Grocery Stores/Supermarkets': 5128.047125312137, 'Hospitals': 2178.9284555521135, 'Hotels/Motels': 281.88545031373854, 'Restaurants': 7335.879379150183, 'Utilities: Electric, Gas, Water': 1422.6421925557495}\n",
"City: Brasilia\n"
]
},
{
"ename": "OpenDPException",
"evalue": "\n FFI(\"Continued stack trace from Exception in user-defined function:\nTraceback (most recent call last):\n File \"c:\\Users\\Public\\anaconda3\\envs\\.venv\\Lib\\site-packages\\opendp\\_convert.py\", line 629, in wrapper_func\n py_out = func(py_arg)\n ^^^^^^^^^^^^\n File \"c:\\Users\\Milan Anand Raj\\Desktop\\KNOWLEDGEEDGEAI\\PET\\src\\DP_epidemiology\\utilities.py\", line 279, in compute_private_sum\n return dp_sum/dp_dataset_size\n ~~~~~~^~~~~~~~~~~~~~~~\nZeroDivisionError: float division by zero\n\")",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mOpenDPException\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[8], line 9\u001b[0m\n\u001b[0;32m 7\u001b[0m end_date \u001b[38;5;241m=\u001b[39m datetime\u001b[38;5;241m.\u001b[39mstrptime(week, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY-\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCity: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcity\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m----> 9\u001b[0m transactions_per_category \u001b[38;5;241m=\u001b[39m get_private_counts(data, categories\u001b[38;5;241m=\u001b[39mcategories, start_date\u001b[38;5;241m=\u001b[39mstart_date, end_date\u001b[38;5;241m=\u001b[39mend_date, city\u001b[38;5;241m=\u001b[39mcity, epsilon\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1.0\u001b[39m)\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(transactions_per_category)\n\u001b[0;32m 11\u001b[0m transactions_per_city\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;28mlist\u001b[39m(transactions_per_category\u001b[38;5;241m.\u001b[39mvalues()))\n",
"File \u001b[1;32mc:\\Users\\Milan Anand Raj\\Desktop\\KNOWLEDGEEDGEAI\\PET\\src\\DP_epidemiology\\contact_matrix.py:73\u001b[0m, in \u001b[0;36mget_private_counts\u001b[1;34m(df, categories, start_date, end_date, city, epsilon)\u001b[0m\n\u001b[0;32m 64\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m category \u001b[38;5;129;01min\u001b[39;00m categories:\n\u001b[0;32m 65\u001b[0m m_count \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 66\u001b[0m t_pre\n\u001b[0;32m 67\u001b[0m \u001b[38;5;66;03m# TODO: The scale has to be equal to bound/epsilon, which can be equal to the mean itself in cases where number of entries\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 71\u001b[0m \u001b[38;5;241m>>\u001b[39m make_private_nb_transactions_avg_count(merch_category\u001b[38;5;241m=\u001b[39mcategory, upper_bound\u001b[38;5;241m=\u001b[39mUPPER_BOUND, dp_dataset_size\u001b[38;5;241m=\u001b[39mdp_count, scale\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m3\u001b[39m\u001b[38;5;241m*\u001b[39mUPPER_BOUND\u001b[38;5;241m*\u001b[39mnumber_of_timesteps)\u001b[38;5;241m/\u001b[39mepsilon)\n\u001b[0;32m 72\u001b[0m )\n\u001b[1;32m---> 73\u001b[0m nb_transactions_avg_count_map[category] \u001b[38;5;241m=\u001b[39m m_count(df)\n\u001b[0;32m 75\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m nb_transactions_avg_count_map\n",
"File \u001b[1;32mc:\\Users\\Public\\anaconda3\\envs\\.venv\\Lib\\site-packages\\opendp\\mod.py:74\u001b[0m, in \u001b[0;36mMeasurement.__call__\u001b[1;34m(self, arg)\u001b[0m\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, arg):\n\u001b[0;32m 73\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopendp\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m measurement_invoke\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m measurement_invoke(\u001b[38;5;28mself\u001b[39m, arg)\n",
"File \u001b[1;32mc:\\Users\\Public\\anaconda3\\envs\\.venv\\Lib\\site-packages\\opendp\\core.py:370\u001b[0m, in \u001b[0;36mmeasurement_invoke\u001b[1;34m(this, arg)\u001b[0m\n\u001b[0;32m 367\u001b[0m lib_function\u001b[38;5;241m.\u001b[39margtypes \u001b[38;5;241m=\u001b[39m [Measurement, AnyObjectPtr]\n\u001b[0;32m 368\u001b[0m lib_function\u001b[38;5;241m.\u001b[39mrestype \u001b[38;5;241m=\u001b[39m FfiResult\n\u001b[1;32m--> 370\u001b[0m output \u001b[38;5;241m=\u001b[39m c_to_py(unwrap(lib_function(c_this, c_arg), AnyObjectPtr))\n\u001b[0;32m 372\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output\n",
"File \u001b[1;32mc:\\Users\\Public\\anaconda3\\envs\\.venv\\Lib\\site-packages\\opendp\\_lib.py:254\u001b[0m, in \u001b[0;36munwrap\u001b[1;34m(result, type_)\u001b[0m\n\u001b[0;32m 252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpolars\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(message)\u001b[38;5;241m.\u001b[39mlower() \u001b[38;5;129;01mand\u001b[39;00m pl\u001b[38;5;241m.\u001b[39m__version__ \u001b[38;5;241m!=\u001b[39m _EXPECTED_POLARS_VERSION:\n\u001b[0;32m 253\u001b[0m message \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mInstalled python polars version (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpl\u001b[38;5;241m.\u001b[39m__version__\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) != expected version (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m_EXPECTED_POLARS_VERSION\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m). \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmessage\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m \u001b[38;5;66;03m# pragma: no cover\u001b[39;00m\n\u001b[1;32m--> 254\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m OpenDPException(variant, message, backtrace)\n",
"\u001b[1;31mOpenDPException\u001b[0m: \n FFI(\"Continued stack trace from Exception in user-defined function:\nTraceback (most recent call last):\n File \"c:\\Users\\Public\\anaconda3\\envs\\.venv\\Lib\\site-packages\\opendp\\_convert.py\", line 629, in wrapper_func\n py_out = func(py_arg)\n ^^^^^^^^^^^^\n File \"c:\\Users\\Milan Anand Raj\\Desktop\\KNOWLEDGEEDGEAI\\PET\\src\\DP_epidemiology\\utilities.py\", line 279, in compute_private_sum\n return dp_sum/dp_dataset_size\n ~~~~~~^~~~~~~~~~~~~~~~\nZeroDivisionError: float division by zero\n\")"
]
}
],
"source": [
Expand All @@ -227,6 +273,7 @@
" end_date = datetime.strptime(week, '%Y-%m-%d')\n",
" print(f\"City: {city}\")\n",
" transactions_per_category = get_private_counts(data, categories=categories, start_date=start_date, end_date=end_date, city=city, epsilon=1.0)\n",
" print(transactions_per_category)\n",
" transactions_per_city.append(list(transactions_per_category.values()))"
]
},
Expand Down
6 changes: 4 additions & 2 deletions src/DP_epidemiology/contact_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ def get_private_counts(df, categories, start_date: datetime, end_date: datetime,
number_of_timesteps = 1 if end_date == start_date else (
end_date - start_date).days // 7
input_space = dp.vector_domain(
dp.atom_domain(T=int)), dp.symmetric_distance()
dp.atom_domain(T=str)), dp.symmetric_distance()
df_new = t_pre(df)
zip_code_list = df_new[postal_code_col].unique().astype(int)
zip_code_list = df_new[postal_code_col].unique().astype(str)
count_meas = input_space >> dp.t.then_count() >> dp.m.then_laplace(
(3 * number_of_timesteps)/epsilon)
dp_count = count_meas(zip_code_list)
Expand Down Expand Up @@ -92,6 +92,8 @@ def get_age_group_count_map(df, age_groups, consumption_distribution, start_date
return age_group_count_map

# get average contact matrix for a group of cities


def get_contact_matrix(counts_per_city, population_distribution, fractions_offline):
age_bins = np.array(counts_per_city)
num_cities = len(counts_per_city)
Expand Down
2 changes: 1 addition & 1 deletion src/DP_epidemiology/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ def compute_private_sum(df):
df = df.copy()
sum = df[df["merch_category"]==merch_category]["nb_transactions"].clip(lower=0, upper=upper_bound).sum()
dp_sum = np.random.laplace(loc=sum, scale=scale)
return dp_sum
return dp_sum/dp_dataset_size

return dp.m.make_user_measurement(
input_domain=dataframe_domain(),
Expand Down

0 comments on commit a19017b

Please sign in to comment.