-
Notifications
You must be signed in to change notification settings - Fork 1
/
ProtectedDataset.py
163 lines (144 loc) · 6.89 KB
/
ProtectedDataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import numpy as np
import pandas as pd
import functools
import copy
class ProtectedDataset:
"""
ProtectedDataset answers count queries on the given dataset based on the group-k laplace mechanism.
The constructor takes:
:param file_name: string filename of the dataset to answer queries on
:param k: int value of k to use for group-k differential privacy
:param categoricalColumns: List containing [string column name] of categorical columns that must be converted to a numerical representation
:param numericalColumns: List containing [string column name] of numerical columns
"""
def __init__(self, file_name, k, categoricalColumns, numericalColumns):
self.queriesAnswered = 0
self.__df = pd.read_csv(file_name, sep=';')
self.categoricalColumns = categoricalColumns
self.numericalColumns = numericalColumns
self._cat_df = self.convertCategoricalToNumbering() # is protected instead of private because direct access is needed to set up targeting experiment
self.__cat_df_list = np.array(self._cat_df).T.tolist()
self.__cat_df_list_T = np.array(self._cat_df).tolist()
self.columns = self._cat_df.columns.tolist()
self.k = k
self.numRows = self.__df.shape[0]
self.numColumns = self.__df.shape[1]
# setup to make the processing of the count queries faster
self.__columnSubset = []
self.__tempDataSubset = []
self.__colIndex = -1
self.__storedCount = -1
self.__comparisonListV2 = [np.Infinity]
self.__previousComparisonDict = {}
# self.__u = np.Infinity
# self.__v = np.Infinity
# print debugging
# print('-----------------Original dataframe----------------')
# print(self.__df)
# print(self.__cat_df)
def laplaceMechanism(self, x, epsilon):
"""
Adds noise from the Laplace distribution to the given value
:param x: float value to add noise to
:param epsilon: float scale of noise to be added
:return: noisy value
"""
return x + np.random.laplace(0, 1.0 / epsilon, 1)[0]
def count(self, u, v, b, epsilon, column, comparisonDict=None):
"""
Models the data custodian's response to the count query
:param u: float u
:param v: float v
:param b: int b
:param epsilon: float privacy budget to spend on this response
:param column: int main column number being answered on
:param comparisonDict: Dictionary containing {key=string column name : value=(float u, float v) }
for conditioning on previous columns
:return:
"""
# u, v, and column are separate from the comparisons purely for purposes of optimizing the speed of the answering based on the experiment
# they would likely be lumped into the comparisons dictionary in a real thing
# handle simple cases up front
if b >= self.numRows:
return 0
if b < 0:
return 1
self.queriesAnswered += 1
count = 0
# comparisons is a dictionary of column numbers to U and V values
# construct the subset of the dataset that fits the query bounds only if it is different from the previous query
# (otherwise, used already constructed subset)
# This check and storage of the database subset is quality-of-life to speed up the experiment's runtime,
# but likely would not exist in a real query-answering mechanism
# if (self.__u != u) or (self.__v != v) or (self.__colIndex != column) or (not self.__comparisonsV3 == comparisons):
# self.__columnSubset = []
self.__colIndex = column
if comparisonDict is None: # single column, can ignore the comparisons to previous columns
self.__previousComparisonDict = None
for element in self.__cat_df_list[self.__colIndex]:
if u <= element < v:
count += 1
else: # multiple columns are included in the query
if not self.__previousComparisonDict == comparisonDict: # check exists purely for experiment speed purposes
self.__tempDataSubset = []
self.__previousComparisonDict = copy.deepcopy(comparisonDict)
for row in self.__cat_df_list_T:
# check if the values in the given columns are within the prescribed boundaries
if functools.reduce(lambda x, y: x and y, map(lambda z: z[1][0] <= row[z[0]] < z[1][1], comparisonDict.items()), True):
self.__tempDataSubset.append(row)
# compute the number of elements in the comparison subset that match the parameters for the "main" column
for row in self.__tempDataSubset:
if u <= row[column] < v:
count += 1
# bounding to account for it being impossible to construct neighboring databases with negative rows or more rows than the size of the database
LowerNoiseBound = max(count - self.k, 0)
UpperNoiseBound = min(count + self.k, self.numRows)
if b < LowerNoiseBound:
return 1
elif b >= UpperNoiseBound:
return 0
else:
if b > len(self.__columnSubset):
Answer = self.laplaceMechanism(1, epsilon)
return Answer
else:
Answer = self.laplaceMechanism(0, epsilon)
return Answer
def convertCategoricalToNumbering(self):
"""
Converts categorical values to a numerical schema representation
:return: DataFrame with numerical representation of categorical values
"""
cat_df = pd.DataFrame()
for column in self.categoricalColumns:
cat_df[column] = pd.Categorical(self.__df[column]).codes
for column in self.numericalColumns:
cat_df[column] = self.__df[column]
return cat_df
def getNumQueries(self):
"""
:return: number of queries that have been answered so far
"""
return self.queriesAnswered
def resetQueryCounter(self):
"""
Resets the counter for number of queries answered
(for use between experiments)
:return: None
"""
self.queriesAnswered = 0
def getColumns(self):
"""
:return: List containing [string column name] column names of the protected dataset
"""
return self._cat_df.columns.tolist()
def getCategoricalBounds(self, column):
"""
Gives the bounds for the numerical schema for the given categorical column
:param column: string column name
:return: Tuple containing (int lower bound, int upper bound)
"""
if column in self.categoricalColumns:
index = self.columns.index(column)
return min(self.__cat_df_list[index]), max(self.__cat_df_list[index])
raise NotImplementedError