-
Notifications
You must be signed in to change notification settings - Fork 0
/
rules.py
424 lines (384 loc) · 15.2 KB
/
rules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
##Copyright (c) 2015 duncan g. smith
##
##Permission is hereby granted, free of charge, to any person obtaining a
##copy of this software and associated documentation files (the "Software"),
##to deal in the Software without restriction, including without limitation
##the rights to use, copy, modify, merge, publish, distribute, sublicense,
##and/or sell copies of the Software, and to permit persons to whom the
##Software is furnished to do so, subject to the following conditions:
##
##The above copyright notice and this permission notice shall be included
##in all copies or substantial portions of the Software.
##
##THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
##OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
##FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
##THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
##OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
##ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
##OTHER DEALINGS IN THE SOFTWARE.
from __future__ import division
from heapq import heappush, heappop
import numpy
"""
Decision rules for Fellegi-Sunter record linkage:
Fellegi, I.P. and Sunter A.B. (1969) A theory for record
linkage, JASA Vol. 64, No. 238, pp.1183-1210
In the following docstrings A1, A2 and A3 are the sets of correct matches,
uncertain matches and incorrect matches respectively.
We have two (conditional) error rates:
mu = the probability of allocation to A1 given an incorrect match
lambda = the probability of allocation to A3 given a correct match
"""
def error_rates_from_thresholds(u, m, p, probs, b=None):
"""
Takes (c,n) shaped arrays I{u} and I{m}, the marginal
probability of a correct match I{p}, and a list of thresholds
on the posterior probability of a correct match. Returns
the error rates corresponding to the thresholds.
If supplied, C{b} is a mapping of variable indices to indices
of levels to be used for blocking (generally the highest
similarity score level).
C{probs} must be sorted.
@type u: numpy array
@param u: (c,n) shaped array of I{u}-probabilities
@type m: numpy array
@param m: (c,n) shaped array of I{m}-probabilities
@type p: C{float}
@param p: marginal probability of a correct match
@type probs: C{list}
@param probs: list of thresholds on the posterior
probability of a correct match
@type b: C{None} or C{dict}
@param b: mapping of variable indices to blocking levels
@rtype: C{list}
@return: tuples containing (mu, lambda) for each threshold
@raises: ValueError if C{probs} is not sorted
"""
# does not deal with Pu or Pm (floating point issues
# would make it unreliable)
if not sorted(probs) == probs:
raise ValueError('probs must be sorted')
res = []
lists, m_sum = _get_lists(u, m, b)
mus = []
lambdas = []
likelihoods = [prob*(1-p)/(1-prob)/p for prob in probs]
u_sum = 0
gen = _gen_sorted_data(lists, desc=True)
ratio = float('inf')
for likelihood in reversed(likelihoods):
while likelihood < ratio:
ratio, arr = next(gen)
u_sum += arr[0]
mu = u_sum - arr[0]
mus.append(mu)
mus.reverse()
gen = _gen_sorted_data(lists, desc=False)
ratio = 0
for likelihood in likelihoods:
while likelihood > ratio:
ratio, arr = next(gen)
m_sum += arr[1]
lamb = m_sum - arr[1]
lambdas.append(lamb)
return zip(mus, lambdas)
def error_rates_from_threshold(u, m, p, prob, b=None):
"""
Takes (c,n) shaped arrays I{u} and I{m}, the marginal
probability of a correct match I{p}, and a threshold
on the posterior probability of a correct match. Returns
the error rates corresponding to the threshold.
If supplied, C{b} is a mapping of variable indices to indices
of levels to be used for blocking (generally the highest
similarity score level).
@type u: numpy array
@param u: (c,n) shaped array of I{u}-probabilities
@type m: numpy array
@param m: (c,n) shaped array of I{m}-probabilities
@type p: C{float}
@param p: marginal probability of a correct match
@type prob: C{float}
@param prob: threshold on the posterior
probability of a correct match
@type b: C{None} or C{dict}
@param b: mapping of variable indices to blocking levels
@rtype: C{tuple}
@return: tuple containing (mu, lambda) for the threshold
"""
return error_rates_from_thresholds(u, m, p, [prob], b=b)[0]
def thresholds_from_error_rates(u, m, p, mu, lamb, b=None):
"""
Takes (c,n) shaped arrays I{u} and I{m}, the marginal
probability of a correct match I{p}, and error rates
mu and lambda. Returns the parameters for the decision
rule that provides the desired error rates.
The parameters for the decision rule are the threshold on the posterior
probability of a correct match and the probability with which a
record pair should be allocated to the relevant class if the posterior
probability is equal to the threshold. These are returned as two pairs,
the first pair corresponding to the lower threshold and allocation to A3.
If supplied, C{b} is a mapping of variable indices to indices
of levels to be used for blocking (generally the highest
similarity score level).
@type u: numpy array
@param u: (c,n) shaped array of I{u}-probabilities
@type m: numpy array
@param m: (c,n) shaped array of I{m}-probabilities
@type p: C{float}
@param p: marginal probability of a correct match
@type mu: C{float}
@param mu: the probability of allocation to A1 given an incorrect match
@type lamb: C{float}
@param lamb: the probability of allocation to A3 given a correct match
@type b: C{None} or C{dict}
@param b: mapping of variable indices to blocking levels
@rtype: C{tuple}
@return: parameters for lower threshold, and parameters for upper threshold
@raises: ValueError if blocking is inconsistent with the lower bound
on lambda, or if the calculated thresholds are inconsistent
"""
lists, m_sum = _get_lists(u, m, b)
if m_sum > lamb:
raise ValueError('Blocking implies a lower bound of %s on lambda' % m_sum)
#
u_sum = 0
for u_ratio, arr in _gen_sorted_data(lists, desc=True):
u_sum += arr[0]
if u_sum > mu:
# calculate Pu
Pu = (mu - (u_sum - arr[0])) / arr[0]
break
for m_ratio, arr in _gen_sorted_data(lists, desc=False):
m_sum += arr[1]
if m_sum > lamb:
# calculate Pm
Pm = (lamb - (m_sum - arr[1])) / arr[1]
break
u_prob = u_ratio*p / (1-p + u_ratio*p)
m_prob = m_ratio*p / (1-p + m_ratio*p)
if m_prob > u_prob or (m_prob == u_prob and Pm + Pu > 1):
raise ValueError('Inadmissible values for mu and lambda:\nTry lower error rates')
return (m_prob, Pm), (u_prob, Pu)
def threshold_from_ratio(u, m, p, ratio, b=None):
"""
Takes (c,n) shaped arrays I{u} and I{m}, the marginal
probability of a correct match I{p}, and a ratio for
lambda / mu. Returns the parameters for the decision
rule that provides the desired ratio for the error rates and the
values for mu and lambda under the decision rule.
The parameters for the decision rule are the threshold on the posterior
probability of a correct match and the probability with which a
record pair should be assigned to A1 if the posterior probability is
equal to the threshold.
If supplied, C{b} is a mapping of variable indices to indices
of levels to be used for blocking (generally the highest
similarity score level).
@type u: numpy array
@param u: (c,n) shaped array of I{u}-probabilities
@type m: numpy array
@param m: (c,n) shaped array of I{m}-probabilities
@type p: C{float}
@param p: marginal probability of a correct match
@type ratio: C{float}
@param ratio: threshold on the posterior
probability of a correct match
@type b: C{None} or C{dict}
@param b: mapping of variable indices to blocking levels
@rtype: C{tuple}
@return: tuple containing (mu, lambda) for the threshold
"""
# single iterator approach
lists, sum_ = _get_lists(u, m, b)
u_sum = 0
m_sum = 1
for u_rat, arr in _gen_sorted_data(lists, desc=True):
# linear search
u_sum += arr[0]
m_sum -= arr[1]
if m_sum/u_sum < ratio:
break
else:
if sum_/u_sum >= ratio:
return 0.0, 1.0, u_sum, m_sum
m_sum = sum_
gen = _gen_sorted_data(lists, desc=False)
for rat, arr in gen:
m_sum += arr[1]
if rat == u_rat:
u_sum -= arr[0]
break
while m_sum/u_sum < ratio:
rat, arr = next(gen)
u_sum -= arr[0]
m_sum += arr[1]
Pu = (m_sum - ratio * u_sum) / (arr[1] + ratio * arr[0])
mu = u_sum + Pu * arr[0]
lamb = m_sum - Pu * arr[1]
m_odds = rat * p/(1-p)
prob = m_odds/(1+m_odds)
return prob, Pu, mu, lamb
def threshold_from_cost_function(C_MU, C_UM, C_UU=0, C_MM=0):
"""
Takes costs of allocation and returns a threshold on
the posterior proability of a correct match that
minimises total cost.
Assumes that C_MU > C_UU and C_UM > C_MM.
@type C_MU: C{int} or C{float}
@param C_MU: cost of allocating to M when true class is U
@type C_UM: C{int} or C{float}
@param C_UM: cost of allocating to U when true class is M
@type C_UU: C{int} or C{float}
@param C_UU: cost of allocating to U when true class is U
@type C_MM: C{int} or C{float}
@param C_MM: cost of allocating to M when true class is M
@rtype: C{float}
@return: threshold on the prosterior probability of a correct match
that minimises the total cost
"""
return (C_MU - C_UU) / (C_MU - C_UU + C_UM - C_MM)
def threshold_from_linear_function(p, alpha, beta):
"""
Returns the threshold that minimises the cost function:
z = alpha*mu + beta*lambda
@type p: C{float}
@param p: marginal probability of a correct match
@type alpha: C{float}
@param alpha: false negative rate
@type beta: C{float}
@param beta: false positive rate
@rtype: C{tuple}
@return: threshold on the posterior probability of a correct match
that minimises z
"""
return alpha*p / (alpha*p + beta*(1-p))
def _get_lists(u, m, b=None):
# m has shape (c,n)
# u has shape (c,n)
# b is a mapping of variable indices to blocking levels
# generate all m/u
# generate list of sublists
b = b or {}
c, n = u.shape
lists = []
m_prod = 1
for i in range(n):
if i in b:
# explicit match
m_prod *= m[b[i],i]
lis = [numpy.array([u[j,i], m[j,i]]) for j in [b[i]] if (u[j,i] or m[j,i])]
if lis:
lists.append(lis)
else:
lis = [numpy.array([u[j,i], m[j,i]]) for j in range(c) if (u[j,i] or m[j,i])]
if lis:
lists.append(lis)
return lists, 1-m_prod
def _gen_sorted_data(lists, desc):
# desc is True if descending
# sort lists
#with numpy.errstate(divide='ignore'):
lists = [sorted(lis, reverse=desc, key=lambda arr: arr[1]/arr[0]) for lis in lists]
#print lists
mult = [1, -1][desc]
# generate data
heap = []
max_index = [len(lis)-1 for lis in lists]
index = [0]*len(lists)
arr = _get_vector_data(lists, index)
#with numpy.errstate(divide='ignore'):
ratio = mult * arr[1]/arr[0]
heappush(heap, (ratio, list(arr), index))
cnt = 0
last_ratio = -1
last_arr = numpy.array([0,0])
while heap:
cnt += 1
ratio, arr, index = heappop(heap)
arr = numpy.array(arr)
if ratio == last_ratio:
last_arr += arr
else:
if not last_ratio == -1:
yield mult * last_ratio, last_arr
last_ratio, last_arr = ratio, arr
for index in _children(index, max_index):
arr = _get_vector_data(lists, index)
#with numpy.errstate(divide='ignore'):
heappush(heap, (mult * arr[1]/arr[0], list(arr), index))
if not last_ratio == -1:
yield mult * last_ratio, last_arr
#print 'count', cnt
def _get_vector_data(lists, index):
# returns an array for comparison
# vector, array([u, m])
prod = 1
for i, j in enumerate(index):
prod *= lists[i][j]
return prod
def _children(node, max_index):
# node is a list e.g. [0,2,1]
# lex smallest parent
for i, val in enumerate(node):
if val < max_index[i]:
new_node = list(node)
new_node[i] += 1
yield new_node
if val:
break
############## plots ##############
def plot_error_rates(filename, u, m, p, b=None, axis=None, format='jpeg', dpi=300, **kwargs):
"""
Takes (c,n) shaped arrays I{u} and I{m} and the marginal
probability of a correct match I{p}; then plots the admissible
region for error rates mu and lambda. The curve corresponds to error
rates pairs under single thresholds (empty A2). Points below the
curve correspond to possible error rate pairs with non empty A2.
Returns the parameters for the decision
rule that provides the desired error rates.
If supplied, C{b} is a mapping of variable indices to indices
of levels to be used for blocking (generally the highest
similarity score level).
Any keyword arguments in **kwargs are passed to
the Matplotlib 'plot' function.
@type filename: C{str}
@param filename: location of saved file
@type u: numpy array
@param u: (c,n) shaped array of I{u}-probabilities
@type m: numpy array
@param m: (c,n) shaped array of I{m}-probabilities
@type p: C{float}
@param p: marginal probability of a correct match
@type axis: C{None} or C{list}
@param axis: Matplotlib axis
@type adj: C{None} or C{dict}
@param adj: mapping of variable indices to blocking levels
"""
lists, m_sum = _get_lists(u, m, b)
gen = _gen_sorted_data(lists, desc=True)
x = [0]
y =[]
for u_ratio, arr in gen:
x.append(x[-1]+arr[0])
y.append(arr[1])
y.append(m_sum)
# accumulate the y
for i in range(-1, -len(y), -1):
y[i-1] = y[i-1] + y[i]
# plot it
import pylab
pylab.plot(x, y, **kwargs)
if not axis:
xmin, xmax = pylab.xlim()
pylab.xlim(xmin=xmin-0.02*(xmax-xmin))
ymin, ymax = pylab.ylim()
pylab.ylim(ymin=ymin-0.02*(ymax-ymin))
else:
pylab.axis(axis)
if m_sum:
pylab.annotate(r"$\lambda \geq %s$" % m_sum, xy=(0.6,0.9), xycoords='axes fraction')
pylab.annotate(r"$\mu \leq %s$" % x[-1], xy=(0.6,0.8), xycoords='axes fraction')
pylab.xlabel(r"$\mu$")
lab = pylab.ylabel(r"$\lambda$")
pylab.savefig(filename, format=format, dpi=dpi)
pylab.clf()