-
Notifications
You must be signed in to change notification settings - Fork 0
/
record_linkage.py
642 lines (569 loc) · 23.9 KB
/
record_linkage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
##Copyright (c) 2015 duncan g. smith
##
##Permission is hereby granted, free of charge, to any person obtaining a
##copy of this software and associated documentation files (the "Software"),
##to deal in the Software without restriction, including without limitation
##the rights to use, copy, modify, merge, publish, distribute, sublicense,
##and/or sell copies of the Software, and to permit persons to whom the
##Software is furnished to do so, subject to the following conditions:
##
##The above copyright notice and this permission notice shall be included
##in all copies or substantial portions of the Software.
##
##THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
##OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
##FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
##THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
##OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
##ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
##OTHER DEALINGS IN THE SOFTWARE.
from __future__ import division
from collections import defaultdict
import csv
import random
import operator
import itertools
import numpy
############## utility functions ##############
def parse_file(filename, **kwargs):
"""
A utility function that uses the I{csv} module to parse
a file containing variable names and records. Returns the
variable names and the records.
@type filename: C{str}
@param filename: path to file to be parsed
@rtype: C{tuple}
@return: a list of headers and a list of records
"""
with open(filename, 'rb') as f:
reader = csv.reader(f, **kwargs)
return next(reader), list(reader)
def write_file(filename, names, records, **kwargs):
"""
A utility function that uses the I{csv} module to write
a file containing variable names and records.
@type filename: C{str}
@param filename: file path
@type names: C{list}
@param names: list of variable names
@type records: C{list}
@param records: list of records
"""
with open(filename, 'wb') as f:
writer = csv.writer(f, **kwargs)
writer.writerow(names)
for rec in records:
writer.writerow(rec)
def aligned(records, indices):
"""
Takes I{N} records and a list of indices. Returns a transposed
(and possibly marginalised) list of records.
@type records: a sequence
@param records: I{N} records (fixed length lists of values)
@type indices: C{list}
@param indices: a list of indices that specify a transposition. e.g. The
value at index C{indices[0]} will appear at index 0 in
the transposed record
@rtype: C{list}
@return: I{N} match statuses (1 for a match, 0 otherwise)
"""
new_recs = []
for rec in records:
new_recs.append([rec[i] for i in indices])
return new_recs
def blocked(records, indices):
"""
Takes records and indices for blocking. Returns
a mapping of blocking values to a corresponding list of records.
@type records: a sequence
@param records: records as tuples of values
@type indices: C{list}
@param indices: indices of fields which are equal if a pair of records are a match
@rtype: C{defaultdict}
@return: mapping of tuples of values on blocked fields to corresponding records
"""
res = defaultdict(list)
for rec in records:
key = tuple(rec[index] for index in indices)
res[key].append(rec)
return res
def simscores(pairs, keys, sim_funcs=None, missing=''):
"""
Takes I{N} record pairs, I{n} indices for the key variables
and I{n} similarity score functions; and returns an array of
similarity scores. If I{sim_funcs} is C{None}, then comparison
defaults to strict equality.
Record pairs must be aligned on key variables.
@type pairs: a sequence
@param pairs: I{N} aligned record pairs
@type keys: a sequence
@param keys: I{n} indices of the key variables
@type sim_funcs: C{None} or C{list}
@param sim_funcs: similarity functions corresponding to key variables
@type missing: C{object}
@param missing: object denoting a missing record value
@rtype: numpy array
@return: (I{N},I{n}) shaped array containing similarity scores
or -99999999 for missing values
"""
scores = numpy.zeros((len(pairs), len(keys)), dtype=numpy.float64)
for i, (a,b) in enumerate(pairs):
for j, key in enumerate(keys):
if (a[key] == missing) or (b[key] == missing):
scores[i,j] = -99999999
elif not sim_funcs:
if a[key] == b[key]:
scores[i,j] = 1
else:
scores[i,j] = sim_funcs[j](a[key], b[key])
return scores
def categorized(simscores, limits):
"""
Takes an (N,n) shaped array containing similarity scores for I{N} record
pairs and I{n} variables. Returns a (N,c,n) shaped array of 1s and 0s such that
a 1 denotes the relevant similarity score interval. The intervals are specified
in I{limits}. Care should be taken to ensure that a missing value (perhaps specified
by a large negative integer) generates I{c} 0s. Intervals include their
upper limits, but not their lower limits.
@type simscores: numpy array
@param simscores: (N,n) shaped array of similarity scores
@type limits: C{list}
@param limits: a list containing a lower limit followed by I{c} upper limits
@rtype: numpy array
@return: (N,c,n) shaped data array
"""
arrs = []
for i, low in enumerate(limits[:-1]):
high = limits[i+1]
arr = numpy.where((low < simscores) & (simscores <= high), 1, 0)
arrs.append(arr)
shape = list(simscores.shape)
shape.insert(1,1)
return numpy.concatenate([arr.reshape(shape) for arr in arrs], axis=1)
def u_probs(data):
"""
Takes a data array of shape (N,c,n) - I{N} record pairs, I{c} similarity
score intervals, and I{n} variables. Returns an estimate for the
I{u}-probabilities based on the assumption that all record pairs
are non matches.
@type data: numpy array
@param data: (N,c,n) shaped data array
@rtype: numpy array
@return: (c,n) shaped array of estimated I{u}-probabilities
"""
N, c, n = data.shape
margin = data.sum(axis=0)
return margin / margin.sum(axis=0)
############## record linkage functions ##############
def EM(data, u=None, p=None, u_start=None, m_start=None, p_start=None, u_prior=1, m_prior=1, p_prior=1, max_its=10000, tol=1e-12):
"""
Takes a data array and optional arguments and uses Expectation
Maximization to generate either maximum likelihood or maximum a posteriori
parameter estimates. The data array has shape (N,c,n) - I{N} record pairs, I{c} similarity
score categories, and I{n} variables.
If C{u} or C{p} are provided, then these remain fixed.
C{u_start}, C{m_start} and C{p_start} are starting values. Sensible
defaults are used for any of these that are not provided.
C{u_prior}, C{m_prior} and C{p_prior} are Dirichlet priors for maximum
a posteriori estimation. The default values provide maximum likelihood
estimation. If C{int}s or C{float}s are provided, then all Dirichlet parameters
take the supplied value.
A C{ValueError} is raised if the algorithm does not converge within C{max_its} iterations.
@type data: numpy array
@param data: (N,c,n) shaped array containing similarity scores
@type u: C{None} or numpy array
@param u: fixed (c,n) shaped array of I{u}-probabilities
@type p: C{None} or C{float}
@param p: fixed marginal probability of a correct match
@type u_start: C{None} or numpy array
@param u_start: (c,n) shaped array of I{u}-probabilities
@type m_start: C{None} or numpy array
@param m_start: (c,n) shaped array of I{m}-probabilities
@type p_start: C{None} or C{float}
@param p_start: marginal probability of a correct match
@type u_prior: C{int}, C{float} or numpy array
@param u_prior: Dirichlet prior for I{u}-probabilities
@type m_prior: C{int}, C{float} or numpy array
@param m_prior: Dirichlet prior for I{m}-probabilities
@type p_prior: C{int}, C{float} or numpy array
@param p_prior: Dirichlet prior for [1-p,p]
@type max_its: C{int}
@param max_its: maximum number of iterations
@type tol: C{float}
@param tol: results are returned when consecutive log likelihoods
differ by less than this value
@rtype: tuple
@return: estimates for I{u}, I{m} and I{p}; posterior probabilities of a correct match
and the log likelihoods
@raises ValueError: if C{max_its} iterations are reached before convergence
"""
N, c, n = data.shape
fit_u = fit_p = False
if u is None:
fit_u = True
# produce starting values
if u_start is None:
u = u_probs(data)
else:
# ensure normalisation
u = u_start / u_start.sum(axis=0)
# produce starting values
if m_start is None:
m = numpy.zeros((c,n))
m[:] = 0.05/(c-1)
m[-1,:] = 0.95
else:
# ensure normalisation
m = m_start / m_start.sum(axis=0)
if p is None:
fit_p = True
p = p_start or 1/N**0.5
# avoid degenerate case of p == 1
if p == 1:
p = 0.99
last_u = u
last_m = m
log_likelihoods = []
try:
p_u, p_m = p_prior
except TypeError: # not an array
p_u = p_m = p_prior
# EM
for i in range(max_its):
# expectation
_u = numpy.prod(numpy.prod(u**data, 1), 1) * (1-p)
_m = numpy.prod(numpy.prod(m**data, 1), 1) * p
div = _u + _m
g_u = _u / div
g_m = _m / div
#
log_likelihoods.append(numpy.log(div).sum())
#
if fit_u:
u = numpy.sum(g_u.reshape((N,1,1))*data , 0) + (u_prior-1)
u = u / u.sum(axis=0)
m = numpy.sum(g_m.reshape((N,1,1))*data, 0) + (m_prior-1)
# maximization
m = m / m.sum(axis=0)
if fit_p:
p = (numpy.sum(g_m)+(p_m-1)) / (N+p_u+p_m-2)
# check for convergence
if i and log_likelihoods[-1] - log_likelihoods[-2] <= tol:
return u, m, p, g_m, log_likelihoods
raise ValueError('max_its iterations reached without convergence')
def average_frequencies(records, indices, missing=''):
"""
Takes records and indices of the key variables and returns
the reciprocals of the numbers of distinct variable values.
Used in EpiLink record linkage.
@type records: a sequence
@param records: records as tuples of values
@type indices: C{list}
@param indices: indices of key variables
@type missing: C{object}
@param missing: object denoting a missing record value
@rtype: numpy array
@return: reciprocals of numbers of distinct variable values
"""
freqs = numpy.zeros((len(indices),))
for i in range(len(indices)):
index = indices[i]
vals = set(rec[i] for rec in records)
vals.discard(missing)
freqs[i] = len(vals)
return 1/freqs
def EpiLink(simscores, w=None, error_rates=None, av_freqs=None):
"""
Takes an (N,n) shaped array containing similarity scores for I{N} record
pairs and I{n} variables, plus optional arguments. Returns a (N,) shaped array of
EpiLink scores.
If I{w} contains weights, then the weights are used to calculate EpiLink scores.
If I{w} is C{None}, then C{error_rates} and C{freqs} must be supplied so that
weights can be calculated.
Missing values are handled by using the mean weight for the relevant field.
@type simscores: numpy array
@param simscores: (N,n) shaped array of similarity scores
@type w: C{None} or numpy array
@param w: (n,) shaped array of weights
@type error_rates: C{None} or numpy array
@param error_rates: (N,n) or (n,) shaped array of error rates
@type av_freqs: C{None} or numpy array
@param av_freqs: (n,) shaped array of variable sizes (numbers of distinct categories)
@rtype: numpy array
@return: (N,) shaped array of EpiLink scores
"""
masked = numpy.ma.masked_array(simscores, mask=numpy.where(simscores < 0, 1, 0))
means = numpy.mean(masked, axis=0)
simscores = numpy.where(simscores < 0, means, simscores)
if w is None:
w = numpy.log2((1-error_rates)/av_freqs)
return (w*simscores).sum(1)/w.sum()
def likelihood_weighted_post(simscores, u, m, p):
"""
Takes the similarity scores for I{N} record pairs over I{n} variables
and combines them with Fellegi-Sunter I{u} and I{m}-probabilities
and the marginal probability of a correct match to generate
pseudo-posterior probabilities of a correct match. Essentially, it
performs linear interpolation on the likelihoods.
C{u[1]} and C{m[1]} are the probabilities of equal values given
non-matching and matching records respectively.
@type simscores: numpy array
@param simscores: (N,n) shaped array containing similarity scores
@type u: numpy array
@param u: (2,n) shaped array of I{u}-probabilities
@type m: numpy array
@param m: (2,n) shaped array of I{m}-probabilities
@type p: C{float}
@param p: marginal probability of a correct match
@rtype: numpy array
@return: I{N} pseudo-posterior probabilities of a correct match
"""
N,n = simscores.shape
arr = simscores*(m[1]/u[1]) + (1-simscores)*(m[0]/u[0])
arr = numpy.where(simscores < 0, 1, arr)
arr = numpy.prod(arr, axis=1)
post = arr * (p/(1-p))
return post/(1+post)
def log_likelihood_weighted_post(simscores, u, m, p):
"""
Takes the similarity scores for I{N} record pairs over I{n} variables
and combines them with Fellegi-Sunter I{u} and I{m}-probabilities
and the marginal probability of a correct match to generate
pseudo-posterior probabilities of a correct match. Essentially, it
performs linear interpolation on the log likelihoods / match weights.
C{u[1]} and C{m[1]} are the probabilities of equal values given
non-matching and matching records respectively.
@type simscores: numpy array
@param simscores: (N,n) shaped array containing similarity scores
@type u: numpy array
@param u: (2,n) shaped array of I{u}-probabilities
@type m: numpy array
@param m: (2,n) shaped array of I{m}-probabilities
@type p: C{float}
@param p: marginal probability of a correct match
@rtype: numpy array
@return: I{N} pseudo-posterior probabilities of a correct match
"""
N,n = simscores.shape
arr = (m[1]/u[1])**simscores * (m[0]/u[0])**(1-simscores)
arr = numpy.where(simscores < 0, 1, arr)
arr = numpy.prod(arr, axis=1)
post = arr * (p/(1-p))
return post/(1+post)
def Winkler_weighted_post(simscores, u, m, p, c=9/2):
"""
Takes the similarity scores for I{N} record pairs over I{n} variables
and combines them with Fellegi-Sunter I{u} and I{m}-probabilities
and the marginal probability of a correct match to generate
pseudo-posterior probabilities of a correct match. The approach used
is that described in:
Winkler, W. E. (1990). String Comparator Metrics and Enhanced
Decision Rules in the Fellegi-Sunter Model of Record Linkage.
Proceedings of the Section on Survey Research Methods (American
Statistical Association) pp.354-359
C{u[1]} and C{m[1]} are the probabilities of equal values given
non-matching and matching records respectively.
@type simscores: numpy array
@param simscores: (N,n) shaped array containing similarity scores
@type u: numpy array
@param u: (2,n) shaped array of I{u}-probabilities
@type m: numpy array
@param m: (2,n) shaped array of I{m}-probabilities
@type p: C{float}
@param p: marginal probability of a correct match
@type c: C{float}
@param c: additional interpolation parameter
@rtype: numpy array
@return: I{N} pseudo-posterior probabilities of a correct match
"""
N,n = simscores.shape
arr = (m[1]/u[1])**(1-(1-simscores)*c) * (m[0]/u[0])**((1-simscores)*c)
arr = numpy.where(simscores < 0, 1, arr)
arr = numpy.prod(arr, axis=1)
post = arr * (p/(1-p))
return post/(1+post)
############## outputs ##############
def matches(pairs, ID_index):
"""
Takes I{N} record pairs and an index for a unique ID field. Returns
an array of match statuses.
@type pairs: a sequence
@param pairs: I{N} record pairs aligned on C{ID_index}
@type ID_index: C{int}
@param ID_index: index of ID field (which is equal iff records are a match)
@rtype: numpy array
@return: I{N} match statuses (1 for a match, 0 otherwise)
"""
arr = numpy.zeros((len(pairs),))
for i, (rec1, rec2) in enumerate(pairs):
if rec1[ID_index] == rec2[ID_index]:
arr[i] = 1
return arr
def predictions(scores, cutoff, op=operator.gt):
"""
Takes I{N} scores (e.g. posterior probabilities or match weights),
cutoff and an operator. Returns an array of predictions.
@type scores: numpy array
@param scores: I{N} similarity scores
@type cutoff: C{int} or C{float}
@param cutoff: cutoff for classification
@type op: C{function}
@param op: function returning a C{bool}
@rtype: numpy array
@return: predicted values (1 for a match, 0 otherwise
if using the default function)
"""
return numpy.where(op(scores, cutoff), 1, 0)
def classification_table(matches, predictions):
"""
Takes I{N} match statuses and I{N} predictions. Returns
a classification table.
@type matches: numpy array
@param matches: I{N} match statuses (1 for a match, 0 otherwise)
@type predictions: numpy array
@param predictions: I{N} predictions (1 for a match, 0 otherwise)
@rtype: numpy array
@return: (2,2) shaped array containing true and false positives
in the first row, and false and true negatives in the second row
"""
# [[TP, FP],
# [FN, TN]]
if not matches.shape == predictions.shape:
raise ValueError('matches and predictions must have equal lengths')
tbl = numpy.zeros((2,2))
tbl[0,0] = (matches * predictions).sum()
tbl[0,1] = predictions.sum() - tbl[0,0]
tbl[1,0] = matches.sum() - tbl[0,0]
tbl[1,1] = len(matches) - tbl.sum()
return tbl
############## plots ##############
def log_likelihood_plot(filename, log_likelihoods, axis=None, format='jpeg', dpi=300, **kwargs):
"""
Creates and saves a plot of log likelihoods.
Any keyword arguments in **kwargs are passed to
the Matplotlib 'plot' function.
@type filename: C{str}
@param filename: location of saved file
@type log_likelihoods: sequence
@param log_likelihoods: log likelihoods
@type axis: C{None} or C{list}
@param axis: Matplotlib axis
"""
import pylab
pylab.plot(range(len(log_likelihoods)), log_likelihoods, **kwargs)
if axis:
pylab.axis(axis)
pylab.xlabel(r"$Iterations$")
lab = pylab.ylabel(r"$\ln(f(x\vert\Phi))$")
pylab.savefig(filename, format=format, dpi=dpi)
pylab.clf()
def series(pairs, ID_index, post):
"""
Takes pairs of records and corresponding posterior probabilities and
returns a series suitable for plotting ROC and precision recall
curves.
Record pairs must be aligned on a unique ID field so that match statuses
can be determined.
@type pairs: a sequence
@param pairs: aligned record pairs
@type ID_index: C{int}
@param ID_index: index of ID field (which is equal iff records are a match)
@type post: a sequence
@param post: posterior match probabilities for I{pairs}
@rtype: C{list}
@return: (posterior probability, match status) pairs sorted from
largest to smallest posterior probability
"""
tups = [(post[i], pair[0][ID_index] == pair[1][ID_index]) for i, pair in enumerate(pairs)]
random.shuffle(tups)
tups.sort(reverse=True)
return [tup[1] for tup in tups]
def ROC_curve(filename, series, legends, loc='lower right',
colours=None, axis=None, format='jpeg', dpi=300, **kwargs):
"""
Creates and saves an ROC curve.
Any keyword arguments in **kwargs are passed to
the Matplotlib 'plot' function.
@type filename: C{str}
@param filename: location of saved file
@type series: C{list}
@param series: (posterior probability, match status) pairs sorted from
largest to smallest posterior probability
@type legends: C{list}
@param legends: names for series
@type loc: C{str}
@param loc: Matplotlib location for legends
@type colours: C{None} or C{list}
@param colours: Matplotlib colours
@type axis: C{None} or C{list}
@param axis: Matplotlib axis
"""
if not len(series) == len(legends):
raise ValueError('Number of series does not match number of legends')
import pylab
pylab.hold(True)
if axis:
pylab.axis(axis)
pylab.gca().set_color_cycle(colours)
for values in series:
x_ser = [0]
y_ser = [0]
positives = values.count(True)
negatives = len(values) - positives
tp = fp = 0
for val in values:
tp += val
fp += 1-val
x_ser.append(fp/negatives)
y_ser.append(tp/positives)
pylab.plot(x_ser, y_ser, **kwargs)
pylab.plot([-0.02,1], [-0.02,1], '%s--' % 'k')
pylab.xlabel('False positive rate')
lab = pylab.ylabel('True positive rate')
pylab.legend(legends, loc=loc)
pylab.savefig(filename, format=format, dpi=dpi)
pylab.clf()
def precision_recall_curve(filename, series, legends, loc='lower left',
colours=None, axis=None, format='jpeg', dpi=300, **kwargs):
"""
Creates and saves a precision recall curve.
Any keyword arguments in **kwargs are passed to
the Matplotlib 'plot' function.
@type filename: C{str}
@param filename: location of saved file
@type series: C{list}
@param series: (posterior probability, match status) pairs sorted from
largest to smallest posterior probability
@type legends: C{list}
@param legends: names for series
@type loc: C{str}
@param loc: Matplotlib location for legends
@type colours: C{None} or C{list}
@param colours: Matplotlib colours
@type axis: C{None} or C{list}
@param axis: Matplotlib axis
"""
if not len(series) == len(legends):
raise ValueError('Number of series does not match number of legends')
import pylab
pylab.hold(True)
if axis:
pylab.axis(axis)
pylab.gca().set_color_cycle(colours)
for values in series:
x_ser = [0]
y_ser = [1]
positives = values.count(True)
negatives = len(values) - positives
tp = fp = 0
for val in values:
tp += val
fp += 1-val
x_ser.append(tp/positives)
y_ser.append(tp/(tp+fp))
pylab.plot(x_ser, y_ser, **kwargs)
pylab.xlabel('Recall')
lab = pylab.ylabel('Precision')
pylab.legend(legends, loc=loc)
pylab.savefig(filename, format=format, dpi=dpi)
pylab.clf()