-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathNSString+DamerauLevenshtein.m
469 lines (347 loc) · 14.5 KB
/
NSString+DamerauLevenshtein.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
//
// NSString+DamerauLevenshtein.m
// DamerauLevenshtein
//
// Created by Jan on 02.01.11.
// Copyright 2011-2015 geheimwerk.de. All rights reserved.
//
// MIT License. See NSString+DamerauLevenshtein.h for details.
// Based on a code snippet by Wandering Mango:
// http://weblog.wanderingmango.com/articles/14/fuzzy-string-matching-and-the-principle-of-pleasant-surprises
#import "NSString+DamerauLevenshtein.h"
#import "JXLDStringDistanceUtilities.h"
#import "JXLDStringTokenUtilities.h"
@implementation NSString (DamerauLevenshtein)
CFIndex levensteinStringDistance(CFStringRef string1, CFStringRef string2);
CFIndex levensteinUniCharDistance(const UniChar *string1_chars, CFIndex n, const UniChar *string2_chars, CFIndex m);
CFIndex levensteinUniCharDistanceCore(const UniChar *string1_chars, CFIndex n, const UniChar *string2_chars, CFIndex m);
CFIndex tokenLengthTotal(CFRange token_ranges[], size_t token_count);
float valuePhrase(const UniChar *string1_chars, CFIndex n, const UniChar *string2_chars, CFIndex m);
float valueWords(CFStringRef string1, const UniChar *string1_chars, CFIndex n, CFStringRef string2, const UniChar *string2_chars, CFIndex m);
float semanticStringDistance(CFStringRef string1, CFStringRef string2, JXLDWeights weights);
- (NSUInteger)distanceFromString:(NSString *)comparisonString;
{
return [self distanceFromString:comparisonString options:0];
}
- (NSUInteger)distanceFromString:(NSString *)comparisonString options:(JXLDStringDistanceOptions)options;
{
CFStringRef string1, string2;
CFMutableStringRef string1_mutable = NULL;
CFMutableStringRef string2_mutable = NULL;
if (options & JXLDLiteralComparison) {
string1 = (__bridge CFStringRef)self;
string2 = (__bridge CFStringRef)comparisonString;
}
else {
string1_mutable = (CFMutableStringRef)CFBridgingRetain([self mutableCopy]);
string2_mutable = (CFMutableStringRef)CFBridgingRetain([comparisonString mutableCopy]);
// Processing options and pre-processing the strings accordingly
// The string lengths may change during pre-processing
jxld_CFStringPreprocessWithOptions(string1_mutable, options);
jxld_CFStringPreprocessWithOptions(string2_mutable, options);
string1 = (CFStringRef)string1_mutable;
string2 = (CFStringRef)string2_mutable;
}
NSUInteger distance = levensteinStringDistance(string1, string2);
if (string1_mutable != NULL) CFRelease(string1_mutable);
if (string2_mutable != NULL) CFRelease(string2_mutable);
return distance;
}
CFIndex levensteinStringDistance(CFStringRef string1, CFStringRef string2) {
// This implementation can be improved further if execution speed or memory constraints should ever pose a problem:
// http://en.wikipedia.org/wiki/Levenstein_Distance#Possible_improvements
// Step 1a (Steps follow description at http://www.merriampark.com/ld.htm )
CFIndex n, m;
n = CFStringGetLength(string1);
m = CFStringGetLength(string2);
UniChar *string1_buffer = NULL;
UniChar *string2_buffer = NULL;
CFIndex distance = kCFNotFound;
// This loop is here just so we don’t have to use goto
while (distance == kCFNotFound) {
if (n == 0) {
distance = m;
break;
}
if (m == 0) {
distance = n;
break;
}
// Prepare access to chars array for string1
const UniChar *string1_chars;
jxld_CFStringPrepareUniCharBuffer(string1, &string1_chars, &string1_buffer, CFRangeMake(0, n));
// Prepare access to chars array for string2
const UniChar *string2_chars;
jxld_CFStringPrepareUniCharBuffer(string2, &string2_chars, &string2_buffer, CFRangeMake(0, m));
distance = levensteinUniCharDistanceCore(string1_chars, n, string2_chars, m);
if (string1_buffer != NULL) {
free(string1_buffer);
string1_buffer = NULL;
}
if (string2_buffer != NULL) {
free(string2_buffer);
string2_buffer = NULL;
}
}
return distance;
}
CFIndex levensteinUniCharDistanceCore(const UniChar *string1_chars, CFIndex n, const UniChar *string2_chars, CFIndex m) {
#define string1CharacterAtIndex(A) string1_chars[(A)]
#define string2CharacterAtIndex(A) string2_chars[(A)]
CFIndex distance = 0;
// Step 1b
// Indexes into strings string1 and string2
CFIndex i; // Iterates through string1
CFIndex j; // Iterates through string2
CFIndex cost;
// Ignore common prefix (reducing memory footprint).
while (m > 0 && n > 0 && (*string1_chars == *string2_chars)) {
m -= 1;
n -= 1;
string1_chars++;
string2_chars++;
}
// Ignore common suffix (reducing memory footprint).
while (m > 0 && n > 0 && (string1_chars[n-1] == string2_chars[m-1])) {
m -= 1;
n -= 1;
}
// This implementation is based on Chas Emerick’s Java implementation:
// http://www.merriampark.com/ldjava.htm
CFIndex *d = calloc(n+1, sizeof(CFIndex)); // Cost array, horizontally
CFIndex *p = calloc(n+1, sizeof(CFIndex)); // 'previous' cost array, horizontally
CFIndex *_d; // Placeholder to assist in swapping p and d
#ifndef DISABLE_DAMERAU_TRANSPOSITION
CFIndex *p2 = calloc(n+1, sizeof(CFIndex)); // cost array before 'previous', horizontally
#endif
// Step 2
for (i = 0; i <= n; i++) {
p[i] = i;
}
// Step 3 and 4
for (j = 1; j <= m; j++) {
d[0] = j;
for (i = 1; i <= n; i++) {
// Step 5
cost = (string1CharacterAtIndex(i-1) == string2CharacterAtIndex(j-1)) ? 0 : 1;
// Step 6
// Minimum of cell to the left+1, to the top+1, diagonally left and up +cost
d[i] = jxld_smallestCFIndex(d[i-1]+1, p[i]+1, p[i-1]+cost);
#ifndef DISABLE_DAMERAU_TRANSPOSITION
// This conditional adds Damerau transposition to the Levenshtein distance
if (i > 1 && j > 1
&& string1CharacterAtIndex(i-1) == string2CharacterAtIndex(j-2)
&& string1CharacterAtIndex(i-2) == string2CharacterAtIndex(j-1) )
{
d[i] = MIN(d[i], p2[i-2] + cost );
}
#endif
}
// Copy current distance counts to 'previous row' distance counts
#ifdef DISABLE_DAMERAU_TRANSPOSITION
_d = p;
#else
_d = p2;
p2 = p;
#endif
p = d;
d = _d;
}
// Our last action in the above loop was to switch d and p, so p now
// actually has the most recent cost counts
distance = p[n];
free(d);
free(p);
#ifndef DISABLE_DAMERAU_TRANSPOSITION
free(p2);
#endif
return distance;
#undef string1CharacterAtIndex
#undef string2CharacterAtIndex
}
CFIndex levensteinUniCharDistance(const UniChar *string1_chars, CFIndex n, const UniChar *string2_chars, CFIndex m) {
CFIndex distance = 0;
if (n == 0) {
distance = m;
return distance;
}
if (m == 0) {
distance = n;
return distance;
}
distance = levensteinUniCharDistanceCore(string1_chars, n, string2_chars, m);
return distance;
}
float valuePhrase(const UniChar *string1_chars, CFIndex n, const UniChar *string2_chars, CFIndex m) {
float normalizedDistance = 0.0f;
NSUInteger longStringLength = MAX(n, m);
CFIndex levensteinDistance = levensteinUniCharDistance(string1_chars, n, string2_chars, m);
normalizedDistance = (float)levensteinDistance/longStringLength;
return normalizedDistance;
}
CFIndex tokenLengthTotal(CFRange token_ranges[], size_t token_count) {
CFIndex token_length_total = 0;
for (size_t i = 0; i < token_count; i++) {
token_length_total += token_ranges[i].length;
}
return token_length_total;
}
float valueWords(CFStringRef string1, const UniChar *string1_chars, CFIndex n, CFStringRef string2, const UniChar *string2_chars, CFIndex m) {
// We might be able to speed this up using JXTrie
// We could penalize words that are similar the farther their indexes are apart
CFRange *word_ranges1, *word_ranges2;
size_t word_count1 = jxst_CFStringPrepareTokenRangesArray(string1, CFRangeMake(0, n), kCFStringTokenizerUnitWord, &word_ranges1, NULL);
size_t word_count2 = jxst_CFStringPrepareTokenRangesArray(string2, CFRangeMake(0, m), kCFStringTokenizerUnitWord, &word_ranges2, NULL);
CFIndex word_length_total1 = tokenLengthTotal(word_ranges1, word_count1);
CFIndex word_length_total2 = tokenLengthTotal(word_ranges2, word_count2);
CFIndex longer_word_length_total = MAX(word_length_total1, word_length_total2);
CFIndex distance_total = 0;
CFRange word1Range, word2Range;
for (size_t i = 0; i < word_count1; i++) {
word1Range = word_ranges1[i];
CFIndex best_distance = m;
for (size_t j = 0; j < word_count2; j++) {
word2Range = word_ranges2[j];
CFIndex this_distance = levensteinUniCharDistance(&(string1_chars[word1Range.location]), word1Range.length,
&(string2_chars[word2Range.location]), word2Range.length);
if (this_distance < best_distance) best_distance = this_distance;
if (this_distance == 0) {
break;
}
}
distance_total += best_distance;
}
free(word_ranges1);
free(word_ranges2);
return distance_total/(float)longer_word_length_total;
}
float semanticStringDistance(CFStringRef string1, CFStringRef string2, JXLDWeights weights) {
CFIndex n, m;
n = CFStringGetLength(string1);
m = CFStringGetLength(string2);
UniChar *string1_buffer = NULL;
UniChar *string2_buffer = NULL;
float distance;
// This loop is here just so we don’t have to use goto
while (1) {
if (n == 0) {
distance = m;
break;
}
if (m == 0) {
distance = n;
break;
}
// Prepare access to chars array for string1
const UniChar *string1_chars;
jxld_CFStringPrepareUniCharBuffer(string1, &string1_chars, &string1_buffer, CFRangeMake(0, n));
// Prepare access to chars array for string2
const UniChar *string2_chars;
jxld_CFStringPrepareUniCharBuffer(string2, &string2_chars, &string2_buffer, CFRangeMake(0, m));
float phrase_value = (weights.phrase_to_word >= 1.0f) ? 0.0f : valuePhrase(string1_chars, n, string2_chars, m);
float words_value = (weights.phrase_to_word <= 0.0f) ? 0.0f : valueWords(string1, string1_chars, n, string2, string2_chars, m);
float length_value = ABS(n - m)/(float)MAX(n, m);
float phrase_weighted = phrase_value * weights.phrase_to_word;
float words_weighted = words_value * (1.0f - weights.phrase_to_word);
float min, max;
if (phrase_weighted < words_weighted) {
min = phrase_weighted;
max = words_weighted;
} else {
min = words_weighted;
max = phrase_weighted;
}
distance = (min * weights.min
+ max * weights.max
+ length_value * weights.length);
break;
}
if (string1_buffer != NULL) free(string1_buffer);
if (string2_buffer != NULL) free(string2_buffer);
return distance;
}
- (float)semanticDistanceFromString:(NSString *)comparisonString;
{
return [self semanticDistanceFromString:comparisonString weights:JXLDWeightsDefault()];
}
- (float)semanticDistanceFromString:(NSString *)comparisonString weights:(JXLDWeights)weights;
{
JXLDStringDistanceOptions options = JXLDDelimiterInsensitiveComparison | JXLDWhitespaceTrimmingComparison;
CFStringRef string1, string2;
CFMutableStringRef string1_mutable = (CFMutableStringRef)CFBridgingRetain([self mutableCopy]);
CFMutableStringRef string2_mutable = (CFMutableStringRef)CFBridgingRetain([comparisonString mutableCopy]);
// Processing options and pre-processing the strings accordingly
// The string lengths may change during pre-processing
jxld_CFStringPreprocessWithOptions(string1_mutable, options);
jxld_CFStringPreprocessWithOptions(string2_mutable, options);
string1 = (CFStringRef)string1_mutable;
string2 = (CFStringRef)string2_mutable;
float distance = semanticStringDistance(string1, string2, weights);
CFRelease(string1_mutable);
CFRelease(string2_mutable);
return distance;
}
- (float)semanticSimilarityToString:(NSString *)comparisonString;
{
return (1.0f - [self semanticDistanceFromString:comparisonString weights:JXLDWeightsDefault()]);
}
- (float)semanticSimilarityToString:(NSString *)comparisonString weights:(JXLDWeights)weights;
{
return (1.0f - [self semanticDistanceFromString:comparisonString weights:weights]);
}
- (float)normalizedDistanceFromString:(NSString *)comparisonString;
{
return [self normalizedDistanceFromString:comparisonString options:0 maximumDistance:FLT_MAX];
}
- (float)normalizedDistanceFromString:(NSString *)comparisonString options:(JXLDStringDistanceOptions)options;
{
return [self normalizedDistanceFromString:comparisonString options:options maximumDistance:FLT_MAX];
}
- (float)normalizedDistanceFromString:(NSString *)comparisonString options:(JXLDStringDistanceOptions)options maximumDistance:(float)maxDistance;
{
NSUInteger selfLength = self.length;
NSUInteger comparisonStringLength = comparisonString.length;
float normalizedDistance = jxld_normalizeDistance(selfLength, comparisonStringLength, maxDistance, ^NSUInteger{
return [self distanceFromString:comparisonString options:options];
});
return normalizedDistance;
}
- (float)similarityToString:(NSString *)comparisonString;
{
return (1.0f - [self normalizedDistanceFromString:comparisonString options:0 maximumDistance:FLT_MAX]);
}
- (float)similarityToString:(NSString *)comparisonString options:(JXLDStringDistanceOptions)options;
{
return (1.0f - [self normalizedDistanceFromString:comparisonString options:options maximumDistance:FLT_MAX]);
}
- (float)similarityToString:(NSString *)comparisonString options:(JXLDStringDistanceOptions)options minimumSimilarity:(float)minSimilarity;
{
return (1.0f - [self normalizedDistanceFromString:comparisonString options:options maximumDistance:(1.0f - minSimilarity)]);
}
- (BOOL)hasSimilarityToString:(NSString *)comparisonString options:(JXLDStringDistanceOptions)options minimumSimilarity:(float)minSimilarity;
{
return ((1.0f - [self normalizedDistanceFromString:comparisonString options:options maximumDistance:(1.0f - minSimilarity)]) >= minSimilarity);
}
- (NSComparisonResult)jxld_compare:(NSString *)comparisonString options:(JXLDStringDistanceOptions)options;
{
NSString *string1, *string2;
CFMutableStringRef string1_mutable = (CFMutableStringRef)CFBridgingRetain([self mutableCopy]);
CFMutableStringRef string2_mutable = (CFMutableStringRef)CFBridgingRetain([comparisonString mutableCopy]);
jxld_CFStringPreprocessWithOptions(string1_mutable, options);
jxld_CFStringPreprocessWithOptions(string2_mutable, options);
string1 = (__bridge NSString *)string1_mutable;
string2 = (__bridge NSString *)string2_mutable;
NSComparisonResult comparisonResult = [string1 compare:string2 options:NSLiteralSearch];
CFRelease(string1_mutable);
CFRelease(string2_mutable);
return comparisonResult;
}
- (NSString *)jxld_transformWithOptions:(JXLDStringDistanceOptions)options;
{
NSString *result;
CFMutableStringRef string1_mutable = (__bridge CFMutableStringRef)[self mutableCopy];
jxld_CFStringPreprocessWithOptions(string1_mutable, options);
result = (__bridge NSString *)string1_mutable;
return result;
}
@end