-
Notifications
You must be signed in to change notification settings - Fork 5
/
LSH.ecl
570 lines (473 loc) · 20.4 KB
/
LSH.ecl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
/**
* Implementation of Locality Sensitive Hashing, at scale. Both index creation and searching
* are supported.
*
* This module contains some toplevel exported attributes, primarily data types and record
* definitions, to make it easier for external code to interface with the data created here.
* A shared module contains the implementation, then two exported modules act as the interface
* (Build and Search). The exported functions are:
*
* Build('<fileScope>').CreateFiles()
* Search('<fileScope>').SearchMany()
* Search('<fileScope>').SearchOne()
*
* <fileScope> refers to an HPCC logical file scope. Files created by this module will be
* created there, and the search functions will expect those same files to be available.
* The build process writes a configuration file, containing build parameters, to the file
* scope; the search functions will use that config to ensure that the same parameters
* are used. This makes it easy to create different search indexes with different parameters
* and (more) easily switch between them during read.
*
* The tag "entity" is used to denote "the thing being indexed/searched." In reality, you
* can index/search anything that is a string, and an entity ID is really just a GUID.
* Also, note that the string can be anything: a simple string, a string that is a
* concatenation of other data, etc. As long as you normalize and construct the string the
* same way for both building and searching, It Just Works. You just need to use an
* EntityLayout record structure for both building and searching.
*
* Building the indexes requires passing three LSH-specific parameters. Changing these parameters
* "tunes" the build and search. Tuning is beyond the scope of this comment block;
* Wikipedia is your friend (maybe). Those three parameters are:
*
* denseSignatureSize: LSH signatures have a size
* hashBandSize: LSH signatures are cut up and hashed, and those hash
* values are what are searched to create the initial
* matching candidate list. This parameter indicates how
* large the band is.
* ngramLength: The raw string data is cut up into ngrams (character runs).
* Typically this value is 2, but you can change it if you want.
*
* Searching requires you to determine the minimum hash band overlap for the initial
* filter. Think of it this way: Entity information (both search corpus and search term) is
* cut up in N bands. Out of N, how many bands should overlap between a search term and a
* corpus entry for you to consider it a candidate match? N/2 means you want at least a
* 50% overlap, for example.
*
* A complete example of both build and search is in a comment block at the end of this file.
*
* Tutorial on LSH: https://www.pinecone.io/learn/series/faiss/locality-sensitive-hashing/
* Another tutorial: https://medium.com/@hbrylkowski/locality-sensitive-hashing-explained-304eb39291e4
*
* Origin: https://github.com/hpccsystems-solutions-lab/Useful_ECL
*/
EXPORT LSH := MODULE
IMPORT Std;
//--------------------------------------------------------------------
SHARED DEFAULT_NGRAM_LENGTH := 2;
//--------------------------------------------------------------------
EXPORT EntityID_t := UNSIGNED8;
EXPORT Hash_t := UNSIGNED8;
EXPORT EntityLayout := RECORD
EntityID_t id; // Entity GUID
UTF8 s; // Data associated with entity
END;
EXPORT HashFunctionLayout := RECORD
SET OF Hash_t hash_value_set;
END;
EXPORT DenseSigLayout := RECORD
EntityID_t id;
SET OF Hash_t sig;
END;
EXPORT LookupLayout := RECORD
Hash_t lookup_hash;
EntityID_t id;
END;
EXPORT SearchResultLayout := RECORD
EntityID_t search_id;
EntityID_t entity_id;
UNSIGNED6 match_cnt;
REAL8 similarity;
END;
EXPORT ConfigLayout := RECORD
UNSIGNED1 ngram_length;
UNSIGNED1 hash_band_size;
SET OF Hash_t hashes;
END;
//====================================================================
SHARED Util(STRING fileScope) := MODULE
EXPORT FS := MODULE
SHARED fsPrefix := Std.Str.RemoveSuffix(fileScope, '::');
EXPORT LOOKUP_FILENAME := fsPrefix + '::lookup';
EXPORT SIGNATURES_FILENAME := fsPrefix + '::signatures';
EXPORT CONFIG_FILENAME := fsPrefix + '::config';
EXPORT lookupDS := DATASET(LOOKUP_FILENAME, LookupLayout, FLAT);
EXPORT signaturesDS := INDEX({DenseSigLayout.id}, {DenseSigLayout}, SIGNATURES_FILENAME);
EXPORT configDS := DATASET(CONFIG_FILENAME, ConfigLayout, FLAT);
END;
//--------------------------------------------------------------------
EXPORT CreateHashFunctions(UNSIGNED2 hashCount) := FUNCTION
hashes := DATASET
(
hashCount,
TRANSFORM
(
{
Hash_t h
},
SELF.h := (RANDOM() << 32) | RANDOM()
)
);
RETURN SET(hashes, h);
END;
//--------------------------------------------------------------------
EXPORT SET OF Hash_t MakeDenseSignature(CONST UTF8 s, UNSIGNED1 ngram_length, SET OF Hash_t hashes) := EMBED(C++)
#option pure
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
typedef unsigned __int64 HashType;
inline size_t countTrailingBytes(byte value)
{
if (value < 0xc0) return 0;
if (value < 0xe0) return 1;
if (value < 0xf0) return 2;
if (value < 0xf8) return 3;
if (value < 0xfc) return 4;
return 5;
}
inline size_t bytesForChar(byte ch)
{
size_t trailingByteCount = countTrailingBytes(ch);
if (trailingByteCount > 4)
return 0;
return trailingByteCount + 1;
}
size_t byteCountForChar(const char* inputString, size_t inputStringSize, size_t currentPos)
{
size_t byteCount = bytesForChar(inputString[currentPos]);
if (byteCount == 0 || (currentPos + byteCount > inputStringSize))
{
// Error condition
rtlFail(-1, "Invalid UTF-8 encoding");
}
return byteCount;
}
#body
std::vector<HashType> minHashes;
__lenResult = 0;
__result = nullptr;
__isAllResult = false;
if (lenS >= ngram_length && ngram_length > 0 && lenHashes > 0)
{
const HashType* hashSet = static_cast<const HashType*>(hashes);
unsigned long numHashes = lenHashes / sizeof(HashType);
size_t sSize = rtlUtf8Size(lenS, s);
size_t currentPos = 0;
std::vector<std::pair<size_t, size_t>> byteSizes;
std::vector<HashType> ngramHashes;
// Precompute bytes used for each character
byteSizes.reserve(lenS);
for (size_t x = 0; x < lenS; x++)
{
size_t numBytesToCopy = byteCountForChar(s, sSize, currentPos);
byteSizes.push_back(std::make_pair(currentPos, numBytesToCopy));
currentPos += numBytesToCopy;
}
for (size_t x = 0; x < (lenS - ngram_length + 1); x++)
{
// Extract ngram bytes
size_t numBytesToCopy = 0;
currentPos = byteSizes[x].first;
for (size_t y = 0; y < ngram_length; y++)
numBytesToCopy += byteSizes[x + y].second;
ngramHashes.push_back(rtlHash64Data(numBytesToCopy, s + currentPos, HASH64_INIT));
}
// Find the min hash for each hash function
for (size_t x = 0; x < numHashes; x++)
{
HashType minHash = UINT64_MAX;
for (auto& ngramHash : ngramHashes)
minHash = std::min(minHash, ngramHash ^ hashSet[x]);
minHashes.push_back(minHash);
}
// Sort the hash values
std::sort(minHashes.begin(), minHashes.end());
// Compute result buffer size and allocate
__lenResult = sizeof(HashType) * minHashes.size();
__result = rtlMalloc(__lenResult);
// Populate the result buffer
HashType* outPtr = static_cast<HashType*>(__result);
for (size_t x = 0; x < minHashes.size(); x++)
{
outPtr[x] = minHashes[x];
}
}
ENDEMBED;
//--------------------------------------------------------------------
EXPORT CreateHashBands(DATASET(DenseSigLayout) entitySignatures, UNSIGNED1 bandSize) := FUNCTION
bands := NORMALIZE
(
entitySignatures,
COUNT(LEFT.sig) / bandSize,
TRANSFORM
(
LookupLayout,
startPos := (COUNTER - 1) * bandSize + 1;
endPos := startPos + bandSize - 1;
SELF.lookup_hash := HASH64(LEFT.sig[startPos .. endPos]),
SELF := LEFT
)
);
RETURN bands;
END;
//--------------------------------------------------------------------
// Assumption: set1 and set2 are sorted ascending
EXPORT REAL8 JaccardSimilarity(SET OF Hash_t set1, SET OF Hash_t set2) := EMBED(C++)
#option pure;
typedef unsigned __int64 HashType;
#body
const HashType * numSet1 = static_cast<const HashType *>(set1);
unsigned long numElements1 = lenSet1 / sizeof(HashType);
unsigned long pos1 = 0;
const HashType * numSet2 = static_cast<const HashType *>(set2);
unsigned long numElements2 = lenSet2 / sizeof(HashType);
unsigned long pos2 = 0;
unsigned long intersectionCount = 0;
unsigned long unionCount = 0;
while (pos1 < numElements1 || pos2 < numElements2)
{
if (pos1 < numElements1 && pos2 < numElements2)
{
++unionCount;
if (numSet1[pos1] == numSet2[pos2])
{
++intersectionCount;
++pos1;
++pos2;
}
else if (numSet1[pos1] < numSet2[pos2])
{
++pos1;
}
else
{
++pos2;
}
}
else if (pos1 < numElements1)
{
unionCount += (numElements1 - pos1);
break;
}
else
{
unionCount += (numElements2 - pos2);
break;
}
}
return static_cast<double>(intersectionCount) / static_cast<double>(unionCount);
ENDEMBED;
END; // Module Util
//====================================================================
EXPORT Build(STRING fileScope) := MODULE
SHARED UtilMod := Util(fileScope);
SHARED FSMod := UtilMod.FS;
EXPORT CreateFiles(DATASET(EntityLayout) entities,
UNSIGNED2 denseSignatureSize,
UNSIGNED2 hashBandSize,
UNSIGNED1 ngramLength = DEFAULT_NGRAM_LENGTH) := FUNCTION
// Hashes we will use for MinHashing
hashSet := UtilMod.CreateHashFunctions(denseSignatureSize);
// Distribute the corpus for efficiency
distEntities := DISTRIBUTE(entities(LENGTH(s) >= ngramLength), SKEW(0.02));
entitySigs := PROJECT
(
distEntities,
TRANSFORM
(
DenseSigLayout,
SELF.id := LEFT.id,
SELF.sig := UtilMod.MakeDenseSignature(LEFT.s, ngramLength, hashSet)
)
);
createSignaturesFileAction := BUILD(FSMod.signaturesDS, entitySigs, OVERWRITE);
lookupInfo := UtilMod.CreateHashBands(entitySigs, hashBandSize);
createLookupFileAction := OUTPUT(lookupInfo, {lookupInfo}, FSMod.LOOKUP_FILENAME, COMPRESSED, OVERWRITE);
// Create a single-record config file that records some of these parameters
config := DATASET
(
[
{
ngramLength,
hashBandSize,
hashSet
}
],
ConfigLayout
);
createConfigFileAction := OUTPUT(config, {config}, FSMod.CONFIG_FILENAME, COMPRESSED, OVERWRITE);
buildAllAction := PARALLEL
(
createSignaturesFileAction,
createLookupFileAction,
createConfigFileAction
);
RETURN IF
(
hashBandSize < denseSignatureSize AND denseSignatureSize % hashBandSize = 0,
buildAllAction,
FAIL(-1, 'hashBandSize must be both less than denseSignatureSize and an even divisor of it')
);
END;
END; // Module Build
//====================================================================
EXPORT Search(STRING fileScope) := MODULE
SHARED UtilMod := Util(fileScope);
SHARED FSMod := UtilMod.FS;
EXPORT SearchMany(DATASET(EntityLayout) searchEntities, UNSIGNED2 minHashBandMatchCount, REAL8 minSimilarity) := FUNCTION
// Files we need
lookupDS := FSMod.lookupDS;
corpusSignaturesDS := FSMod.signaturesDS;
config := FSMod.configDS;
// Create dense signatures for the search entities
searchSigs := PROJECT
(
searchEntities,
TRANSFORM
(
DenseSigLayout,
SELF.id := LEFT.id,
SELF.sig := UtilMod.MakeDenseSignature(LEFT.s, config[1].ngram_length, config[1].hashes)
)
);
// Break up signatures into hash bands
searchHashBands := UtilMod.CreateHashBands(searchSigs, config[1].hash_band_size);
// Find initial matches
matches := JOIN
(
lookupDS,
searchHashBands,
LEFT.lookup_hash = RIGHT.lookup_hash,
TRANSFORM
(
{
EntityID_t search_id,
EntityID_t entity_id
},
SELF.entity_id := LEFT.id,
SELF.search_id := RIGHT.id
),
SMART
);
// Count matches and filter out those that don't match enough
matchSummary0 := TABLE
(
matches,
{
search_id,
entity_id,
UNSIGNED6 match_cnt := COUNT(GROUP)
},
search_id, entity_id,
MERGE
);
matchSummary := matchSummary0(match_cnt >= minHashBandMatchCount);
// Append search signatures
withSearchSig := JOIN
(
matchSummary,
searchSigs,
LEFT.search_id = RIGHT.id,
TRANSFORM
(
{
RECORDOF(LEFT),
SET OF Hash_t search_sig
},
SELF.search_sig := RIGHT.sig,
SELF := LEFT
),
LOOKUP
) : ONWARNING(4531, IGNORE);
// Append corpus signatures
withCorpusSig := JOIN
(
withSearchSig,
corpusSignaturesDS,
LEFT.entity_id = RIGHT.id,
TRANSFORM
(
{
RECORDOF(LEFT),
SET OF Hash_t entity_sig
},
SELF.entity_sig := RIGHT.sig,
SELF := LEFT
),
KEEP(1)
);
// Compute the Jaccard similarities
similarities := PROJECT
(
withCorpusSig,
TRANSFORM
(
SearchResultLayout,
sim := UtilMod.JaccardSimilarity(LEFT.entity_sig, LEFT.search_sig);
SELF.similarity := IF(sim >= minSimilarity, sim, SKIP),
SELF := LEFT
)
);
RETURN similarities;
END;
//--------------------------------------------------------------------
EXPORT SearchOne(UTF8 searchString, UNSIGNED2 minHashBandMatchCount, REAL8 minSimilarity) := FUNCTION
RETURN SearchMany(DATASET([{0, searchString}], EntityLayout), minHashBandMatchCount, minSimilarity);
END;
END; // Module Search
END; // Module LHS
/******* EXAMPLE CODE ***********************************************************************
NGRAM_SIZE := 2;
SIG_SIZE := 60;
BAND_SIZE := 2; // Must equally divide into SIG_SIZE
MIN_BAND_MATCH_COUNT := 1; // Must be between 1 and (SIG_SIZE / BAND_SIZE), inclusive
MIN_SIMILARITY := 0.10; // Intentionally low for this test
//*** MIN_BAND_MATCH_COUNT is only 1 because our test vocabulary is very small
// Make sure the above constants adhere to our setup
ASSERT(SIG_SIZE % BAND_SIZE = 0, FAIL);
ASSERT(MIN_BAND_MATCH_COUNT BETWEEN 1 AND (SIG_SIZE / BAND_SIZE), FAIL);
//--------------------------------
DO_BUILD := TRUE;
DO_SEARCH := TRUE;
FILE_SCOPE := '~lsh_test';
//--------------------------------
corpus0 := DATASET
(
[
{1001, u8'CAMPER'},
{1002, u8'AAAAAABAAAAAAA'},
{1003, u8'FUBAR'},
{1004, u8'Z'}
],
LSH.EntityLayout,
DISTRIBUTED
);
corpus := NOFOLD(corpus0);
buildAction := LSH.Build(FILE_SCOPE).CreateFiles(corpus, SIG_SIZE, BAND_SIZE, NGRAM_SIZE);
//--------------------------------
searchEntities0 := DATASET
(
[
{1, u8'CAMPER'},
{2, u8'DAN'},
{3, u8'PERSON'},
{4, u8'CAMPBELL'}
],
LSH.EntityLayout
);
searchEntities := NOFOLD(searchEntities0);
searchManyResults := LSH.Search(FILE_SCOPE).SearchMany(searchEntities, MIN_BAND_MATCH_COUNT, MIN_SIMILARITY);
searchOneResults := LSH.Search(FILE_SCOPE).SearchOne(NOFOLD(u8'FUBARSKY'), MIN_BAND_MATCH_COUNT, MIN_SIMILARITY);
//--------------------------------
SEQUENTIAL
(
IF(DO_BUILD, buildAction),
IF(DO_SEARCH,
PARALLEL
(
OUTPUT(TOPN(searchManyResults, 100, -similarity), NAMED('search_many_results')),
OUTPUT(TOPN(searchOneResults, 100, -similarity), NAMED('search_one_results'))
))
);
*/