-
Notifications
You must be signed in to change notification settings - Fork 12
/
lshensemble_benchmark_test.go
80 lines (73 loc) · 2.08 KB
/
lshensemble_benchmark_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
package lshensemble
import (
"log"
"sort"
"time"
)
// Parameters for the LSH Ensemble benchmarks in this package.
// NOTE(review): benchmarkLshEnsemble declares locals named numHash, numPart
// and maxK with these same values; inside that function the locals shadow
// these constants.
const (
// Same value that benchmarkLshEnsemble passes as numHash to minhashDomains
// and to the index bootstrap functions. Presumably the number of hash
// functions per minhash signature — TODO confirm.
numHash = 256
// Same value that benchmarkLshEnsemble passes as numPart to the index
// bootstrap functions. Presumably the number of index partitions — TODO confirm.
numPart = 32
// Same value that benchmarkLshEnsemble passes as maxK to the index bootstrap
// functions. Exact meaning is defined by the index implementation, which is
// not visible here.
maxK = 4
// useOptimalPartitions selects how benchmarkLshEnsemble builds the index:
// true uses BootstrapLshEnsemblePlusOptimal, false uses
// BootstrapLshEnsemblePlusEquiDepth. Swap the two lines below to switch.
// useOptimalPartitions = true
useOptimalPartitions = false
)
// benchmarkLshEnsemble runs one end-to-end LSH Ensemble benchmark.
//
// It minhashes rawDomains and rawQueries, sorts the domain records with
// BySize, builds an index over them (the bootstrap function is chosen by
// useOptimalPartitions), runs every query against the index with the given
// threshold via QueryTimed, and streams the timed results to
// outputQueryResults together with outputFilename. Progress and timings are
// reported through the standard log package.
//
// If building the index returns an error, the error is logged and the process
// exits through log.Fatalf rather than continuing into the query phase with an
// index that may be unusable.
func benchmarkLshEnsemble(rawDomains []rawDomain, rawQueries []rawDomain,
	threshold float64, outputFilename string) {
	// These locals shadow the package-level constants of the same names and
	// carry the same values.
	numHash := 256
	numPart := 32
	maxK := 4

	// Minhash domains
	start := time.Now()
	domainRecords := minhashDomains(rawDomains, numHash)
	log.Printf("Minhash %d domains in %s", len(domainRecords),
		time.Since(start))

	// Minhash queries
	start = time.Now()
	queries := minhashDomains(rawQueries, numHash)
	log.Printf("Minhash %d query domains in %s", len(queries),
		time.Since(start))

	// Start main body of lsh ensemble
	// Indexing
	log.Print("Start building LSH Ensemble index")
	// Both bootstrap functions are fed the records in BySize order.
	sort.Sort(BySize(domainRecords))
	var (
		index *LshEnsemble
		err   error
	)
	if useOptimalPartitions {
		// The optimal variant takes a factory so it can obtain a fresh
		// channel over the records each time it calls it.
		index, err = BootstrapLshEnsemblePlusOptimal(numPart, numHash, maxK,
			func() <-chan *DomainRecord { return Recs2Chan(domainRecords) })
	} else {
		index, err = BootstrapLshEnsemblePlusEquiDepth(numPart, numHash, maxK,
			len(domainRecords), Recs2Chan(domainRecords))
	}
	if err != nil {
		// Do not query an index whose construction failed.
		log.Fatalf("Failed to build LSH Ensemble index: %v", err)
	}
	log.Print("Finished building LSH Ensemble index")

	// Querying
	log.Printf("Start querying LSH Ensemble index with %d queries", len(queries))
	results := make(chan queryResult)
	// Producer: runs the queries sequentially and closes the channel when
	// done so that the consumer below can terminate.
	go func() {
		for _, query := range queries {
			r, d := index.QueryTimed(query.Signature, query.Size, threshold)
			results <- queryResult{
				queryKey:   query.Key,
				duration:   d,
				candidates: r,
			}
		}
		close(results)
	}()
	outputQueryResults(results, outputFilename)
	log.Printf("Finished querying LSH Ensemble index, output %s", outputFilename)
}
// minhashDomains builds one DomainRecord per entry of rawDomains.
//
// For each domain it creates a new Minhash from benchmarkSeed and numHash,
// pushes every value obtained by ranging over domain.values (converted to
// bytes), and records the domain's key, len(domain.values) as Size, and the
// resulting signature. The returned slice follows the order of rawDomains and
// is non-nil even when rawDomains is empty.
func minhashDomains(rawDomains []rawDomain, numHash int) []*DomainRecord {
	// Exactly one record is produced per input domain, so reserve the full
	// capacity up front instead of growing the slice on every append.
	domainRecords := make([]*DomainRecord, 0, len(rawDomains))
	for _, domain := range rawDomains {
		mh := NewMinhash(benchmarkSeed, numHash)
		for v := range domain.values {
			mh.Push([]byte(v))
		}
		domainRecords = append(domainRecords, &DomainRecord{
			Key:       domain.key,
			Size:      len(domain.values),
			Signature: mh.Signature(),
		})
	}
	return domainRecords
}