From e7c7e7023915425f459f0254d422057009c27dbe Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Mon, 17 Jul 2023 11:35:07 +0200 Subject: [PATCH 1/3] indexdata: read posting list iff all ng exist --- btree.go | 79 +++++++++++++++++++++++++++++++++++++++++++++++++--- indexdata.go | 71 +++++++++++++++++++++++++++++++--------------- 2 files changed, 123 insertions(+), 27 deletions(-) diff --git a/btree.go b/btree.go index 80e973913..e300847b0 100644 --- a/btree.go +++ b/btree.go @@ -304,6 +304,77 @@ func (b btreeIndex) SizeBytes() (sz int) { return } +func (b btreeIndex) NgramIndexes(ngrams []ngram, caseSensitive bool) ([][]int, int) { + lookups := 0 + ngramIndexes := make([][]int, 0, len(ngrams)) + + if caseSensitive { + for _, ng := range ngrams { + ix := b.ngramIndex(ng) + lookups++ + if ix == -1 { + return nil, lookups + } + ngramIndexes = append(ngramIndexes, []int{ix}) + + } + } else { + for _, ng := range ngrams { + var variantIndexes []int + for _, variant := range generateCaseNgrams(ng) { + ix := b.ngramIndex(variant) + lookups++ + if ix == -1 { + continue + } + variantIndexes = append(variantIndexes, ix) + } + + if len(variantIndexes) == 0 { + return nil, lookups + } + + ngramIndexes = append(ngramIndexes, variantIndexes) + } + } + + return ngramIndexes, lookups +} + +func (b btreeIndex) ngramIndex(ng ngram) int { + if b.bt == nil { + return -1 + } + + // find bucket + bucketIndex, postingIndexOffset := b.bt.find(ng) + + // read bucket into memory + off, sz := b.getBucket(bucketIndex) + bucket, err := b.file.Read(off, sz) + if err != nil { + return -1 + } + + // find ngram in bucket + getNGram := func(i int) ngram { + i *= ngramEncoding + return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding])) + } + + bucketSize := len(bucket) / ngramEncoding + x := sort.Search(bucketSize, func(i int) bool { + return ng <= getNGram(i) + }) + + // return index of associated posting list + if x >= bucketSize || getNGram(x) != ng { + return -1 + } + + return 
postingIndexOffset + x +} + // Get returns the simple section of the posting list associated with the // ngram. The logic is as follows: // 1. Search the inner nodes to find the bucket that may contain ng (in MEM) @@ -341,15 +412,15 @@ func (b btreeIndex) Get(ng ngram) (ss simpleSection) { return simpleSection{} } - return b.getPostingList(postingIndexOffset + x) + return b.GetPostingList(postingIndexOffset + x) } -// getPostingList returns the simple section pointing to the posting list of +// GetPostingList returns the simple section pointing to the posting list of // the ngram at ngramIndex. // // Assumming we don't hit a page boundary, which should be rare given that we // only read 8 bytes, we need 1 disk access to read the posting offset. -func (b btreeIndex) getPostingList(ngramIndex int) simpleSection { +func (b btreeIndex) GetPostingList(ngramIndex int) simpleSection { relativeOffsetBytes := uint32(ngramIndex) * 4 if relativeOffsetBytes+8 <= b.postingIndex.sz { @@ -422,7 +493,7 @@ func (b btreeIndex) DumpMap() map[ngram]simpleSection { // decode all ngrams in the bucket and fill map for i := 0; i < len(bucket)/ngramEncoding; i++ { gram := ngram(binary.BigEndian.Uint64(bucket[i*8:])) - m[gram] = b.getPostingList(int(n.postingIndexOffset) + i) + m[gram] = b.GetPostingList(int(n.postingIndexOffset) + i) } case *innerNode: return diff --git a/indexdata.go b/indexdata.go index f3e288eca..1bad80500 100644 --- a/indexdata.go +++ b/indexdata.go @@ -353,6 +353,32 @@ func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 { return data.ngrams.Get(ng).sz } +// ngramIndexes returns the indexes of the ngrams in the index. We return a +// slice of slices because we have to keep track of ngram variants in case of +// case-insensitive search. 
+func (data *indexData) ngramIndexes(ngrams []ngram, filename bool, caseSensitive bool) ([][]int, int) { + if filename { + return data.fileNameNgrams.NgramIndexes(ngrams, caseSensitive) + } + return data.ngrams.NgramIndexes(ngrams, caseSensitive) +} + +// ngramIndexFrequency returns the sum of the frequencies of the ngrams at the +// given indexes. +func (data *indexData) ngramIndexFrequency(ngramIndex []int, filename bool) uint32 { + var freq uint32 + if filename { + for _, i := range ngramIndex { + freq += data.fileNameNgrams.GetPostingList(i).sz + } + return freq + } + for _, i := range ngramIndex { + freq += data.ngrams.GetPostingList(i).sz + } + return freq +} + type ngramIterationResults struct { matchIterator @@ -387,32 +413,31 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool { return a.ngram < b.ngram }) - frequencies := make([]uint32, 0, len(ngramOffs)) - ngramLookups := 0 - for _, o := range ngramOffs { - var freq uint32 - if query.CaseSensitive { - freq = d.ngramFrequency(o.ngram, query.FileName) - ngramLookups++ - } else { - for _, v := range generateCaseNgrams(o.ngram) { - freq += d.ngramFrequency(v, query.FileName) - ngramLookups++ - } - } - if freq == 0 { - return &ngramIterationResults{ - matchIterator: &noMatchTree{ - Why: "freq=0", - Stats: Stats{ - NgramLookups: ngramLookups, - }, + ngrams := make([]ngram, 0, len(ngramOffs)) + for _, ng := range ngramOffs { + ngrams = append(ngrams, ng.ngram) + } + + + // Look up ngram indexes without loading posting lists. This way we can stop + // early if a ngram does not exist. On the flip side we incur an additional + // loop. 
+ ngramIndexes, ngramLookups := d.ngramIndexes(ngrams, query.FileName, query.CaseSensitive) + if len(ngramIndexes) == 0 { + return &ngramIterationResults{ + matchIterator: &noMatchTree{ + Why: "freq=0", + Stats: Stats{ + NgramLookups: ngramLookups, }, - }, nil - } + }, + }, nil + } - frequencies = append(frequencies, freq) + frequencies := make([]uint32, 0, len(ngramOffs)) + for _, ngramIndex := range ngramIndexes { + frequencies = append(frequencies, d.ngramIndexFrequency(ngramIndex, query.FileName)) } var first, last runeNgramOff From 0d12632db42b111ccd69a2a822013f7d7dcd8def Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Mon, 17 Jul 2023 12:45:11 +0200 Subject: [PATCH 2/3] remove ngramFrequency --- indexdata.go | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/indexdata.go b/indexdata.go index 1bad80500..f377023ce 100644 --- a/indexdata.go +++ b/indexdata.go @@ -22,8 +22,9 @@ import ( "math/bits" "unicode/utf8" - "github.com/sourcegraph/zoekt/query" "golang.org/x/exp/slices" + + "github.com/sourcegraph/zoekt/query" ) // indexData holds the pattern-independent data that we have to have @@ -346,13 +347,6 @@ func lastMinarg(xs []uint32) uint32 { return uint32(j) } -func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 { - if filename { - return data.fileNameNgrams.Get(ng).sz - } - return data.ngrams.Get(ng).sz -} - // ngramIndexes returns the indexes of the ngrams in the index. We return a // slice of slices because we have to keep track of ngram variants in case of // case-insensitive search. @@ -419,7 +413,6 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult ngrams = append(ngrams, ng.ngram) } - // Look up ngram indexes without loading posting lists. This way we can stop // early if a ngram does not exist. On the flip side we incur an additional // loop. 
From ac10de99a66d70cf2670a5238a84a5779e13a819 Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Mon, 17 Jul 2023 16:14:35 +0200 Subject: [PATCH 3/3] focus on case-sensitive searches --- btree.go | 39 +++++---------------- indexdata.go | 99 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 73 insertions(+), 65 deletions(-) diff --git a/btree.go b/btree.go index e300847b0..245e95bff 100644 --- a/btree.go +++ b/btree.go @@ -304,41 +304,20 @@ func (b btreeIndex) SizeBytes() (sz int) { return } -func (b btreeIndex) NgramIndexes(ngrams []ngram, caseSensitive bool) ([][]int, int) { +func (b btreeIndex) NgramIndexes(ngrams []ngram) ([]int, int) { lookups := 0 - ngramIndexes := make([][]int, 0, len(ngrams)) - - if caseSensitive { - for _, ng := range ngrams { - ix := b.ngramIndex(ng) - lookups++ - if ix == -1 { - return nil, lookups - } - ngramIndexes = append(ngramIndexes, []int{ix}) - - } - } else { - for _, ng := range ngrams { - var variantIndexes []int - for _, variant := range generateCaseNgrams(ng) { - ix := b.ngramIndex(variant) - lookups++ - if ix == -1 { - continue - } - variantIndexes = append(variantIndexes, ix) - } - - if len(variantIndexes) == 0 { - return nil, lookups - } + ngramIndexes := make([]int, 0, len(ngrams)) - ngramIndexes = append(ngramIndexes, variantIndexes) + for _, ng := range ngrams { + ix := b.ngramIndex(ng) + lookups++ + if ix == -1 { + return nil, len(ngramIndexes) + 1 } + ngramIndexes = append(ngramIndexes, ix) } - return ngramIndexes, lookups + return ngramIndexes, len(ngramIndexes) } func (b btreeIndex) ngramIndex(ng ngram) int { diff --git a/indexdata.go b/indexdata.go index f377023ce..96e892830 100644 --- a/indexdata.go +++ b/indexdata.go @@ -347,30 +347,28 @@ func lastMinarg(xs []uint32) uint32 { return uint32(j) } +func (d *indexData) ngramFrequency(ng ngram, filename bool) uint32 { + if filename { + return d.fileNameNgrams.Get(ng).sz + } + return d.ngrams.Get(ng).sz +} + // ngramIndexes returns the indexes of the 
ngrams in the index. We return a // slice of slices because we have to keep track of ngram variants in case of // case-insensitive search. -func (data *indexData) ngramIndexes(ngrams []ngram, filename bool, caseSensitive bool) ([][]int, int) { +func (d *indexData) ngramIndexes(ngrams []ngram, filename bool) ([]int, int) { if filename { - return data.fileNameNgrams.NgramIndexes(ngrams, caseSensitive) + return d.fileNameNgrams.NgramIndexes(ngrams) } - return data.ngrams.NgramIndexes(ngrams, caseSensitive) + return d.ngrams.NgramIndexes(ngrams) } -// ngramIndexFrequency returns the sum of the frequencies of the ngrams at the -// given indexes. -func (data *indexData) ngramIndexFrequency(ngramIndex []int, filename bool) uint32 { - var freq uint32 +func (d *indexData) ngramIndexFrequency(ngramIndex int, filename bool) uint32 { if filename { - for _, i := range ngramIndex { - freq += data.fileNameNgrams.GetPostingList(i).sz - } - return freq - } - for _, i := range ngramIndex { - freq += data.ngrams.GetPostingList(i).sz + return d.fileNameNgrams.GetPostingList(ngramIndex).sz } - return freq + return d.ngrams.GetPostingList(ngramIndex).sz } type ngramIterationResults struct { @@ -408,29 +406,60 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult return a.ngram < b.ngram }) - ngrams := make([]ngram, 0, len(ngramOffs)) - for _, ng := range ngramOffs { - ngrams = append(ngrams, ng.ngram) - } + frequencies := make([]uint32, 0, len(ngramOffs)) + ngramLookups := 0 + if query.CaseSensitive { + // Perf: Look up ngram indexes without loading posting lists. This way we can + // stop early if a ngram does not exist. On the flip side we incur an additional + // loop and more memory allocations. + + ngrams := make([]ngram, 0, len(ngramOffs)) + for _, ng := range ngramOffs { + ngrams = append(ngrams, ng.ngram) + } - // Look up ngram indexes without loading posting lists. This way we can stop - // early if a ngram does not exist. 
On the flip side we incur an additional - // loop. - ngramIndexes, ngramLookups := d.ngramIndexes(ngrams, query.FileName, query.CaseSensitive) - if len(ngramIndexes) == 0 { - return &ngramIterationResults{ - matchIterator: &noMatchTree{ - Why: "freq=0", - Stats: Stats{ - NgramLookups: ngramLookups, + var ngramIndexes []int + ngramIndexes, ngramLookups = d.ngramIndexes(ngrams, query.FileName) + if len(ngramIndexes) == 0 { + return &ngramIterationResults{ + matchIterator: &noMatchTree{ + Why: "freq=0", + Stats: Stats{ + NgramLookups: ngramLookups, + }, }, - }, - }, nil - } + }, nil + } - frequencies := make([]uint32, 0, len(ngramOffs)) - for _, ngramIndex := range ngramIndexes { - frequencies = append(frequencies, d.ngramIndexFrequency(ngramIndex, query.FileName)) + for _, ngramIndex := range ngramIndexes { + frequencies = append(frequencies, d.ngramIndexFrequency(ngramIndex, query.FileName)) + } + } else { + for _, o := range ngramOffs { + var freq uint32 + if query.CaseSensitive { + freq = d.ngramFrequency(o.ngram, query.FileName) + ngramLookups++ + } else { + for _, v := range generateCaseNgrams(o.ngram) { + freq += d.ngramFrequency(v, query.FileName) + ngramLookups++ + } + } + + if freq == 0 { + return &ngramIterationResults{ + matchIterator: &noMatchTree{ + Why: "freq=0", + Stats: Stats{ + NgramLookups: ngramLookups, + }, + }, + }, nil + } + + frequencies = append(frequencies, freq) + } } var first, last runeNgramOff