From 01c21f15ebb0af678ded020846f6533dafbd8a42 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Mon, 24 Jul 2023 15:38:01 +0200 Subject: [PATCH] Revert "indexdata: read posting list iff all ng exist (#619)" This reverts commit b7e5070bfb563ea836e532dc524a733dd5d684c8. Initial data from production shows we didn't improve performance so we are reverting since the complicates without improving perf. Test Plan: CI --- btree.go | 61 ++++------------------------------------------------ indexdata.go | 58 ++++++++++++------------------------------------- 2 files changed, 18 insertions(+), 101 deletions(-) diff --git a/btree.go b/btree.go index 6af70386c..04b525dca 100644 --- a/btree.go +++ b/btree.go @@ -304,59 +304,6 @@ func (b btreeIndex) SizeBytes() (sz int) { return } -// NgramIndexes returns the indexes of the ngrams in the index. We return a -// slice of slices because we have to keep track of ngram variants in case of -// case-insensitive search. -func (b btreeIndex) NgramIndexes(ngrams []ngram) ([]int, int) { - lookups := 0 - ngramIndexes := make([]int, 0, len(ngrams)) - - for _, ng := range ngrams { - ix := b.ngramIndex(ng) - lookups++ - if ix == -1 { - return nil, len(ngramIndexes) + 1 - } - ngramIndexes = append(ngramIndexes, ix) - } - - return ngramIndexes, len(ngramIndexes) -} - -func (b btreeIndex) ngramIndex(ng ngram) int { - if b.bt == nil { - return -1 - } - - // find bucket - bucketIndex, postingIndexOffset := b.bt.find(ng) - - // read bucket into memory - off, sz := b.getBucket(bucketIndex) - bucket, err := b.file.Read(off, sz) - if err != nil { - return -1 - } - - // find ngram in bucket - getNGram := func(i int) ngram { - i *= ngramEncoding - return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding])) - } - - bucketSize := len(bucket) / ngramEncoding - x := sort.Search(bucketSize, func(i int) bool { - return ng <= getNGram(i) - }) - - // return index of associated posting list - if x >= bucketSize || getNGram(x) != ng { - return -1 - } - - return postingIndexOffset + x -} - // Get returns the simple section of the posting list associated with the // ngram. The logic is as follows: // 1. Search the inner nodes to find the bucket that may contain ng (in MEM) @@ -394,15 +341,15 @@ func (b btreeIndex) Get(ng ngram) (ss simpleSection) { return simpleSection{} } - return b.GetPostingList(postingIndexOffset + x) + return b.getPostingList(postingIndexOffset + x) } -// GetPostingList returns the simple section pointing to the posting list of +// getPostingList returns the simple section pointing to the posting list of // the ngram at ngramIndex. // // Assumming we don't hit a page boundary, which should be rare given that we // only read 8 bytes, we need 1 disk access to read the posting offset. -func (b btreeIndex) GetPostingList(ngramIndex int) simpleSection { +func (b btreeIndex) getPostingList(ngramIndex int) simpleSection { relativeOffsetBytes := uint32(ngramIndex) * 4 if relativeOffsetBytes+8 <= b.postingIndex.sz { @@ -475,7 +422,7 @@ func (b btreeIndex) DumpMap() map[ngram]simpleSection { // decode all ngrams in the bucket and fill map for i := 0; i < len(bucket)/ngramEncoding; i++ { gram := ngram(binary.BigEndian.Uint64(bucket[i*8:])) - m[gram] = b.GetPostingList(int(n.postingIndexOffset) + i) + m[gram] = b.getPostingList(int(n.postingIndexOffset) + i) } case *innerNode: return diff --git a/indexdata.go b/indexdata.go index a5e530684..f969c4e20 100644 --- a/indexdata.go +++ b/indexdata.go @@ -23,9 +23,8 @@ import ( "math/bits" "unicode/utf8" - "golang.org/x/exp/slices" - "github.com/sourcegraph/zoekt/query" + "golang.org/x/exp/slices" ) // indexData holds the pattern-independent data that we have to have @@ -414,23 +413,22 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool { return a.ngram < b.ngram }) - - index := d.ngrams(query.FileName) frequencies := make([]uint32, 0, len(ngramOffs)) ngramLookups := 0 - if query.CaseSensitive { - // Perf: Look up ngram indexes without loading posting lists. This way we can - // stop early if a ngram does not exist. On the flip side we incur an additional - // loop and more memory allocations. - - ngrams := make([]ngram, 0, len(ngramOffs)) - for _, ng := range ngramOffs { - ngrams = append(ngrams, ng.ngram) + ngrams := d.ngrams(query.FileName) + for _, o := range ngramOffs { + var freq uint32 + if query.CaseSensitive { + freq = ngrams.Get(o.ngram).sz + ngramLookups++ + } else { + for _, v := range generateCaseNgrams(o.ngram) { + freq += ngrams.Get(v).sz + ngramLookups++ + } } - var ngramIndexes []int - ngramIndexes, ngramLookups = index.NgramIndexes(ngrams) - if len(ngramIndexes) == 0 { + if freq == 0 { return &ngramIterationResults{ matchIterator: &noMatchTree{ Why: "freq=0", @@ -441,35 +439,7 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult }, nil } - for _, ngramIndex := range ngramIndexes { - frequencies = append(frequencies, index.GetPostingList(ngramIndex).sz) - } - } else { - for _, o := range ngramOffs { - var freq uint32 - if query.CaseSensitive { - freq = index.Get(o.ngram).sz - ngramLookups++ - } else { - for _, v := range generateCaseNgrams(o.ngram) { - freq += index.Get(v).sz - ngramLookups++ - } - } - - if freq == 0 { - return &ngramIterationResults{ - matchIterator: &noMatchTree{ - Why: "freq=0", - Stats: Stats{ - NgramLookups: ngramLookups, - }, - }, - }, nil - } - - frequencies = append(frequencies, freq) - } + frequencies = append(frequencies, freq) } // first and last are now the smallest trigram posting lists to iterate