Skip to content

Commit

Permalink
indexdata: read posting list iff all ng exist (#619)
Browse files Browse the repository at this point in the history
The purpose of this commit is to reduce disk IO in case we skip a shard
because of missing ngrams.

To achieve this, we first check whether ALL ngrams exist in the shard before
loading the posting lists to determine their frequency. This means we have to
loop twice over the ngrams for the benefit of not loading any posting list in
case the shard would have been skipped anyway.

Test plan: This is a refactor, so relying on CI
  • Loading branch information
stefanhengl authored Jul 19, 2023
1 parent 0aefb15 commit b7e5070
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 18 deletions.
61 changes: 57 additions & 4 deletions btree.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,59 @@ func (b btreeIndex) SizeBytes() (sz int) {
return
}

// NgramIndexes returns the posting-list indexes of the given ngrams, in the
// same order as the input. It stops at the first ngram that is missing from
// the index and returns a nil slice in that case, so callers can skip the
// shard without touching any posting list.
//
// The second return value is the number of ngram lookups performed: equal to
// len(ngrams) on success, or the number of ngrams probed (including the
// missing one) on early exit.
func (b btreeIndex) NgramIndexes(ngrams []ngram) ([]int, int) {
	ngramIndexes := make([]int, 0, len(ngrams))

	for _, ng := range ngrams {
		ix := b.ngramIndex(ng)
		if ix == -1 {
			// Missing ngram: the shard cannot match. Report the lookups done
			// so far, counting this failed one.
			return nil, len(ngramIndexes) + 1
		}
		ngramIndexes = append(ngramIndexes, ix)
	}

	return ngramIndexes, len(ngramIndexes)
}

// ngramIndex returns the index of ng's posting list, or -1 when ng is not
// present in the index (or the bucket read fails).
func (b btreeIndex) ngramIndex(ng ngram) int {
	if b.bt == nil {
		return -1
	}

	// Walk the in-memory inner nodes to the bucket that may hold ng.
	bucketIndex, postingIndexOffset := b.bt.find(ng)

	// Load that bucket from disk.
	off, sz := b.getBucket(bucketIndex)
	data, err := b.file.Read(off, sz)
	if err != nil {
		return -1
	}

	// decode returns the i-th ngram stored in the bucket.
	decode := func(i int) ngram {
		start := i * ngramEncoding
		return ngram(binary.BigEndian.Uint64(data[start : start+ngramEncoding]))
	}

	// Binary-search the sorted bucket for ng.
	n := len(data) / ngramEncoding
	pos := sort.Search(n, func(i int) bool {
		return decode(i) >= ng
	})

	if pos == n || decode(pos) != ng {
		return -1
	}

	return postingIndexOffset + pos
}

// Get returns the simple section of the posting list associated with the
// ngram. The logic is as follows:
// 1. Search the inner nodes to find the bucket that may contain ng (in MEM)
Expand Down Expand Up @@ -341,15 +394,15 @@ func (b btreeIndex) Get(ng ngram) (ss simpleSection) {
return simpleSection{}
}

return b.getPostingList(postingIndexOffset + x)
return b.GetPostingList(postingIndexOffset + x)
}

// getPostingList returns the simple section pointing to the posting list of
// GetPostingList returns the simple section pointing to the posting list of
// the ngram at ngramIndex.
//
// Assuming we don't hit a page boundary, which should be rare given that we
// only read 8 bytes, we need 1 disk access to read the posting offset.
func (b btreeIndex) getPostingList(ngramIndex int) simpleSection {
func (b btreeIndex) GetPostingList(ngramIndex int) simpleSection {
relativeOffsetBytes := uint32(ngramIndex) * 4

if relativeOffsetBytes+8 <= b.postingIndex.sz {
Expand Down Expand Up @@ -422,7 +475,7 @@ func (b btreeIndex) DumpMap() map[ngram]simpleSection {
// decode all ngrams in the bucket and fill map
for i := 0; i < len(bucket)/ngramEncoding; i++ {
gram := ngram(binary.BigEndian.Uint64(bucket[i*8:]))
m[gram] = b.getPostingList(int(n.postingIndexOffset) + i)
m[gram] = b.GetPostingList(int(n.postingIndexOffset) + i)
}
case *innerNode:
return
Expand Down
58 changes: 44 additions & 14 deletions indexdata.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ import (
"math/bits"
"unicode/utf8"

"github.com/sourcegraph/zoekt/query"
"golang.org/x/exp/slices"

"github.com/sourcegraph/zoekt/query"
)

// indexData holds the pattern-independent data that we have to have
Expand Down Expand Up @@ -413,22 +414,23 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool {
return a.ngram < b.ngram
})

index := d.ngrams(query.FileName)
frequencies := make([]uint32, 0, len(ngramOffs))
ngramLookups := 0
ngrams := d.ngrams(query.FileName)
for _, o := range ngramOffs {
var freq uint32
if query.CaseSensitive {
freq = ngrams.Get(o.ngram).sz
ngramLookups++
} else {
for _, v := range generateCaseNgrams(o.ngram) {
freq += ngrams.Get(v).sz
ngramLookups++
}
if query.CaseSensitive {
// Perf: Look up ngram indexes without loading posting lists. This way we can
// stop early if an ngram does not exist. On the flip side we incur an additional
// loop and more memory allocations.

ngrams := make([]ngram, 0, len(ngramOffs))
for _, ng := range ngramOffs {
ngrams = append(ngrams, ng.ngram)
}

if freq == 0 {
var ngramIndexes []int
ngramIndexes, ngramLookups = index.NgramIndexes(ngrams)
if len(ngramIndexes) == 0 {
return &ngramIterationResults{
matchIterator: &noMatchTree{
Why: "freq=0",
Expand All @@ -439,7 +441,35 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
}, nil
}

frequencies = append(frequencies, freq)
for _, ngramIndex := range ngramIndexes {
frequencies = append(frequencies, index.GetPostingList(ngramIndex).sz)
}
} else {
for _, o := range ngramOffs {
var freq uint32
if query.CaseSensitive {
freq = index.Get(o.ngram).sz
ngramLookups++
} else {
for _, v := range generateCaseNgrams(o.ngram) {
freq += index.Get(v).sz
ngramLookups++
}
}

if freq == 0 {
return &ngramIterationResults{
matchIterator: &noMatchTree{
Why: "freq=0",
Stats: Stats{
NgramLookups: ngramLookups,
},
},
}, nil
}

frequencies = append(frequencies, freq)
}
}

// first and last are now the smallest trigram posting lists to iterate
Expand Down

0 comments on commit b7e5070

Please sign in to comment.