sourcegraph · keegancsmith · Jul 19, 2023 · Jul 17, 2023 · Jul 17, 2023 · Jul 17, 2023
diff --git a/btree.go b/btree.go
@@ -304,6 +304,77 @@ func (b btreeIndex) SizeBytes() (sz int) {
 	return
 }
 
+// NgramIndexes resolves every ngram to the indexes of its posting lists
+// without reading the posting lists. Entry i of the result belongs to
+// ngrams[i] and holds one index if caseSensitive, else one index per case
+// variant found. If an ngram has no match at all, the result is nil. The
+// second return value counts the ngramIndex lookups that were made.
+func (b btreeIndex) NgramIndexes(ngrams []ngram, caseSensitive bool) ([][]int, int) {
+	var lookups int
+	result := make([][]int, 0, len(ngrams))
+	for _, ng := range ngrams {
+		// A case-sensitive search only considers the ngram as given.
+		if caseSensitive {
+			lookups++
+			ix := b.ngramIndex(ng)
+			if ix == -1 {
+				return nil, lookups
+			}
+			result = append(result, []int{ix})
+			continue
+		}
+
+		// Otherwise collect every case variant that exists in the index.
+		var found []int
+		for _, variant := range generateCaseNgrams(ng) {
+			lookups++
+			if ix := b.ngramIndex(variant); ix != -1 {
+				found = append(found, ix)
+			}
+		}
+		if len(found) == 0 {
+			return nil, lookups
+		}
+		result = append(result, found)
+	}
+
+	return result, lookups
+}
+
+// ngramIndex returns the index of the posting list associated with ng, which
+// can be passed to GetPostingList. It returns -1 if b.bt is nil, if reading
+// the bucket fails, or if ng is not present in the bucket.
+func (b btreeIndex) ngramIndex(ng ngram) int {
+	if b.bt == nil {
+		return -1
+	}
+
+	// find yields the bucket that may contain ng and the offset that turns a
+	// position inside that bucket into a posting list index.
+	bucketIndex, postingIndexOffset := b.bt.find(ng)
+
+	// Read the bucket; a read error is reported the same way as a miss.
+	bucketOff, bucketSz := b.getBucket(bucketIndex)
+	bucket, err := b.file.Read(bucketOff, bucketSz)
+	if err != nil {
+		return -1
+	}
+
+	// at decodes the i-th ngram of the bucket (ngramEncoding bytes each).
+	at := func(i int) ngram {
+		start := i * ngramEncoding
+		return ngram(binary.BigEndian.Uint64(bucket[start : start+ngramEncoding]))
+	}
+
+	// Find the first entry >= ng; it is a hit only if it equals ng.
+	n := len(bucket) / ngramEncoding
+	pos := sort.Search(n, func(i int) bool { return at(i) >= ng })
+	if pos < n && at(pos) == ng {
+		return postingIndexOffset + pos
+	}
+	return -1
+}
+
 // Get returns the simple section of the posting list associated with the
 // ngram. The logic is as follows:
 // 1. Search the inner nodes to find the bucket that may contain ng (in MEM)
@@ -341,15 +412,15 @@ func (b btreeIndex) Get(ng ngram) (ss simpleSection) {
 		return simpleSection{}
 	}
 
-	return b.getPostingList(postingIndexOffset + x)
+	return b.GetPostingList(postingIndexOffset + x)
 }
 
-// getPostingList returns the simple section pointing to the posting list of
+// GetPostingList returns the simple section pointing to the posting list of
 // the ngram at ngramIndex.
 //
 // Assumming we don't hit a page boundary, which should be rare given that we
 // only read 8 bytes, we need 1 disk access to read the posting offset.
-func (b btreeIndex) getPostingList(ngramIndex int) simpleSection {
+func (b btreeIndex) GetPostingList(ngramIndex int) simpleSection {
 	relativeOffsetBytes := uint32(ngramIndex) * 4
 
 	if relativeOffsetBytes+8 <= b.postingIndex.sz {
@@ -422,7 +493,7 @@ func (b btreeIndex) DumpMap() map[ngram]simpleSection {
 			// decode all ngrams in the bucket and fill map
 			for i := 0; i < len(bucket)/ngramEncoding; i++ {
 				gram := ngram(binary.BigEndian.Uint64(bucket[i*8:]))
-				m[gram] = b.getPostingList(int(n.postingIndexOffset) + i)
+				m[gram] = b.GetPostingList(int(n.postingIndexOffset) + i)
 			}
 		case *innerNode:
 			return

diff --git a/indexdata.go b/indexdata.go
@@ -22,8 +22,9 @@ import (
 	"math/bits"
 	"unicode/utf8"
 
-	"github.com/sourcegraph/zoekt/query"
 	"golang.org/x/exp/slices"
+
+	"github.com/sourcegraph/zoekt/query"
 )
 
 // indexData holds the pattern-independent data that we have to have
@@ -346,11 +347,30 @@ func lastMinarg(xs []uint32) uint32 {
 	return uint32(j)
 }
 
-func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
+// ngramIndexes looks up ngrams in fileNameNgrams if filename is set, else in
+// ngrams. It returns one slice of indexes per ngram, since a case-insensitive
+// search tracks every case variant, plus the lookup count from NgramIndexes.
+func (data *indexData) ngramIndexes(ngrams []ngram, filename bool, caseSensitive bool) ([][]int, int) {
 	if filename {
-		return data.fileNameNgrams.Get(ng).sz
+		return data.fileNameNgrams.NgramIndexes(ngrams, caseSensitive)
+	}
+	return data.ngrams.NgramIndexes(ngrams, caseSensitive)
+}
+
+// ngramIndexFrequency sums the posting list sizes (GetPostingList(i).sz) for
+// the given indexes, read from fileNameNgrams if filename is set, else ngrams.
+func (data *indexData) ngramIndexFrequency(ngramIndex []int, filename bool) uint32 {
+	var freq uint32
+	if filename {
+		for _, i := range ngramIndex {
+			freq += data.fileNameNgrams.GetPostingList(i).sz
+		}
+		return freq
+	}
+	for _, i := range ngramIndex {
+		freq += data.ngrams.GetPostingList(i).sz
 	}
-	return data.ngrams.Get(ng).sz
+	return freq
 }
 
 type ngramIterationResults struct {
@@ -387,32 +407,30 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
 	slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool {
 		return a.ngram < b.ngram
 	})
-	frequencies := make([]uint32, 0, len(ngramOffs))
-	ngramLookups := 0
-	for _, o := range ngramOffs {
-		var freq uint32
-		if query.CaseSensitive {
-			freq = d.ngramFrequency(o.ngram, query.FileName)
-			ngramLookups++
-		} else {
-			for _, v := range generateCaseNgrams(o.ngram) {
-				freq += d.ngramFrequency(v, query.FileName)
-				ngramLookups++
-			}
-		}
 
-		if freq == 0 {
-			return &ngramIterationResults{
-				matchIterator: &noMatchTree{
-					Why: "freq=0",
-					Stats: Stats{
-						NgramLookups: ngramLookups,
-					},
+	ngrams := make([]ngram, 0, len(ngramOffs))
+	for _, ng := range ngramOffs {
+		ngrams = append(ngrams, ng.ngram)
+	}
+
+	// Look up ngram indexes without loading posting lists. This way we can stop
+	// early if a ngram does not exist. On the flip side we incur an additional
+	// loop.
+	ngramIndexes, ngramLookups := d.ngramIndexes(ngrams, query.FileName, query.CaseSensitive)
+	if len(ngramIndexes) == 0 {
+		return &ngramIterationResults{
+			matchIterator: &noMatchTree{
+				Why: "freq=0",
+				Stats: Stats{
+					NgramLookups: ngramLookups,
 				},
-			}, nil
-		}
+			},
+		}, nil
+	}
 
-		frequencies = append(frequencies, freq)
+	frequencies := make([]uint32, 0, len(ngramOffs))
+	for _, ngramIndex := range ngramIndexes {
+		frequencies = append(frequencies, d.ngramIndexFrequency(ngramIndex, query.FileName))
 	}
 
 	var first, last runeNgramOff