From 01c21f15ebb0af678ded020846f6533dafbd8a42 Mon Sep 17 00:00:00 2001
From: Keegan Carruthers-Smith <keegan.csmith@gmail.com>
Date: Mon, 24 Jul 2023 15:38:01 +0200
Subject: [PATCH] Revert "indexdata: read posting list iff all ng exist (#619)"

This reverts commit b7e5070bfb563ea836e532dc524a733dd5d684c8. Initial
data from production shows we didn't improve performance so we are
reverting since the complicates without improving perf.

Test Plan: CI
---
 btree.go     | 61 ++++------------------------------------------------
 indexdata.go | 58 ++++++++++++-------------------------------------
 2 files changed, 18 insertions(+), 101 deletions(-)

diff --git a/btree.go b/btree.go
index 6af70386c..04b525dca 100644
--- a/btree.go
+++ b/btree.go
@@ -304,59 +304,6 @@ func (b btreeIndex) SizeBytes() (sz int) {
 	return
 }
 
-// NgramIndexes returns the indexes of the ngrams in the index. We return a
-// slice of slices because we have to keep track of ngram variants in case of
-// case-insensitive search.
-func (b btreeIndex) NgramIndexes(ngrams []ngram) ([]int, int) {
-	lookups := 0
-	ngramIndexes := make([]int, 0, len(ngrams))
-
-	for _, ng := range ngrams {
-		ix := b.ngramIndex(ng)
-		lookups++
-		if ix == -1 {
-			return nil, len(ngramIndexes) + 1
-		}
-		ngramIndexes = append(ngramIndexes, ix)
-	}
-
-	return ngramIndexes, len(ngramIndexes)
-}
-
-func (b btreeIndex) ngramIndex(ng ngram) int {
-	if b.bt == nil {
-		return -1
-	}
-
-	// find bucket
-	bucketIndex, postingIndexOffset := b.bt.find(ng)
-
-	// read bucket into memory
-	off, sz := b.getBucket(bucketIndex)
-	bucket, err := b.file.Read(off, sz)
-	if err != nil {
-		return -1
-	}
-
-	// find ngram in bucket
-	getNGram := func(i int) ngram {
-		i *= ngramEncoding
-		return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding]))
-	}
-
-	bucketSize := len(bucket) / ngramEncoding
-	x := sort.Search(bucketSize, func(i int) bool {
-		return ng <= getNGram(i)
-	})
-
-	// return index of associated posting list
-	if x >= bucketSize || getNGram(x) != ng {
-		return -1
-	}
-
-	return postingIndexOffset + x
-}
-
 // Get returns the simple section of the posting list associated with the
 // ngram. The logic is as follows:
 // 1. Search the inner nodes to find the bucket that may contain ng (in MEM)
@@ -394,15 +341,15 @@ func (b btreeIndex) Get(ng ngram) (ss simpleSection) {
 		return simpleSection{}
 	}
 
-	return b.GetPostingList(postingIndexOffset + x)
+	return b.getPostingList(postingIndexOffset + x)
 }
 
-// GetPostingList returns the simple section pointing to the posting list of
+// getPostingList returns the simple section pointing to the posting list of
 // the ngram at ngramIndex.
 //
 // Assumming we don't hit a page boundary, which should be rare given that we
 // only read 8 bytes, we need 1 disk access to read the posting offset.
-func (b btreeIndex) GetPostingList(ngramIndex int) simpleSection {
+func (b btreeIndex) getPostingList(ngramIndex int) simpleSection {
 	relativeOffsetBytes := uint32(ngramIndex) * 4
 
 	if relativeOffsetBytes+8 <= b.postingIndex.sz {
@@ -475,7 +422,7 @@ func (b btreeIndex) DumpMap() map[ngram]simpleSection {
 			// decode all ngrams in the bucket and fill map
 			for i := 0; i < len(bucket)/ngramEncoding; i++ {
 				gram := ngram(binary.BigEndian.Uint64(bucket[i*8:]))
-				m[gram] = b.GetPostingList(int(n.postingIndexOffset) + i)
+				m[gram] = b.getPostingList(int(n.postingIndexOffset) + i)
 			}
 		case *innerNode:
 			return
diff --git a/indexdata.go b/indexdata.go
index a5e530684..f969c4e20 100644
--- a/indexdata.go
+++ b/indexdata.go
@@ -23,9 +23,8 @@ import (
 	"math/bits"
 	"unicode/utf8"
 
-	"golang.org/x/exp/slices"
-
 	"github.com/sourcegraph/zoekt/query"
+	"golang.org/x/exp/slices"
 )
 
 // indexData holds the pattern-independent data that we have to have
@@ -414,23 +413,22 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
 	slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool {
 		return a.ngram < b.ngram
 	})
-
-	index := d.ngrams(query.FileName)
 	frequencies := make([]uint32, 0, len(ngramOffs))
 	ngramLookups := 0
-	if query.CaseSensitive {
-		// Perf: Look up ngram indexes without loading posting lists. This way we can
-		// stop early if a ngram does not exist. On the flip side we incur an additional
-		// loop and more memory allocations.
-
-		ngrams := make([]ngram, 0, len(ngramOffs))
-		for _, ng := range ngramOffs {
-			ngrams = append(ngrams, ng.ngram)
+	ngrams := d.ngrams(query.FileName)
+	for _, o := range ngramOffs {
+		var freq uint32
+		if query.CaseSensitive {
+			freq = ngrams.Get(o.ngram).sz
+			ngramLookups++
+		} else {
+			for _, v := range generateCaseNgrams(o.ngram) {
+				freq += ngrams.Get(v).sz
+				ngramLookups++
+			}
 		}
 
-		var ngramIndexes []int
-		ngramIndexes, ngramLookups = index.NgramIndexes(ngrams)
-		if len(ngramIndexes) == 0 {
+		if freq == 0 {
 			return &ngramIterationResults{
 				matchIterator: &noMatchTree{
 					Why: "freq=0",
@@ -441,35 +439,7 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
 			}, nil
 		}
 
-		for _, ngramIndex := range ngramIndexes {
-			frequencies = append(frequencies, index.GetPostingList(ngramIndex).sz)
-		}
-	} else {
-		for _, o := range ngramOffs {
-			var freq uint32
-			if query.CaseSensitive {
-				freq = index.Get(o.ngram).sz
-				ngramLookups++
-			} else {
-				for _, v := range generateCaseNgrams(o.ngram) {
-					freq += index.Get(v).sz
-					ngramLookups++
-				}
-			}
-
-			if freq == 0 {
-				return &ngramIterationResults{
-					matchIterator: &noMatchTree{
-						Why: "freq=0",
-						Stats: Stats{
-							NgramLookups: ngramLookups,
-						},
-					},
-				}, nil
-			}
-
-			frequencies = append(frequencies, freq)
-		}
+		frequencies = append(frequencies, freq)
 	}
 
 	// first and last are now the smallest trigram posting lists to iterate