From fb825fc28e992499051c91821c59708402e20c8a Mon Sep 17 00:00:00 2001
From: Keegan Carruthers-Smith <keegan.csmith@gmail.com>
Date: Fri, 14 Jul 2023 09:37:50 +0200
Subject: [PATCH 1/2] remove ngram offset code

We have been running btree as the default for many months. We worried
about a performance hit, but it never happened. After some recent local
testing I did I noticed the btree actually interacted with the disk more
efficiently. So the old code both uses more memory and is slower, lets
just remove it.

Test Plan: go test ./...
---
 btree.go            |  34 ++++
 ngramoffset.go      | 401 --------------------------------------------
 ngramoffset_test.go | 166 ------------------
 read.go             |  67 +-------
 4 files changed, 39 insertions(+), 629 deletions(-)
 delete mode 100644 ngramoffset.go
 delete mode 100644 ngramoffset_test.go

diff --git a/btree.go b/btree.go
index 72f097ac3..c56bb3942 100644
--- a/btree.go
+++ b/btree.go
@@ -427,3 +427,37 @@ func (b btreeIndex) DumpMap() map[ngram]simpleSection {
 
 	return m
 }
+
+type ngramIndex interface {
+	Get(gram ngram) simpleSection
+	DumpMap() map[ngram]simpleSection
+	SizeBytes() int
+}
+
+// This is a temporary type to wrap two very different implementations of the
+// inverted index for the purpose of feature-flagging. We will remove this after
+// we enable the b-tree permanently.
+//
+// Alternatively we could have adapted/extended the interface "ngramIndex".
+// However, adapting the existing implementations and their tests to match the
+// access pattern of map[ngram][]byte seems more cumbersome than this makeshift
+// wrapper. In the end, both ngramIndex and this wrapper will be replaced by a
+// concrete type.
+//
+// TODO we have remove non-btree code, lets create the concrete type.
+type fileNameNgrams struct {
+	bt btreeIndex
+}
+
+func (n fileNameNgrams) GetBlob(ng ngram) ([]byte, error) {
+	sec := n.bt.Get(ng)
+	return n.bt.file.Read(sec.off, sec.sz)
+}
+
+func (n fileNameNgrams) Frequency(ng ngram) uint32 {
+	return n.bt.Get(ng).sz
+}
+
+func (n fileNameNgrams) SizeBytes() int {
+	return n.bt.SizeBytes()
+}
diff --git a/ngramoffset.go b/ngramoffset.go
deleted file mode 100644
index 6c0d67f05..000000000
--- a/ngramoffset.go
+++ /dev/null
@@ -1,401 +0,0 @@
-// Copyright 2021 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package zoekt
-
-import (
-	"sort"
-)
-
-// shrinkUint32Slice copies slices with excess capacity to precisely sized ones
-// to avoid wasting memory. It should be used on slices with long static durations.
-func shrinkUint32Slice(a []uint32) []uint32 {
-	if cap(a)-len(a) < 32 {
-		return a
-	}
-	out := make([]uint32, len(a))
-	copy(out, a)
-	return out
-}
-
-type topOffset struct {
-	top, off uint32
-}
-
-// arrayNgramOffset splits ngrams into two 32-bit parts and uses binary search
-// to satisfy requests. A three-level trie (over the runes of an ngram) uses 20%
-// more memory than this simple two-level split.
-type arrayNgramOffset struct {
-	// tops specify where the bottom halves of ngrams with the 32-bit top half begin.
-	// The offset of the next value is used to find where the bottom section ends.
-	tops []topOffset
-
-	// bots are bottom halves of an ngram, referenced by tops
-	bots []uint32
-
-	// offsets is values from simpleSection.off, simpleSection.sz is computed by subtracting
-	// adjacent offsets.
-	offsets []uint32
-}
-
-func makeArrayNgramOffset(ngrams []ngram, offsets []uint32) arrayNgramOffset {
-	arr := arrayNgramOffset{
-		bots: make([]uint32, 0, len(ngrams)),
-	}
-	arr.offsets = shrinkUint32Slice(offsets)
-
-	lastTop := uint32(0xffffffff)
-	lastStart := uint32(0)
-	for i, v := range ngrams {
-		curTop := uint32(v >> 32)
-		if curTop != lastTop {
-			if lastTop != 0xffffffff {
-				arr.tops = append(arr.tops, topOffset{lastTop, lastStart})
-			}
-			lastTop = curTop
-			lastStart = uint32(i)
-		}
-		arr.bots = append(arr.bots, uint32(v))
-	}
-	// add a sentinel value to make it simple to compute sizes
-	arr.tops = append(arr.tops, topOffset{lastTop, lastStart}, topOffset{0xffffffff, uint32(len(arr.bots))})
-
-	// shrink arr.tops to minimal size
-	tops := make([]topOffset, len(arr.tops))
-	copy(tops, arr.tops)
-	arr.tops = tops
-
-	return arr
-}
-
-func (a *arrayNgramOffset) Get(gram ngram) simpleSection {
-	if a.tops == nil {
-		return simpleSection{}
-	}
-
-	top, bot := uint32(uint64(gram)>>32), uint32(gram)
-
-	topIdx := sort.Search(len(a.tops)-1, func(i int) bool { return a.tops[i].top >= top })
-	if topIdx == len(a.tops)-1 || a.tops[topIdx].top != top {
-		return simpleSection{}
-	}
-
-	botsSec := a.bots[a.tops[topIdx].off:a.tops[topIdx+1].off]
-	botIdx := sort.Search(len(botsSec), func(i int) bool { return botsSec[i] >= bot })
-	if botIdx == len(botsSec) || botsSec[botIdx] != bot {
-		return simpleSection{}
-	}
-	idx := botIdx + int(a.tops[topIdx].off)
-	return simpleSection{
-		off: a.offsets[idx],
-		sz:  a.offsets[idx+1] - a.offsets[idx],
-	}
-}
-
-func (a *arrayNgramOffset) DumpMap() map[ngram]simpleSection {
-	m := make(map[ngram]simpleSection, len(a.offsets)-1)
-	for i := 0; i < len(a.tops)-1; i++ {
-		top, botStart, botEnd := a.tops[i].top, a.tops[i].off, a.tops[i+1].off
-		botSec := a.bots[botStart:botEnd]
-		for j, bot := range botSec {
-			idx := int(botStart) + j
-			m[ngram(uint64(top)<<32|uint64(bot))] = simpleSection{
-				off: a.offsets[idx],
-				sz:  a.offsets[idx+1] - a.offsets[idx],
-			}
-		}
-	}
-	return m
-}
-
-func (a *arrayNgramOffset) SizeBytes() int {
-	return 8*len(a.tops) + 4*len(a.bots) + 4*len(a.offsets)
-}
-
-// combinedNgramOffset combines an ascii ngram mapping with a unicode ngram mapping,
-// falling back on unicode for unicode ngrams or ascii ngrams with excessive lengths.
-type combinedNgramOffset struct {
-	asc *asciiNgramOffset
-	uni *arrayNgramOffset
-}
-
-func makeCombinedNgramOffset(ngrams []ngram, offsets []uint32) combinedNgramOffset {
-	// split ngrams & offsets into ascii ngrams and unicode ngrams,
-	// since ascii ngrams can be represented much more compactly (21b instead of 63b)
-
-	// allocate these arrays based off of rough measurements of what their typical
-	// sizes are-- source code is mostly ASCII with a little bit of Unicode.
-	// Allocating 101% of the total number of ngrams gives a little space for the
-	// duplicate entries used to mark section ends.
-	ngramsAscii := make([]ngramAscii, 0, len(ngrams)*101/100)
-	offsetsAscii := make([]uint32, 0, len(ngrams)*101/100)
-
-	ngramsUnicode := make([]ngram, 0, len(ngrams)*11/100)
-	offsetsUnicode := make([]uint32, 0, len(ngrams)*11/100)
-
-	for i, ng := range ngrams {
-		if ng&ngramAsciiMask == ng { // is ngram ascii-only?
-			ngp := ngramAsciiToPacked(ng)
-			if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask != ngrams[i+1] {
-				// at the end of a section we insert an extra offset with the same ngram,
-				// so the size of the segment can be calculated properly
-				ngramsAscii = append(ngramsAscii, ngp, ngp)
-				offsetsAscii = append(offsetsAscii, offsets[i], offsets[i+1])
-			} else {
-				ngramsAscii = append(ngramsAscii, ngp)
-				offsetsAscii = append(offsetsAscii, offsets[i])
-			}
-			// note: len(offsets) == len(ngrams) + 1
-			if offsets[i+1]-offsets[i] >= ngramAsciiMaxSectionLength {
-				// max-length ascii sections can't be represented properly in the ascii mapping,
-				// and are duplicated in the normal unicode entries.
-				ngramsUnicode = append(ngramsUnicode, ng, ng)
-				offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1])
-			}
-		} else {
-			if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask == ngrams[i+1] {
-				ngramsUnicode = append(ngramsUnicode, ng, ng)
-				offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1])
-			} else {
-				ngramsUnicode = append(ngramsUnicode, ng)
-				offsetsUnicode = append(offsetsUnicode, offsets[i])
-			}
-		}
-	}
-
-	// The last segment always has an extra trailing ngram entry that we don't need, and
-	// is only present for spacing and alignment. Trim it.
-	if len(ngramsAscii) > 0 {
-		ngramsAscii = ngramsAscii[:len(ngramsAscii)-1]
-	}
-	if len(ngramsUnicode) > 0 {
-		ngramsUnicode = ngramsUnicode[:len(ngramsUnicode)-1]
-	}
-
-	asc := makeAsciiNgramOffset(ngramsAscii, offsetsAscii)
-	uni := makeArrayNgramOffset(ngramsUnicode, offsetsUnicode)
-
-	return combinedNgramOffset{asc, &uni}
-}
-
-// Get returns a simpleSection with sz=0 if no entry, otherwise the appropriate
-// offset based on the underlying ASCII or Unicode offset index.
-func (a combinedNgramOffset) Get(gram ngram) simpleSection {
-	if a.asc == nil {
-		return simpleSection{}
-	}
-
-	var sec simpleSection
-	if gram&ngramAsciiMask == gram {
-		sec = a.asc.Get(gram)
-		if sec.sz == ngramAsciiMaxSectionLength {
-			// Fallback: this section's length was too long to store in the
-			// ASCII map, find it in the Unicode map.
-			sec = a.uni.Get(gram)
-		}
-	} else {
-		sec = a.uni.Get(gram)
-	}
-
-	return sec
-}
-
-func (a combinedNgramOffset) DumpMap() map[ngram]simpleSection {
-	m := a.asc.DumpMap()
-	for k, v := range a.uni.DumpMap() {
-		m[k] = v
-	}
-	return m
-}
-
-func (a combinedNgramOffset) SizeBytes() int {
-	return a.asc.SizeBytes() + a.uni.SizeBytes()
-}
-
-const ngramAsciiMask = 127 | 127<<21 | 127<<42
-
-// Ascii mapping packs 3*7b chars and 11 bits of lengths, with this as the set maximum.
-// We could save another ~3% of total RAM / 5% of combinedNgramOffset RAM by switching to
-// a 40b packing with 19-bit lengths, but the code would be significantly uglier so it doesn't
-// seem worth it.
-const ngramAsciiMaxSectionLength = (1 << 11) - 1
-
-type ngramAscii uint32
-
-func ngramAsciiToPacked(ng ngram) ngramAscii {
-	return ngramAscii(uint32(ng&127) | uint32((ng>>(21-7))&(127<<7)) | uint32((ng>>(42-14))&(127<<14)))
-}
-
-func ngramAsciiPackedToNgram(ng ngramAscii) ngram {
-	return ngram(ng&127) | ngram(ng&(127<<7))<<(21-7) | ngram(ng&(127<<14))<<(42-14)
-}
-
-// asciiNgramOffset stores ascii trigrams packed together with short lengths,
-// using offsets for a chunk of entries to limit the number of lengths that must
-// be summed to compute a section's offset.
-type asciiNgramOffset struct {
-	entries      []uint32 // (chara << 25 | charb << 18 | charc << 11 | length)
-	chunkOffsets []uint32 // offset for entries[i*asciiNgramOffsetChunkLength]
-}
-
-// asciiNgramOffsetChunkLength specifies how many entries share one initial offset.
-// It must be a power of 2, and was chosen empirically by measuring RAM usage:
-// 8: 4132MB, 16: 4047MB, 32: 4006MB, 64: 3992MB, 128: 3990MB
-const asciiNgramOffsetChunkLength = 32
-
-func makeAsciiNgramOffset(ngrams []ngramAscii, offsets []uint32) *asciiNgramOffset {
-	ao := &asciiNgramOffset{
-		entries:      make([]uint32, 0, len(ngrams)),
-		chunkOffsets: make([]uint32, 0, len(ngrams)/asciiNgramOffsetChunkLength),
-	}
-
-	for i, ng := range ngrams {
-		if len(ao.entries)%asciiNgramOffsetChunkLength == 0 {
-			ao.chunkOffsets = append(ao.chunkOffsets, offsets[i])
-		}
-		length := offsets[i+1] - offsets[i]
-
-		for {
-			if length < ngramAsciiMaxSectionLength {
-				ao.entries = append(ao.entries, uint32(ng)<<11|length)
-				break
-			} else {
-				// entries with lengths that are too long can't be represented fully in this
-				// map, but we repeatedly insert offsets to make the next entry's offset computable
-				// by summing the offsets in the preceding entries in the chunk, including
-				// this invalid one.
-				ao.entries = append(ao.entries, uint32(ng)<<11|ngramAsciiMaxSectionLength)
-				length -= ngramAsciiMaxSectionLength
-				if len(ao.entries)%asciiNgramOffsetChunkLength == 0 {
-					// We reached the end of the chunk, so there's no need to reach the
-					// offset for the next entry.
-					break
-				}
-			}
-		}
-	}
-
-	ao.entries = shrinkUint32Slice(ao.entries)
-	ao.chunkOffsets = shrinkUint32Slice(ao.chunkOffsets)
-
-	return ao
-}
-
-// Get returns a simpleSection with sz=0 if no entry, or sz=ngramAsciiMaxSectionLength
-// if the length of the ngram is too large for this type and it should cascade to the next entry.
-func (a *asciiNgramOffset) Get(gram ngram) simpleSection {
-	if gram&ngramAsciiMask != gram {
-		return simpleSection{}
-	}
-	g := uint32(ngramAsciiToPacked(gram) << 11)
-
-	idx := sort.Search(len(a.entries), func(i int) bool {
-		return a.entries[i] >= g
-	})
-
-	if idx == len(a.entries) || a.entries[idx]>>11 != g>>11 {
-		return simpleSection{}
-	}
-
-	length := a.entries[idx] & ngramAsciiMaxSectionLength
-	if length == ngramAsciiMaxSectionLength {
-		// this ascii ngram's section length is too large to be represented;
-		// repeate the Get() on the unicode map to get the correct result.
-		return simpleSection{
-			off: 0,
-			sz:  ngramAsciiMaxSectionLength,
-		}
-	}
-
-	chunkNum := idx / asciiNgramOffsetChunkLength
-	chunkBase := chunkNum * asciiNgramOffsetChunkLength
-	offset := a.chunkOffsets[chunkNum]
-	for i := chunkBase; i < idx; i++ {
-		offset += a.entries[i] & ngramAsciiMaxSectionLength
-	}
-
-	return simpleSection{
-		off: offset,
-		sz:  length,
-	}
-}
-
-func (a *asciiNgramOffset) DumpMap() map[ngram]simpleSection {
-	m := make(map[ngram]simpleSection, len(a.entries))
-	off := uint32(0)
-	for i, ent := range a.entries {
-		if i%asciiNgramOffsetChunkLength == 0 {
-			off = a.chunkOffsets[i/asciiNgramOffsetChunkLength]
-		}
-		length := ent & ngramAsciiMaxSectionLength
-		if length == ngramAsciiMaxSectionLength {
-			// This entry is an ascii gram with a section too long
-			// to be represented, so skip the entry.
-			continue
-		}
-		m[ngramAsciiPackedToNgram(ngramAscii(ent>>11))] = simpleSection{
-			off: off,
-			sz:  length,
-		}
-		off += length
-	}
-	return m
-}
-
-func (a *asciiNgramOffset) SizeBytes() int {
-	return 4*len(a.entries) + 4*len(a.chunkOffsets)
-}
-
-type ngramIndex interface {
-	Get(gram ngram) simpleSection
-	DumpMap() map[ngram]simpleSection
-	SizeBytes() int
-}
-
-// This is a temporary type to wrap two very different implementations of the
-// inverted index for the purpose of feature-flagging. We will remove this after
-// we enable the b-tree permanently.
-//
-// Alternatively we could have adapted/extended the interface "ngramIndex".
-// However, adapting the existing implementations and their tests to match the
-// access pattern of map[ngram][]byte seems more cumbersome than this makeshift
-// wrapper. In the end, both ngramIndex and this wrapper will be replaced by a
-// concrete type.
-type fileNameNgrams struct {
-	m  map[ngram][]byte
-	bt btreeIndex
-}
-
-func (n fileNameNgrams) GetBlob(ng ngram) ([]byte, error) {
-	if n.m != nil {
-		return n.m[ng], nil
-	}
-	sec := n.bt.Get(ng)
-	return n.bt.file.Read(sec.off, sec.sz)
-}
-
-func (n fileNameNgrams) Frequency(ng ngram) uint32 {
-	if n.m != nil {
-		return uint32(len(n.m[ng]))
-	}
-	return n.bt.Get(ng).sz
-}
-
-func (n fileNameNgrams) SizeBytes() int {
-	if n.m != nil {
-		// these slices reference mmap-ed memory
-		return 12 * len(n.m)
-	}
-	return n.bt.SizeBytes()
-}
diff --git a/ngramoffset_test.go b/ngramoffset_test.go
deleted file mode 100644
index a6599d346..000000000
--- a/ngramoffset_test.go
+++ /dev/null
@@ -1,166 +0,0 @@
-// Copyright 2021 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package zoekt
-
-import (
-	"fmt"
-	"math/rand"
-	"sort"
-	"testing"
-)
-
-func TestMakeArrayNgramOffset(t *testing.T) {
-	for n, tc := range []struct {
-		ngrams  []string
-		offsets []uint32
-	}{
-		{nil, nil},
-		{[]string{"ant", "any", "awl", "big", "bin", "bit", "can", "con"}, []uint32{0, 2, 5, 8, 10, 14, 18, 25, 30}},
-	} {
-		ngrams := []ngram{}
-		for _, s := range tc.ngrams {
-			ngrams = append(ngrams, stringToNGram(s))
-		}
-		m := makeArrayNgramOffset(ngrams, tc.offsets)
-		t.Logf("makeNgramOffset(%v, %v) => %s", tc.ngrams, tc.offsets, &m)
-		failn := stringToNGram("foo")
-		if getFail := m.Get(failn); getFail != (simpleSection{}) {
-			t.Errorf("#%d: Get(%q) got %v, want zero", n, failn, getFail)
-		}
-		for i := 0; i < len(tc.offsets)-1; i++ {
-			want := simpleSection{tc.offsets[i], tc.offsets[i+1] - tc.offsets[i]}
-			got := m.Get(ngrams[i])
-			if want != got {
-				t.Errorf("#%d.%d: Get(%q) got %v, want %v", n, i, tc.ngrams[i], got, want)
-			}
-			failn := ngram(uint64(ngrams[i] - 1))
-			if getFail := m.Get(failn); getFail != (simpleSection{}) {
-				t.Errorf("#%d.%d: Get(%q) got %v, want zero", n, i, failn, getFail)
-			}
-			failn = ngram(uint64(ngrams[i] + 1))
-			if getFail := m.Get(failn); getFail != (simpleSection{}) {
-				t.Errorf("#%d.%d: Get(%q) got %v, want zero", n, i, failn, getFail)
-			}
-		}
-	}
-}
-
-func TestMakeCombinedNgramOffset(t *testing.T) {
-	// The ascii / unicode ngram offset splitting is significantly
-	// more complicated. Exercise it with a more comprehensive test!
-	unicodeProbability := 0.2
-	ngramCount := 1000
-	ngramMap := map[ngram]bool{}
-
-	rng := rand.New(rand.NewSource(42))
-
-	randRune := func() rune {
-		if rng.Float64() < unicodeProbability {
-			return rune(0x100 + rand.Intn(0x80)) // Emoji
-		}
-		return rune('A' + rng.Intn('Z'-'A')) // A letter
-	}
-
-	for len(ngramMap) < ngramCount {
-		ngramMap[runesToNGram([3]rune{randRune(), randRune(), randRune()})] = true
-	}
-
-	ngrams := []ngram{}
-	for ng := range ngramMap {
-		ngrams = append(ngrams, ng)
-	}
-	sort.Slice(ngrams, func(i, j int) bool { return ngrams[i] < ngrams[j] })
-
-	offset := uint32(0)
-	offsets := []uint32{0}
-
-	for i := 0; i < len(ngrams); i++ {
-		// vary
-		offset += uint32(ngramAsciiMaxSectionLength/2 + rand.Intn(ngramAsciiMaxSectionLength))
-		offsets = append(offsets, offset)
-	}
-
-	m := makeCombinedNgramOffset(ngrams, offsets)
-
-	for i, ng := range ngrams {
-		want := simpleSection{offsets[i], offsets[i+1] - offsets[i]}
-		got := m.Get(ng)
-		if want != got {
-			t.Errorf("#%d: Get(%q) got %v, want %v", i, ng, got, want)
-		}
-		failn := ngram(uint64(ng - 1))
-		if getFail := m.Get(failn); !ngramMap[failn] && getFail != (simpleSection{}) {
-			t.Errorf("#%d: Get(%q) got %v, want zero", i, failn, getFail)
-		}
-		failn = ngram(uint64(ng + 1))
-		if getFail := m.Get(failn); !ngramMap[failn] && getFail != (simpleSection{}) {
-			t.Errorf("#%d: Get(%q) got %v, want zero", i, failn, getFail)
-		}
-	}
-
-	if t.Failed() || true {
-		t.Log(ngrams)
-		t.Log(offsets)
-		t.Log(m)
-	}
-}
-
-func (a combinedNgramOffset) String() string {
-	return fmt.Sprintf("combinedNgramOffset{\n  asc: %s,\n  uni: %s,\n}", a.asc, a.uni)
-}
-
-func (a *arrayNgramOffset) String() string {
-	o := "arrayNgramOffset{tops:{"
-	for i, p := range a.tops {
-		if i > 0 {
-			o += ", "
-		}
-		if p.top&1023 == 0 {
-			// only one rune is represented here
-			o += fmt.Sprintf("%s: %d", string(rune(p.top>>10)), p.off)
-		} else {
-			o += fmt.Sprintf("0x%x: %d", p.top>>10, p.off)
-		}
-	}
-	o += "}, bots: {"
-	for i, p := range a.bots {
-		if i > 0 {
-			o += ", "
-		}
-		if p < (256 << 21) {
-			// two ascii-ish runes (probably)
-			o += fmt.Sprintf("%s%s", string(rune(p>>21)), string(rune(p&runeMask)))
-		} else {
-			o += fmt.Sprintf("0x%x", p)
-		}
-	}
-	o += fmt.Sprintf("}, offsets: %v}", a.offsets)
-	return o
-}
-
-func (a *asciiNgramOffset) String() string {
-	o := "asciiNgramOffset{entries:{"
-	for i, e := range a.entries {
-		ng := ngramAsciiPackedToNgram(ngramAscii(uint32(e) >> 11))
-		length := e & ngramAsciiMaxSectionLength
-		if i > 0 {
-			o += ", "
-		}
-		o += fmt.Sprintf("%s: %d", ng, length)
-	}
-	o += fmt.Sprintf("}, chunkOffsets: %v}", a.chunkOffsets)
-	return o
-
-}
diff --git a/read.go b/read.go
index 9f49b3b6a..298b7270b 100644
--- a/read.go
+++ b/read.go
@@ -288,19 +288,11 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
 		return nil, err
 	}
 
-	if os.Getenv("ZOEKT_DISABLE_BTREE") != "" {
-		offsetMap, err := d.readNgrams(toc)
-		if err != nil {
-			return nil, err
-		}
-		d.ngrams = offsetMap
-	} else {
-		bt, err := d.newBtreeIndex(toc.ngramText, toc.postings)
-		if err != nil {
-			return nil, err
-		}
-		d.ngrams = bt
+	bt, err := d.newBtreeIndex(toc.ngramText, toc.postings)
+	if err != nil {
+		return nil, err
 	}
+	d.ngrams = bt
 
 	d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
 	if err != nil {
@@ -314,11 +306,7 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
 
 	d.fileNameIndex = toc.fileNames.relativeIndex()
 
-	if os.Getenv("ZOEKT_DISABLE_BTREE") != "" {
-		d.fileNameNgrams.m, err = d.readFileNameNgrams(toc)
-	} else {
-		d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings)
-	}
+	d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings)
 	if err != nil {
 		return nil, err
 	}
@@ -460,26 +448,6 @@ func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, err
 
 const ngramEncoding = 8
 
-func (d *indexData) readNgrams(toc *indexTOC) (combinedNgramOffset, error) {
-	textContent, err := d.readSectionBlob(toc.ngramText)
-	if err != nil {
-		return combinedNgramOffset{}, err
-	}
-	postingsIndex := toc.postings.relativeIndex()
-
-	for i := 0; i < len(postingsIndex); i++ {
-		postingsIndex[i] += toc.postings.data.off
-	}
-
-	ngrams := make([]ngram, 0, len(textContent)/ngramEncoding)
-	for i := 0; i < len(textContent); i += ngramEncoding {
-		ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
-		ngrams = append(ngrams, ng)
-	}
-
-	return makeCombinedNgramOffset(ngrams, postingsIndex), nil
-}
-
 func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) {
 	bi := btreeIndex{file: d.file}
 
@@ -507,31 +475,6 @@ func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSecti
 	return bi, nil
 }
 
-func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) {
-	nameNgramText, err := d.readSectionBlob(toc.nameNgramText)
-	if err != nil {
-		return nil, err
-	}
-
-	fileNamePostingsData, err := d.readSectionBlob(toc.namePostings.data)
-	if err != nil {
-		return nil, err
-	}
-
-	fileNamePostingsIndex := toc.namePostings.relativeIndex()
-
-	fileNameNgrams := make(map[ngram][]byte, len(nameNgramText)/ngramEncoding)
-	for i := 0; i < len(nameNgramText); i += ngramEncoding {
-		j := i / ngramEncoding
-		off := fileNamePostingsIndex[j]
-		end := fileNamePostingsIndex[j+1]
-		ng := ngram(binary.BigEndian.Uint64(nameNgramText[i : i+ngramEncoding]))
-		fileNameNgrams[ng] = fileNamePostingsData[off:end]
-	}
-
-	return fileNameNgrams, nil
-}
-
 func (d *indexData) verify() error {
 	// This is not an exhaustive check: the postings can easily
 	// generate OOB acccesses, and are expensive to check, but this lets us rule out

From 40d709a80d41c6333c8b6c5954a797a92635cd88 Mon Sep 17 00:00:00 2001
From: Keegan Carruthers-Smith <keegan.csmith@gmail.com>
Date: Fri, 14 Jul 2023 09:52:31 +0200
Subject: [PATCH 2/2] remove intermediate ngramIndex interface and
 fileNameNgrams

This is no longer needed, we always use the btree. Additionally I added
some documentation.
---
 btree.go     | 40 +++++++++-------------------------------
 hititer.go   |  4 ----
 indexdata.go | 15 ++++-----------
 read.go      |  8 ++------
 4 files changed, 15 insertions(+), 52 deletions(-)

diff --git a/btree.go b/btree.go
index c56bb3942..80e973913 100644
--- a/btree.go
+++ b/btree.go
@@ -289,6 +289,7 @@ type btreeIndex struct {
 	postingIndex simpleSection
 }
 
+// SizeBytes returns how much memory this structure uses in the heap.
 func (b btreeIndex) SizeBytes() (sz int) {
 	// btree
 	if b.bt != nil {
@@ -401,6 +402,9 @@ func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) {
 	return
 }
 
+// DumpMap is a debug method which returns the btree as an in-memory
+// representation. This is how zoekt represents the ngram index in
+// google/zoekt.
 func (b btreeIndex) DumpMap() map[ngram]simpleSection {
 	if b.bt == nil {
 		return nil
@@ -428,36 +432,10 @@ func (b btreeIndex) DumpMap() map[ngram]simpleSection {
 	return m
 }
 
-type ngramIndex interface {
-	Get(gram ngram) simpleSection
-	DumpMap() map[ngram]simpleSection
-	SizeBytes() int
-}
-
-// This is a temporary type to wrap two very different implementations of the
-// inverted index for the purpose of feature-flagging. We will remove this after
-// we enable the b-tree permanently.
-//
-// Alternatively we could have adapted/extended the interface "ngramIndex".
-// However, adapting the existing implementations and their tests to match the
-// access pattern of map[ngram][]byte seems more cumbersome than this makeshift
-// wrapper. In the end, both ngramIndex and this wrapper will be replaced by a
-// concrete type.
+// GetBlob returns the raw encoded offset list for ng.
 //
-// TODO we have remove non-btree code, lets create the concrete type.
-type fileNameNgrams struct {
-	bt btreeIndex
-}
-
-func (n fileNameNgrams) GetBlob(ng ngram) ([]byte, error) {
-	sec := n.bt.Get(ng)
-	return n.bt.file.Read(sec.off, sec.sz)
-}
-
-func (n fileNameNgrams) Frequency(ng ngram) uint32 {
-	return n.bt.Get(ng).sz
-}
-
-func (n fileNameNgrams) SizeBytes() int {
-	return n.bt.SizeBytes()
+// Note: the returned byte slice is mmap backed normally.
+func (b btreeIndex) GetBlob(ng ngram) ([]byte, error) {
+	sec := b.Get(ng)
+	return b.file.Read(sec.off, sec.sz)
 }
diff --git a/hititer.go b/hititer.go
index 86bb0df68..57e90bad7 100644
--- a/hititer.go
+++ b/hititer.go
@@ -110,10 +110,6 @@ func (d *indexData) newDistanceTrigramIter(ng1, ng2 ngram, dist uint32, caseSens
 }
 
 func (d *indexData) trigramHitIterator(ng ngram, caseSensitive, fileName bool) (hitIterator, error) {
-	if d.ngrams == nil {
-		return nil, fmt.Errorf("trigramHitIterator: ngrams=nil")
-	}
-
 	variants := []ngram{ng}
 	if !caseSensitive {
 		variants = generateCaseNgrams(ng)
diff --git a/indexdata.go b/indexdata.go
index 02c59fb4a..c1cb16928 100644
--- a/indexdata.go
+++ b/indexdata.go
@@ -33,7 +33,7 @@ type indexData struct {
 
 	file IndexFile
 
-	ngrams ngramIndex
+	ngrams btreeIndex
 
 	newlinesStart uint32
 	newlinesIndex []uint32
@@ -56,7 +56,7 @@ type indexData struct {
 
 	fileNameContent []byte
 	fileNameIndex   []uint32
-	fileNameNgrams  fileNameNgrams
+	fileNameNgrams  btreeIndex
 
 	// fileEndSymbol[i] is the index of the first symbol for document i.
 	fileEndSymbol []uint32
@@ -314,9 +314,7 @@ func (d *indexData) memoryUse() int {
 	}
 	sz += 8 * len(d.runeDocSections)
 	sz += 8 * len(d.fileBranchMasks)
-	if d.ngrams != nil {
-		sz += d.ngrams.SizeBytes()
-	}
+	sz += d.ngrams.SizeBytes()
 	sz += d.fileNameNgrams.SizeBytes()
 	return sz
 }
@@ -349,13 +347,8 @@ func lastMinarg(xs []uint32) uint32 {
 
 func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
 	if filename {
-		return data.fileNameNgrams.Frequency(ng)
-	}
-
-	if data.ngrams == nil {
-		return 0
+		return data.fileNameNgrams.Get(ng).sz
 	}
-
 	return data.ngrams.Get(ng).sz
 }
 
diff --git a/read.go b/read.go
index 298b7270b..49434bd4e 100644
--- a/read.go
+++ b/read.go
@@ -288,11 +288,10 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
 		return nil, err
 	}
 
-	bt, err := d.newBtreeIndex(toc.ngramText, toc.postings)
+	d.ngrams, err = d.newBtreeIndex(toc.ngramText, toc.postings)
 	if err != nil {
 		return nil, err
 	}
-	d.ngrams = bt
 
 	d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
 	if err != nil {
@@ -306,7 +305,7 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
 
 	d.fileNameIndex = toc.fileNames.relativeIndex()
 
-	d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings)
+	d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings)
 	if err != nil {
 		return nil, err
 	}
@@ -656,9 +655,6 @@ func PrintNgramStats(r IndexFile) error {
 		return err
 	}
 
-	if id.ngrams == nil {
-		return fmt.Errorf("PrintNgramStats: ngrams=nil")
-	}
 	var rNgram [3]rune
 	for ngram, ss := range id.ngrams.DumpMap() {
 		rNgram = ngramToRunes(ngram)