From fb825fc28e992499051c91821c59708402e20c8a Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 14 Jul 2023 09:37:50 +0200 Subject: [PATCH 1/2] remove ngram offset code We have been running btree as the default for many months. We worried about a performance hit, but it never happened. After some recent local testing I did I noticed the btree actually interacted with the disk more efficiently. So the old code both uses more memory and is slower, lets just remove it. Test Plan: go test ./... --- btree.go | 34 ++++ ngramoffset.go | 401 -------------------------------------------- ngramoffset_test.go | 166 ------------------ read.go | 67 +------- 4 files changed, 39 insertions(+), 629 deletions(-) delete mode 100644 ngramoffset.go delete mode 100644 ngramoffset_test.go diff --git a/btree.go b/btree.go index 72f097ac3..c56bb3942 100644 --- a/btree.go +++ b/btree.go @@ -427,3 +427,37 @@ func (b btreeIndex) DumpMap() map[ngram]simpleSection { return m } + +type ngramIndex interface { + Get(gram ngram) simpleSection + DumpMap() map[ngram]simpleSection + SizeBytes() int +} + +// This is a temporary type to wrap two very different implementations of the +// inverted index for the purpose of feature-flagging. We will remove this after +// we enable the b-tree permanently. +// +// Alternatively we could have adapted/extended the interface "ngramIndex". +// However, adapting the existing implementations and their tests to match the +// access pattern of map[ngram][]byte seems more cumbersome than this makeshift +// wrapper. In the end, both ngramIndex and this wrapper will be replaced by a +// concrete type. +// +// TODO we have remove non-btree code, lets create the concrete type. +type fileNameNgrams struct { + bt btreeIndex +} + +func (n fileNameNgrams) GetBlob(ng ngram) ([]byte, error) { + sec := n.bt.Get(ng) + return n.bt.file.Read(sec.off, sec.sz) +} + +func (n fileNameNgrams) Frequency(ng ngram) uint32 { + return n.bt.Get(ng).sz +} + +func (n fileNameNgrams) SizeBytes() int { + return n.bt.SizeBytes() +} diff --git a/ngramoffset.go b/ngramoffset.go deleted file mode 100644 index 6c0d67f05..000000000 --- a/ngramoffset.go +++ /dev/null @@ -1,401 +0,0 @@ -// Copyright 2021 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "sort" -) - -// shrinkUint32Slice copies slices with excess capacity to precisely sized ones -// to avoid wasting memory. It should be used on slices with long static durations. -func shrinkUint32Slice(a []uint32) []uint32 { - if cap(a)-len(a) < 32 { - return a - } - out := make([]uint32, len(a)) - copy(out, a) - return out -} - -type topOffset struct { - top, off uint32 -} - -// arrayNgramOffset splits ngrams into two 32-bit parts and uses binary search -// to satisfy requests. A three-level trie (over the runes of an ngram) uses 20% -// more memory than this simple two-level split. -type arrayNgramOffset struct { - // tops specify where the bottom halves of ngrams with the 32-bit top half begin. - // The offset of the next value is used to find where the bottom section ends. - tops []topOffset - - // bots are bottom halves of an ngram, referenced by tops - bots []uint32 - - // offsets is values from simpleSection.off, simpleSection.sz is computed by subtracting - // adjacent offsets. - offsets []uint32 -} - -func makeArrayNgramOffset(ngrams []ngram, offsets []uint32) arrayNgramOffset { - arr := arrayNgramOffset{ - bots: make([]uint32, 0, len(ngrams)), - } - arr.offsets = shrinkUint32Slice(offsets) - - lastTop := uint32(0xffffffff) - lastStart := uint32(0) - for i, v := range ngrams { - curTop := uint32(v >> 32) - if curTop != lastTop { - if lastTop != 0xffffffff { - arr.tops = append(arr.tops, topOffset{lastTop, lastStart}) - } - lastTop = curTop - lastStart = uint32(i) - } - arr.bots = append(arr.bots, uint32(v)) - } - // add a sentinel value to make it simple to compute sizes - arr.tops = append(arr.tops, topOffset{lastTop, lastStart}, topOffset{0xffffffff, uint32(len(arr.bots))}) - - // shrink arr.tops to minimal size - tops := make([]topOffset, len(arr.tops)) - copy(tops, arr.tops) - arr.tops = tops - - return arr -} - -func (a *arrayNgramOffset) Get(gram ngram) simpleSection { - if a.tops == nil { - return simpleSection{} - } - - top, bot := uint32(uint64(gram)>>32), uint32(gram) - - topIdx := sort.Search(len(a.tops)-1, func(i int) bool { return a.tops[i].top >= top }) - if topIdx == len(a.tops)-1 || a.tops[topIdx].top != top { - return simpleSection{} - } - - botsSec := a.bots[a.tops[topIdx].off:a.tops[topIdx+1].off] - botIdx := sort.Search(len(botsSec), func(i int) bool { return botsSec[i] >= bot }) - if botIdx == len(botsSec) || botsSec[botIdx] != bot { - return simpleSection{} - } - idx := botIdx + int(a.tops[topIdx].off) - return simpleSection{ - off: a.offsets[idx], - sz: a.offsets[idx+1] - a.offsets[idx], - } -} - -func (a *arrayNgramOffset) DumpMap() map[ngram]simpleSection { - m := make(map[ngram]simpleSection, len(a.offsets)-1) - for i := 0; i < len(a.tops)-1; i++ { - top, botStart, botEnd := a.tops[i].top, a.tops[i].off, a.tops[i+1].off - botSec := a.bots[botStart:botEnd] - for j, bot := range botSec { - idx := int(botStart) + j - m[ngram(uint64(top)<<32|uint64(bot))] = simpleSection{ - off: a.offsets[idx], - sz: a.offsets[idx+1] - a.offsets[idx], - } - } - } - return m -} - -func (a *arrayNgramOffset) SizeBytes() int { - return 8*len(a.tops) + 4*len(a.bots) + 4*len(a.offsets) -} - -// combinedNgramOffset combines an ascii ngram mapping with a unicode ngram mapping, -// falling back on unicode for unicode ngrams or ascii ngrams with excessive lengths. -type combinedNgramOffset struct { - asc *asciiNgramOffset - uni *arrayNgramOffset -} - -func makeCombinedNgramOffset(ngrams []ngram, offsets []uint32) combinedNgramOffset { - // split ngrams & offsets into ascii ngrams and unicode ngrams, - // since ascii ngrams can be represented much more compactly (21b instead of 63b) - - // allocate these arrays based off of rough measurements of what their typical - // sizes are-- source code is mostly ASCII with a little bit of Unicode. - // Allocating 101% of the total number of ngrams gives a little space for the - // duplicate entries used to mark section ends. - ngramsAscii := make([]ngramAscii, 0, len(ngrams)*101/100) - offsetsAscii := make([]uint32, 0, len(ngrams)*101/100) - - ngramsUnicode := make([]ngram, 0, len(ngrams)*11/100) - offsetsUnicode := make([]uint32, 0, len(ngrams)*11/100) - - for i, ng := range ngrams { - if ng&ngramAsciiMask == ng { // is ngram ascii-only? - ngp := ngramAsciiToPacked(ng) - if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask != ngrams[i+1] { - // at the end of a section we insert an extra offset with the same ngram, - // so the size of the segment can be calculated properly - ngramsAscii = append(ngramsAscii, ngp, ngp) - offsetsAscii = append(offsetsAscii, offsets[i], offsets[i+1]) - } else { - ngramsAscii = append(ngramsAscii, ngp) - offsetsAscii = append(offsetsAscii, offsets[i]) - } - // note: len(offsets) == len(ngrams) + 1 - if offsets[i+1]-offsets[i] >= ngramAsciiMaxSectionLength { - // max-length ascii sections can't be represented properly in the ascii mapping, - // and are duplicated in the normal unicode entries. - ngramsUnicode = append(ngramsUnicode, ng, ng) - offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1]) - } - } else { - if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask == ngrams[i+1] { - ngramsUnicode = append(ngramsUnicode, ng, ng) - offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1]) - } else { - ngramsUnicode = append(ngramsUnicode, ng) - offsetsUnicode = append(offsetsUnicode, offsets[i]) - } - } - } - - // The last segment always has an extra trailing ngram entry that we don't need, and - // is only present for spacing and alignment. Trim it. - if len(ngramsAscii) > 0 { - ngramsAscii = ngramsAscii[:len(ngramsAscii)-1] - } - if len(ngramsUnicode) > 0 { - ngramsUnicode = ngramsUnicode[:len(ngramsUnicode)-1] - } - - asc := makeAsciiNgramOffset(ngramsAscii, offsetsAscii) - uni := makeArrayNgramOffset(ngramsUnicode, offsetsUnicode) - - return combinedNgramOffset{asc, &uni} -} - -// Get returns a simpleSection with sz=0 if no entry, otherwise the appropriate -// offset based on the underlying ASCII or Unicode offset index. -func (a combinedNgramOffset) Get(gram ngram) simpleSection { - if a.asc == nil { - return simpleSection{} - } - - var sec simpleSection - if gram&ngramAsciiMask == gram { - sec = a.asc.Get(gram) - if sec.sz == ngramAsciiMaxSectionLength { - // Fallback: this section's length was too long to store in the - // ASCII map, find it in the Unicode map. - sec = a.uni.Get(gram) - } - } else { - sec = a.uni.Get(gram) - } - - return sec -} - -func (a combinedNgramOffset) DumpMap() map[ngram]simpleSection { - m := a.asc.DumpMap() - for k, v := range a.uni.DumpMap() { - m[k] = v - } - return m -} - -func (a combinedNgramOffset) SizeBytes() int { - return a.asc.SizeBytes() + a.uni.SizeBytes() -} - -const ngramAsciiMask = 127 | 127<<21 | 127<<42 - -// Ascii mapping packs 3*7b chars and 11 bits of lengths, with this as the set maximum. -// We could save another ~3% of total RAM / 5% of combinedNgramOffset RAM by switching to -// a 40b packing with 19-bit lengths, but the code would be significantly uglier so it doesn't -// seem worth it. -const ngramAsciiMaxSectionLength = (1 << 11) - 1 - -type ngramAscii uint32 - -func ngramAsciiToPacked(ng ngram) ngramAscii { - return ngramAscii(uint32(ng&127) | uint32((ng>>(21-7))&(127<<7)) | uint32((ng>>(42-14))&(127<<14))) -} - -func ngramAsciiPackedToNgram(ng ngramAscii) ngram { - return ngram(ng&127) | ngram(ng&(127<<7))<<(21-7) | ngram(ng&(127<<14))<<(42-14) -} - -// asciiNgramOffset stores ascii trigrams packed together with short lengths, -// using offsets for a chunk of entries to limit the number of lengths that must -// be summed to compute a section's offset. -type asciiNgramOffset struct { - entries []uint32 // (chara << 25 | charb << 18 | charc << 11 | length) - chunkOffsets []uint32 // offset for entries[i*asciiNgramOffsetChunkLength] -} - -// asciiNgramOffsetChunkLength specifies how many entries share one initial offset. -// It must be a power of 2, and was chosen empirically by measuring RAM usage: -// 8: 4132MB, 16: 4047MB, 32: 4006MB, 64: 3992MB, 128: 3990MB -const asciiNgramOffsetChunkLength = 32 - -func makeAsciiNgramOffset(ngrams []ngramAscii, offsets []uint32) *asciiNgramOffset { - ao := &asciiNgramOffset{ - entries: make([]uint32, 0, len(ngrams)), - chunkOffsets: make([]uint32, 0, len(ngrams)/asciiNgramOffsetChunkLength), - } - - for i, ng := range ngrams { - if len(ao.entries)%asciiNgramOffsetChunkLength == 0 { - ao.chunkOffsets = append(ao.chunkOffsets, offsets[i]) - } - length := offsets[i+1] - offsets[i] - - for { - if length < ngramAsciiMaxSectionLength { - ao.entries = append(ao.entries, uint32(ng)<<11|length) - break - } else { - // entries with lengths that are too long can't be represented fully in this - // map, but we repeatedly insert offsets to make the next entry's offset computable - // by summing the offsets in the preceding entries in the chunk, including - // this invalid one. - ao.entries = append(ao.entries, uint32(ng)<<11|ngramAsciiMaxSectionLength) - length -= ngramAsciiMaxSectionLength - if len(ao.entries)%asciiNgramOffsetChunkLength == 0 { - // We reached the end of the chunk, so there's no need to reach the - // offset for the next entry. - break - } - } - } - } - - ao.entries = shrinkUint32Slice(ao.entries) - ao.chunkOffsets = shrinkUint32Slice(ao.chunkOffsets) - - return ao -} - -// Get returns a simpleSection with sz=0 if no entry, or sz=ngramAsciiMaxSectionLength -// if the length of the ngram is too large for this type and it should cascade to the next entry. -func (a *asciiNgramOffset) Get(gram ngram) simpleSection { - if gram&ngramAsciiMask != gram { - return simpleSection{} - } - g := uint32(ngramAsciiToPacked(gram) << 11) - - idx := sort.Search(len(a.entries), func(i int) bool { - return a.entries[i] >= g - }) - - if idx == len(a.entries) || a.entries[idx]>>11 != g>>11 { - return simpleSection{} - } - - length := a.entries[idx] & ngramAsciiMaxSectionLength - if length == ngramAsciiMaxSectionLength { - // this ascii ngram's section length is too large to be represented; - // repeate the Get() on the unicode map to get the correct result. - return simpleSection{ - off: 0, - sz: ngramAsciiMaxSectionLength, - } - } - - chunkNum := idx / asciiNgramOffsetChunkLength - chunkBase := chunkNum * asciiNgramOffsetChunkLength - offset := a.chunkOffsets[chunkNum] - for i := chunkBase; i < idx; i++ { - offset += a.entries[i] & ngramAsciiMaxSectionLength - } - - return simpleSection{ - off: offset, - sz: length, - } -} - -func (a *asciiNgramOffset) DumpMap() map[ngram]simpleSection { - m := make(map[ngram]simpleSection, len(a.entries)) - off := uint32(0) - for i, ent := range a.entries { - if i%asciiNgramOffsetChunkLength == 0 { - off = a.chunkOffsets[i/asciiNgramOffsetChunkLength] - } - length := ent & ngramAsciiMaxSectionLength - if length == ngramAsciiMaxSectionLength { - // This entry is an ascii gram with a section too long - // to be represented, so skip the entry. - continue - } - m[ngramAsciiPackedToNgram(ngramAscii(ent>>11))] = simpleSection{ - off: off, - sz: length, - } - off += length - } - return m -} - -func (a *asciiNgramOffset) SizeBytes() int { - return 4*len(a.entries) + 4*len(a.chunkOffsets) -} - -type ngramIndex interface { - Get(gram ngram) simpleSection - DumpMap() map[ngram]simpleSection - SizeBytes() int -} - -// This is a temporary type to wrap two very different implementations of the -// inverted index for the purpose of feature-flagging. We will remove this after -// we enable the b-tree permanently. -// -// Alternatively we could have adapted/extended the interface "ngramIndex". -// However, adapting the existing implementations and their tests to match the -// access pattern of map[ngram][]byte seems more cumbersome than this makeshift -// wrapper. In the end, both ngramIndex and this wrapper will be replaced by a -// concrete type. -type fileNameNgrams struct { - m map[ngram][]byte - bt btreeIndex -} - -func (n fileNameNgrams) GetBlob(ng ngram) ([]byte, error) { - if n.m != nil { - return n.m[ng], nil - } - sec := n.bt.Get(ng) - return n.bt.file.Read(sec.off, sec.sz) -} - -func (n fileNameNgrams) Frequency(ng ngram) uint32 { - if n.m != nil { - return uint32(len(n.m[ng])) - } - return n.bt.Get(ng).sz -} - -func (n fileNameNgrams) SizeBytes() int { - if n.m != nil { - // these slices reference mmap-ed memory - return 12 * len(n.m) - } - return n.bt.SizeBytes() -} diff --git a/ngramoffset_test.go b/ngramoffset_test.go deleted file mode 100644 index a6599d346..000000000 --- a/ngramoffset_test.go +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright 2021 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "fmt" - "math/rand" - "sort" - "testing" -) - -func TestMakeArrayNgramOffset(t *testing.T) { - for n, tc := range []struct { - ngrams []string - offsets []uint32 - }{ - {nil, nil}, - {[]string{"ant", "any", "awl", "big", "bin", "bit", "can", "con"}, []uint32{0, 2, 5, 8, 10, 14, 18, 25, 30}}, - } { - ngrams := []ngram{} - for _, s := range tc.ngrams { - ngrams = append(ngrams, stringToNGram(s)) - } - m := makeArrayNgramOffset(ngrams, tc.offsets) - t.Logf("makeNgramOffset(%v, %v) => %s", tc.ngrams, tc.offsets, &m) - failn := stringToNGram("foo") - if getFail := m.Get(failn); getFail != (simpleSection{}) { - t.Errorf("#%d: Get(%q) got %v, want zero", n, failn, getFail) - } - for i := 0; i < len(tc.offsets)-1; i++ { - want := simpleSection{tc.offsets[i], tc.offsets[i+1] - tc.offsets[i]} - got := m.Get(ngrams[i]) - if want != got { - t.Errorf("#%d.%d: Get(%q) got %v, want %v", n, i, tc.ngrams[i], got, want) - } - failn := ngram(uint64(ngrams[i] - 1)) - if getFail := m.Get(failn); getFail != (simpleSection{}) { - t.Errorf("#%d.%d: Get(%q) got %v, want zero", n, i, failn, getFail) - } - failn = ngram(uint64(ngrams[i] + 1)) - if getFail := m.Get(failn); getFail != (simpleSection{}) { - t.Errorf("#%d.%d: Get(%q) got %v, want zero", n, i, failn, getFail) - } - } - } -} - -func TestMakeCombinedNgramOffset(t *testing.T) { - // The ascii / unicode ngram offset splitting is significantly - // more complicated. Exercise it with a more comprehensive test! - unicodeProbability := 0.2 - ngramCount := 1000 - ngramMap := map[ngram]bool{} - - rng := rand.New(rand.NewSource(42)) - - randRune := func() rune { - if rng.Float64() < unicodeProbability { - return rune(0x100 + rand.Intn(0x80)) // Emoji - } - return rune('A' + rng.Intn('Z'-'A')) // A letter - } - - for len(ngramMap) < ngramCount { - ngramMap[runesToNGram([3]rune{randRune(), randRune(), randRune()})] = true - } - - ngrams := []ngram{} - for ng := range ngramMap { - ngrams = append(ngrams, ng) - } - sort.Slice(ngrams, func(i, j int) bool { return ngrams[i] < ngrams[j] }) - - offset := uint32(0) - offsets := []uint32{0} - - for i := 0; i < len(ngrams); i++ { - // vary - offset += uint32(ngramAsciiMaxSectionLength/2 + rand.Intn(ngramAsciiMaxSectionLength)) - offsets = append(offsets, offset) - } - - m := makeCombinedNgramOffset(ngrams, offsets) - - for i, ng := range ngrams { - want := simpleSection{offsets[i], offsets[i+1] - offsets[i]} - got := m.Get(ng) - if want != got { - t.Errorf("#%d: Get(%q) got %v, want %v", i, ng, got, want) - } - failn := ngram(uint64(ng - 1)) - if getFail := m.Get(failn); !ngramMap[failn] && getFail != (simpleSection{}) { - t.Errorf("#%d: Get(%q) got %v, want zero", i, failn, getFail) - } - failn = ngram(uint64(ng + 1)) - if getFail := m.Get(failn); !ngramMap[failn] && getFail != (simpleSection{}) { - t.Errorf("#%d: Get(%q) got %v, want zero", i, failn, getFail) - } - } - - if t.Failed() || true { - t.Log(ngrams) - t.Log(offsets) - t.Log(m) - } -} - -func (a combinedNgramOffset) String() string { - return fmt.Sprintf("combinedNgramOffset{\n asc: %s,\n uni: %s,\n}", a.asc, a.uni) -} - -func (a *arrayNgramOffset) String() string { - o := "arrayNgramOffset{tops:{" - for i, p := range a.tops { - if i > 0 { - o += ", " - } - if p.top&1023 == 0 { - // only one rune is represented here - o += fmt.Sprintf("%s: %d", string(rune(p.top>>10)), p.off) - } else { - o += fmt.Sprintf("0x%x: %d", p.top>>10, p.off) - } - } - o += "}, bots: {" - for i, p := range a.bots { - if i > 0 { - o += ", " - } - if p < (256 << 21) { - // two ascii-ish runes (probably) - o += fmt.Sprintf("%s%s", string(rune(p>>21)), string(rune(p&runeMask))) - } else { - o += fmt.Sprintf("0x%x", p) - } - } - o += fmt.Sprintf("}, offsets: %v}", a.offsets) - return o -} - -func (a *asciiNgramOffset) String() string { - o := "asciiNgramOffset{entries:{" - for i, e := range a.entries { - ng := ngramAsciiPackedToNgram(ngramAscii(uint32(e) >> 11)) - length := e & ngramAsciiMaxSectionLength - if i > 0 { - o += ", " - } - o += fmt.Sprintf("%s: %d", ng, length) - } - o += fmt.Sprintf("}, chunkOffsets: %v}", a.chunkOffsets) - return o - -} diff --git a/read.go b/read.go index 9f49b3b6a..298b7270b 100644 --- a/read.go +++ b/read.go @@ -288,19 +288,11 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) { return nil, err } - if os.Getenv("ZOEKT_DISABLE_BTREE") != "" { - offsetMap, err := d.readNgrams(toc) - if err != nil { - return nil, err - } - d.ngrams = offsetMap - } else { - bt, err := d.newBtreeIndex(toc.ngramText, toc.postings) - if err != nil { - return nil, err - } - d.ngrams = bt + bt, err := d.newBtreeIndex(toc.ngramText, toc.postings) + if err != nil { + return nil, err } + d.ngrams = bt d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks) if err != nil { @@ -314,11 +306,7 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) { d.fileNameIndex = toc.fileNames.relativeIndex() - if os.Getenv("ZOEKT_DISABLE_BTREE") != "" { - d.fileNameNgrams.m, err = d.readFileNameNgrams(toc) - } else { - d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) - } + d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) if err != nil { return nil, err } @@ -460,26 +448,6 @@ func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, err const ngramEncoding = 8 -func (d *indexData) readNgrams(toc *indexTOC) (combinedNgramOffset, error) { - textContent, err := d.readSectionBlob(toc.ngramText) - if err != nil { - return combinedNgramOffset{}, err - } - postingsIndex := toc.postings.relativeIndex() - - for i := 0; i < len(postingsIndex); i++ { - postingsIndex[i] += toc.postings.data.off - } - - ngrams := make([]ngram, 0, len(textContent)/ngramEncoding) - for i := 0; i < len(textContent); i += ngramEncoding { - ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding])) - ngrams = append(ngrams, ng) - } - - return makeCombinedNgramOffset(ngrams, postingsIndex), nil -} - func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) { bi := btreeIndex{file: d.file} @@ -507,31 +475,6 @@ func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSecti return bi, nil } -func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) { - nameNgramText, err := d.readSectionBlob(toc.nameNgramText) - if err != nil { - return nil, err - } - - fileNamePostingsData, err := d.readSectionBlob(toc.namePostings.data) - if err != nil { - return nil, err - } - - fileNamePostingsIndex := toc.namePostings.relativeIndex() - - fileNameNgrams := make(map[ngram][]byte, len(nameNgramText)/ngramEncoding) - for i := 0; i < len(nameNgramText); i += ngramEncoding { - j := i / ngramEncoding - off := fileNamePostingsIndex[j] - end := fileNamePostingsIndex[j+1] - ng := ngram(binary.BigEndian.Uint64(nameNgramText[i : i+ngramEncoding])) - fileNameNgrams[ng] = fileNamePostingsData[off:end] - } - - return fileNameNgrams, nil -} - func (d *indexData) verify() error { // This is not an exhaustive check: the postings can easily // generate OOB acccesses, and are expensive to check, but this lets us rule out From 40d709a80d41c6333c8b6c5954a797a92635cd88 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 14 Jul 2023 09:52:31 +0200 Subject: [PATCH 2/2] remove intermediate ngramIndex interface and fileNameNgrams This is no longer needed, we always use the btree. Additionally I added some documentation. --- btree.go | 40 +++++++++------------------------------- hititer.go | 4 ---- indexdata.go | 15 ++++----------- read.go | 8 ++------ 4 files changed, 15 insertions(+), 52 deletions(-) diff --git a/btree.go b/btree.go index c56bb3942..80e973913 100644 --- a/btree.go +++ b/btree.go @@ -289,6 +289,7 @@ type btreeIndex struct { postingIndex simpleSection } +// SizeBytes returns how much memory this structure uses in the heap. func (b btreeIndex) SizeBytes() (sz int) { // btree if b.bt != nil { @@ -401,6 +402,9 @@ func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) { return } +// DumpMap is a debug method which returns the btree as an in-memory +// representation. This is how zoekt represents the ngram index in +// google/zoekt. func (b btreeIndex) DumpMap() map[ngram]simpleSection { if b.bt == nil { return nil @@ -428,36 +432,10 @@ func (b btreeIndex) DumpMap() map[ngram]simpleSection { return m } -type ngramIndex interface { - Get(gram ngram) simpleSection - DumpMap() map[ngram]simpleSection - SizeBytes() int -} - -// This is a temporary type to wrap two very different implementations of the -// inverted index for the purpose of feature-flagging. We will remove this after -// we enable the b-tree permanently. -// -// Alternatively we could have adapted/extended the interface "ngramIndex". -// However, adapting the existing implementations and their tests to match the -// access pattern of map[ngram][]byte seems more cumbersome than this makeshift -// wrapper. In the end, both ngramIndex and this wrapper will be replaced by a -// concrete type. +// GetBlob returns the raw encoded offset list for ng. // -// TODO we have remove non-btree code, lets create the concrete type. -type fileNameNgrams struct { - bt btreeIndex -} - -func (n fileNameNgrams) GetBlob(ng ngram) ([]byte, error) { - sec := n.bt.Get(ng) - return n.bt.file.Read(sec.off, sec.sz) -} - -func (n fileNameNgrams) Frequency(ng ngram) uint32 { - return n.bt.Get(ng).sz -} - -func (n fileNameNgrams) SizeBytes() int { - return n.bt.SizeBytes() +// Note: the returned byte slice is mmap backed normally. +func (b btreeIndex) GetBlob(ng ngram) ([]byte, error) { + sec := b.Get(ng) + return b.file.Read(sec.off, sec.sz) } diff --git a/hititer.go b/hititer.go index 86bb0df68..57e90bad7 100644 --- a/hititer.go +++ b/hititer.go @@ -110,10 +110,6 @@ func (d *indexData) newDistanceTrigramIter(ng1, ng2 ngram, dist uint32, caseSens } func (d *indexData) trigramHitIterator(ng ngram, caseSensitive, fileName bool) (hitIterator, error) { - if d.ngrams == nil { - return nil, fmt.Errorf("trigramHitIterator: ngrams=nil") - } - variants := []ngram{ng} if !caseSensitive { variants = generateCaseNgrams(ng) diff --git a/indexdata.go b/indexdata.go index 02c59fb4a..c1cb16928 100644 --- a/indexdata.go +++ b/indexdata.go @@ -33,7 +33,7 @@ type indexData struct { file IndexFile - ngrams ngramIndex + ngrams btreeIndex newlinesStart uint32 newlinesIndex []uint32 @@ -56,7 +56,7 @@ type indexData struct { fileNameContent []byte fileNameIndex []uint32 - fileNameNgrams fileNameNgrams + fileNameNgrams btreeIndex // fileEndSymbol[i] is the index of the first symbol for document i. fileEndSymbol []uint32 @@ -314,9 +314,7 @@ func (d *indexData) memoryUse() int { } sz += 8 * len(d.runeDocSections) sz += 8 * len(d.fileBranchMasks) - if d.ngrams != nil { - sz += d.ngrams.SizeBytes() - } + sz += d.ngrams.SizeBytes() sz += d.fileNameNgrams.SizeBytes() return sz } @@ -349,13 +347,8 @@ func lastMinarg(xs []uint32) uint32 { func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 { if filename { - return data.fileNameNgrams.Frequency(ng) - } - - if data.ngrams == nil { - return 0 + return data.fileNameNgrams.Get(ng).sz } - return data.ngrams.Get(ng).sz } diff --git a/read.go b/read.go index 298b7270b..49434bd4e 100644 --- a/read.go +++ b/read.go @@ -288,11 +288,10 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) { return nil, err } - bt, err := d.newBtreeIndex(toc.ngramText, toc.postings) + d.ngrams, err = d.newBtreeIndex(toc.ngramText, toc.postings) if err != nil { return nil, err } - d.ngrams = bt d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks) if err != nil { @@ -306,7 +305,7 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) { d.fileNameIndex = toc.fileNames.relativeIndex() - d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) + d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) if err != nil { return nil, err } @@ -656,9 +655,6 @@ func PrintNgramStats(r IndexFile) error { return err } - if id.ngrams == nil { - return fmt.Errorf("PrintNgramStats: ngrams=nil") - } var rNgram [3]rune for ngram, ss := range id.ngrams.DumpMap() { rNgram = ngramToRunes(ngram)