Skip to content

Commit

Permalink
remove ngram offset code (#616)
Browse files Browse the repository at this point in the history
We have been running the btree as the default for many months. We worried
about a performance hit, but it never happened. During some recent local
testing I noticed the btree actually interacts with the disk more
efficiently. So the old code both uses more memory and is slower; let's
just remove it.

Test Plan: go test ./...
  • Loading branch information
keegancsmith authored Jul 17, 2023
1 parent f9d3a0e commit 3d0bdd5
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 647 deletions.
12 changes: 12 additions & 0 deletions btree.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ type btreeIndex struct {
postingIndex simpleSection
}

// SizeBytes returns how much memory this structure uses in the heap.
func (b btreeIndex) SizeBytes() (sz int) {
// btree
if b.bt != nil {
Expand Down Expand Up @@ -401,6 +402,9 @@ func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) {
return
}

// DumpMap is a debug method which returns the btree as an in-memory
// representation. This is how zoekt represents the ngram index in
// google/zoekt.
func (b btreeIndex) DumpMap() map[ngram]simpleSection {
if b.bt == nil {
return nil
Expand All @@ -427,3 +431,11 @@ func (b btreeIndex) DumpMap() map[ngram]simpleSection {

return m
}

// GetBlob looks up ng via b.Get and reads the section it resolves to from
// b.file, yielding the raw encoded offset list for that ngram.
//
// Note: the returned byte slice is mmap backed normally.
func (b btreeIndex) GetBlob(ng ngram) ([]byte, error) {
	section := b.Get(ng)
	offset, size := section.off, section.sz
	return b.file.Read(offset, size)
}
4 changes: 0 additions & 4 deletions hititer.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,6 @@ func (d *indexData) newDistanceTrigramIter(ng1, ng2 ngram, dist uint32, caseSens
}

func (d *indexData) trigramHitIterator(ng ngram, caseSensitive, fileName bool) (hitIterator, error) {
if d.ngrams == nil {
return nil, fmt.Errorf("trigramHitIterator: ngrams=nil")
}

variants := []ngram{ng}
if !caseSensitive {
variants = generateCaseNgrams(ng)
Expand Down
15 changes: 4 additions & 11 deletions indexdata.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ type indexData struct {

file IndexFile

ngrams ngramIndex
ngrams btreeIndex

newlinesStart uint32
newlinesIndex []uint32
Expand All @@ -56,7 +56,7 @@ type indexData struct {

fileNameContent []byte
fileNameIndex []uint32
fileNameNgrams fileNameNgrams
fileNameNgrams btreeIndex

// fileEndSymbol[i] is the index of the first symbol for document i.
fileEndSymbol []uint32
Expand Down Expand Up @@ -314,9 +314,7 @@ func (d *indexData) memoryUse() int {
}
sz += 8 * len(d.runeDocSections)
sz += 8 * len(d.fileBranchMasks)
if d.ngrams != nil {
sz += d.ngrams.SizeBytes()
}
sz += d.ngrams.SizeBytes()
sz += d.fileNameNgrams.SizeBytes()
return sz
}
Expand Down Expand Up @@ -349,13 +347,8 @@ func lastMinarg(xs []uint32) uint32 {

func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
if filename {
return data.fileNameNgrams.Frequency(ng)
}

if data.ngrams == nil {
return 0
return data.fileNameNgrams.Get(ng).sz
}

return data.ngrams.Get(ng).sz
}

Expand Down
Loading

0 comments on commit 3d0bdd5

Please sign in to comment.