Skip to content

Commit

Permalink
Better issubset performance (#514)
Browse files Browse the repository at this point in the history
* Benchmarking issubset

* Improving set.issubset performance

* Improve readability of hashtable.count
  • Loading branch information
marco6 authored Oct 13, 2023
1 parent 10651d5 commit 47c85ba
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 5 deletions.
52 changes: 52 additions & 0 deletions starlark/hashtable.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package starlark

import (
"fmt"
"math/big"
_ "unsafe" // for go:linkname hack
)

Expand Down Expand Up @@ -200,6 +201,57 @@ func (ht *hashtable) lookup(k Value) (v Value, found bool, err error) {
return None, false, nil // not found
}

// count returns the number of distinct elements of iter that are elements of ht.
//
// Each key of ht is counted at most once, no matter how many times iter
// yields an equal value. Iteration stops early once every key of ht has
// been seen, since the count cannot grow any further.
//
// It returns (0, nil) for a table with no storage allocated, and (0, err)
// if an element of iter is unhashable or a key comparison fails.
func (ht *hashtable) count(iter Iterator) (int, error) {
	if ht.table == nil {
		return 0, nil // empty
	}

	var k Value
	count := 0

	// Use a bitset per table entry to record seen elements of ht.
	// Elements are identified by their bucket number and index within the bucket.
	// Each bitset gets one word initially, but may grow. The three-index
	// slice caps each bitset's capacity at its own word, so a bitset that
	// needs more room cannot extend into its neighbour's storage.
	storage := make([]big.Word, len(ht.table))
	bitsets := make([]big.Int, len(ht.table))
	for i := range bitsets {
		bitsets[i].SetBits(storage[i : i+1 : i+1])
	}

	for iter.Next(&k) && count != int(ht.len) {
		h, err := k.Hash()
		if err != nil {
			return 0, err // unhashable
		}
		if h == 0 {
			h = 1 // zero is reserved
		}

		// Inspect each bucket in the bucket list.
		bucketID := h & uint32(len(ht.table)-1)
		seen := &bitsets[bucketID]
		i := 0 // position of the current bucket within its chain
	chain:
		for p := &ht.table[bucketID]; p != nil; p = p.next {
			for j := range p.entries {
				e := &p.entries[j]
				if e.hash != h {
					continue
				}
				eq, err := Equal(k, e.key)
				if err != nil {
					return 0, err
				}
				if !eq {
					continue
				}

				// NOTE(review): the shift assumes a bucket holds at most
				// 8 entries; confirm against the bucket definition.
				bitIndex := i<<3 + j
				if seen.Bit(bitIndex) == 0 {
					seen.SetBit(seen, bitIndex, 1)
					count++
				}

				// Keys in a hash table are presumably unique, so no
				// other entry can equal k: stop scanning this chain.
				break chain
			}
			i++
		}
	}

	return count, nil
}

// Items returns all the items in the map (as key/value pairs) in insertion order.
func (ht *hashtable) items() []Tuple {
items := make([]Tuple, 0, ht.len)
Expand Down
14 changes: 14 additions & 0 deletions starlark/hashtable_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,17 @@ func testHashtable(tb testing.TB, sane map[int]bool) {
}
}
}

// TestHashtableCount fills a table with the integers [0, n) and checks
// that count reports n when handed an iterator over that same range.
func TestHashtableCount(t *testing.T) {
	const n = 1000

	table := new(hashtable)
	for i := 0; i < n; i++ {
		table.insert(MakeInt(i), None)
	}

	got, err := table.count(rangeValue{0, n, 1, n}.Iterate())
	if err != nil {
		t.Error(err)
		return
	}
	if got != n {
		t.Errorf("count doesn't match: expected %d got %d", n, got)
	}
}
39 changes: 39 additions & 0 deletions starlark/testdata/benchmark.star
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,42 @@ def bench_to_json_deep_list(b):
"Benchmark json.encode builtin with a list of deep input"
for _ in range(b.n):
json.encode(deep)

def bench_issubset_unique_large_small(b):
    "Benchmark set.issubset: a 10000-element set against a 1000-element range without duplicates."
    members = set(range(10000))
    for _ in range(b.n):
        members.issubset(range(1000))

def bench_issubset_unique_small_large(b):
    "Benchmark set.issubset: a 1000-element set against a 10000-element range without duplicates."
    members = set(range(1000))
    for _ in range(b.n):
        members.issubset(range(10000))

def bench_issubset_unique_same(b):
    "Benchmark set.issubset: a 1000-element set against a range holding the same 1000 values."
    members = set(range(1000))
    for _ in range(b.n):
        members.issubset(range(1000))

def bench_issubset_duplicate_large_small(b):
    "Benchmark set.issubset: a 10000-element set against a 1000-item list with repeated values."
    s = set(range(10000))

    # 200 distinct values, each repeated 5 times.
    l = list(range(200)) * 5
    for _ in range(b.n):
        # Pass the duplicate-laden list; passing range(1000) here would
        # measure the unique-element case and leave l unused.
        s.issubset(l)

def bench_issubset_duplicate_small_large(b):
    "Benchmark set.issubset: a 1000-element set against a 10000-item list with repeated values."
    members = set(range(1000))

    # 2000 distinct values, each repeated 5 times.
    candidates = list(range(2000)) * 5
    for _ in range(b.n):
        members.issubset(candidates)

def bench_issubset_duplicate_same(b):
    "Benchmark set.issubset: a 1000-element set against a 1000-item list with repeated values."
    members = set(range(1000))

    # 200 distinct values, each repeated 5 times.
    candidates = list(range(200)) * 5
    for _ in range(b.n):
        members.issubset(candidates)
8 changes: 3 additions & 5 deletions starlark/value.go
Original file line number Diff line number Diff line change
Expand Up @@ -1236,13 +1236,11 @@ func (s *Set) IsSuperset(other Iterator) (bool, error) {
}

func (s *Set) IsSubset(other Iterator) (bool, error) {
otherset, err := setFromIterator(other)
if err != nil {
if count, err := s.ht.count(other); err != nil {
return false, err
} else {
return count == s.Len(), nil
}
iter := s.Iterate()
defer iter.Done()
return otherset.IsSuperset(iter)
}

func (s *Set) Intersection(other Iterator) (Value, error) {
Expand Down

0 comments on commit 47c85ba

Please sign in to comment.