From d9b6e1ec4f1bb7e2f52441b91030bd80db608af2 Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Mon, 15 Jan 2024 10:39:39 +0100
Subject: [PATCH] zstd: Tweak noasm FSE decoder (#910)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Benchmark results below. On a different machine, I saw somewhat bigger
speedups when benchmarking a version that has the noasm version of FSE,
but asm seqdec.

The paper-100k benchmarks have some extreme variance in the first 12
runs, so I did 12 more to get a more reliable average. The apparent
regression on that benchmark may be a fluke.

The asm FSE decoder still wins.

goos: linux
goarch: amd64
pkg: github.com/klauspost/compress/zstd
cpu: Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz
                                                    │     old      │                 new                 │
                                                    │     B/s      │     B/s       vs base               │
Decoder_DecoderSmall/kppkn.gtb.zst/buffered-8        252.3Mi ± 0%   251.9Mi ± 0%   -0.12% (p=0.027 n=12)
Decoder_DecoderSmall/kppkn.gtb.zst/unbuffered-8      300.2Mi ± 1%   299.5Mi ± 1%        ~ (p=0.977 n=12)
Decoder_DecoderSmall/geo.protodata.zst/buffered-8    702.1Mi ± 0%   701.4Mi ± 0%        ~ (p=0.551 n=12)
Decoder_DecoderSmall/geo.protodata.zst/unbuffered-8  703.1Mi ± 1%   695.9Mi ± 1%        ~ (p=0.178 n=12)
Decoder_DecoderSmall/plrabn12.txt.zst/buffered-8     200.3Mi ± 0%   200.2Mi ± 0%        ~ (p=0.417 n=12)
Decoder_DecoderSmall/plrabn12.txt.zst/unbuffered-8   294.3Mi ± 2%   294.2Mi ± 2%        ~ (p=0.755 n=12)
Decoder_DecoderSmall/lcet10.txt.zst/buffered-8       237.7Mi ± 0%   237.7Mi ± 0%        ~ (p=0.561 n=12)
Decoder_DecoderSmall/lcet10.txt.zst/unbuffered-8     315.7Mi ± 1%   313.0Mi ± 2%        ~ (p=0.713 n=12)
Decoder_DecoderSmall/asyoulik.txt.zst/buffered-8     205.7Mi ± 3%   212.8Mi ± 0%   +3.43% (p=0.002 n=12)
Decoder_DecoderSmall/asyoulik.txt.zst/unbuffered-8   267.1Mi ± 2%   273.4Mi ± 4%        ~ (p=0.078 n=12)
Decoder_DecoderSmall/alice29.txt.zst/buffered-8      194.8Mi ± 0%   194.8Mi ± 0%        ~ (p=0.876 n=12)
Decoder_DecoderSmall/alice29.txt.zst/unbuffered-8    219.8Mi ± 2%   221.5Mi ± 1%        ~ (p=0.410 n=12)
Decoder_DecoderSmall/html_x_4.zst/buffered-8 
1.512Gi ± 1% 1.550Gi ± 0% +2.49% (p=0.000 n=12) Decoder_DecoderSmall/html_x_4.zst/unbuffered-8 1.149Gi ± 2% 1.196Gi ± 2% +4.09% (p=0.000 n=12) Decoder_DecoderSmall/paper-100k.pdf.zst/buffered-8 2.504Gi ± 1% 2.513Gi ± 0% ~ (p=0.136 n=24) Decoder_DecoderSmall/paper-100k.pdf.zst/unbuffered-8 1.041Gi ± 3% 1.023Gi ± 0% -1.72% (p=0.002 n=24) Decoder_DecoderSmall/fireworks.jpeg.zst/buffered-8 6.315Gi ± 0% 6.349Gi ± 0% +0.53% (p=0.000 n=12) Decoder_DecoderSmall/fireworks.jpeg.zst/unbuffered-8 3.178Gi ± 3% 3.234Gi ± 2% ~ (p=0.713 n=12) Decoder_DecoderSmall/urls.10K.zst/buffered-8 350.9Mi ± 0% 351.0Mi ± 0% ~ (p=1.000 n=12) Decoder_DecoderSmall/urls.10K.zst/unbuffered-8 525.8Mi ± 1% 524.9Mi ± 3% ~ (p=0.671 n=12) Decoder_DecoderSmall/html.zst/buffered-8 562.0Mi ± 0% 561.2Mi ± 0% ~ (p=0.054 n=12) Decoder_DecoderSmall/html.zst/unbuffered-8 537.4Mi ± 2% 543.5Mi ± 2% ~ (p=0.178 n=12) Decoder_DecoderSmall/comp-data.bin.zst/buffered-8 283.4Mi ± 0% 283.8Mi ± 0% ~ (p=0.164 n=12) Decoder_DecoderSmall/comp-data.bin.zst/unbuffered-8 118.0Mi ± 1% 120.3Mi ± 3% ~ (p=0.128 n=12) geomean 505.1Mi 508.1Mi +0.60% --- zstd/fse_decoder_generic.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/zstd/fse_decoder_generic.go b/zstd/fse_decoder_generic.go index 332e51fe44..8adfebb029 100644 --- a/zstd/fse_decoder_generic.go +++ b/zstd/fse_decoder_generic.go @@ -20,10 +20,9 @@ func (s *fseDecoder) buildDtable() error { if v == -1 { s.dt[highThreshold].setAddBits(uint8(i)) highThreshold-- - symbolNext[i] = 1 - } else { - symbolNext[i] = uint16(v) + v = 1 } + symbolNext[i] = uint16(v) } } @@ -35,10 +34,12 @@ func (s *fseDecoder) buildDtable() error { for ss, v := range s.norm[:s.symbolLen] { for i := 0; i < int(v); i++ { s.dt[position].setAddBits(uint8(ss)) - position = (position + step) & tableMask - for position > highThreshold { + for { // lowprob area position = (position + step) & tableMask + if position <= highThreshold { + break + } } } }