From b404607005bc5c9c6388abcfc89317b86df3a520 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Wed, 9 Aug 2023 04:00:45 -0700
Subject: [PATCH] flate: Add limited window compression (#843)

Adds a medium compressor that can operate with a limited window size.
Exposed in gzip in addition to flate for now.

Example sizes:

```
=== RUN   TestFileWindow/32
    gzip_test.go:349: size: 82504 bytes
=== RUN   TestFileWindow/64
    gzip_test.go:349: size: 75350 bytes
=== RUN   TestFileWindow/128
    gzip_test.go:349: size: 70668 bytes
=== RUN   TestFileWindow/256
    gzip_test.go:349: size: 69276 bytes
=== RUN   TestFileWindow/512
    gzip_test.go:349: size: 68327 bytes
=== RUN   TestFileWindow/1024
    gzip_test.go:349: size: 67876 bytes
=== RUN   TestFileWindow/2048
    gzip_test.go:349: size: 40900 bytes
=== RUN   TestFileWindow/4096
    gzip_test.go:349: size: 38684 bytes
=== RUN   TestFileWindow/8192
    gzip_test.go:349: size: 36263 bytes
=== RUN   TestFileWindow/16384
    gzip_test.go:349: size: 35434 bytes
=== RUN   TestFileWindow/32768
    gzip_test.go:349: size: 34654 bytes
--- PASS: TestFileWindow (0.03s)
```
---
 flate/deflate.go     |  29 ++++
 flate/fuzz_test.go   | 189 ++++++++++++--------
 flate/level5.go      | 398 +++++++++++++++++++++++++++++++++++++++++++
 flate/reader_test.go |   1 +
 flate/writer_test.go |   4 +
 gzip/gzip.go         |  21 +++
 gzip/gzip_test.go    |  61 ++++++-
 7 files changed, 631 insertions(+), 72 deletions(-)

diff --git a/flate/deflate.go b/flate/deflate.go
index 5faea0b2b3..de912e187c 100644
--- a/flate/deflate.go
+++ b/flate/deflate.go
@@ -7,6 +7,7 @@ package flate
 
 import (
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"io"
 	"math"
@@ -833,6 +834,12 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.initDeflate()
 		d.fill = (*compressor).fillDeflate
 		d.step = (*compressor).deflateLazy
+	case -level >= MinCustomWindowSize && -level <= MaxCustomWindowSize:
+		d.w.logNewTablePenalty = 7
+		d.fast = &fastEncL5Window{maxOffset: int32(-level), cur: maxStoreBlockSize}
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).storeFast
 	default:
 		return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level)
 	}
@@ -929,6 +936,28 @@ func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
 	return zw, err
 }
 
+// MinCustomWindowSize is the minimum window size that can be sent to NewWriterWindow.
+const MinCustomWindowSize = 32
+
+// MaxCustomWindowSize is the maximum custom window size that can be sent to NewWriterWindow.
+const MaxCustomWindowSize = windowSize
+
+// NewWriterWindow returns a new Writer compressing data with a custom window size.
+// windowSize must be from MinCustomWindowSize to MaxCustomWindowSize.
+func NewWriterWindow(w io.Writer, windowSize int) (*Writer, error) {
+	if windowSize < MinCustomWindowSize {
+		return nil, errors.New("flate: requested window size less than MinCustomWindowSize")
+	}
+	if windowSize > MaxCustomWindowSize {
+		return nil, errors.New("flate: requested window size bigger than MaxCustomWindowSize")
+	}
+	var dw Writer
+	if err := dw.d.init(w, -windowSize); err != nil {
+		return nil, err
+	}
+	return &dw, nil
+}
+
 // A Writer takes data written to it and writes the compressed
 // form of that data to an underlying writer (see NewWriter).
 type Writer struct {
diff --git a/flate/fuzz_test.go b/flate/fuzz_test.go
index 527bad25d1..cdda0f5ce7 100644
--- a/flate/fuzz_test.go
+++ b/flate/fuzz_test.go
@@ -19,6 +19,7 @@ var fuzzStartF = flag.Int("start", HuffmanOnly, "Start fuzzing at this level")
 var fuzzEndF = flag.Int("end", BestCompression, "End fuzzing at this level (inclusive)")
 var fuzzMaxF = flag.Int("max", 1<<20, "Maximum input size")
 var fuzzSLF = flag.Bool("sl", true, "Include stateless encodes")
+var fuzzWindow = flag.Bool("windows", true, "Include windowed encodes")
 
 func TestMain(m *testing.M) {
 	flag.Parse()
@@ -34,6 +35,7 @@ func FuzzEncoding(f *testing.F) {
 	endFuzz := *fuzzEndF
 	maxSize := *fuzzMaxF
 	stateless := *fuzzSLF
+	fuzzWindow := *fuzzWindow
 
 	decoder := NewReader(nil)
 	buf := new(bytes.Buffer)
@@ -52,77 +54,130 @@ func FuzzEncoding(f *testing.F) {
 		}
 		for level := startFuzz; level <= endFuzz; level++ {
 			msg := "level " + strconv.Itoa(level) + ":"
-			buf.Reset()
-			fw := encs[level-startFuzz]
-			fw.Reset(buf)
-			n, err := fw.Write(data)
-			if n != len(data) {
-				t.Fatal(msg + "short write")
-			}
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			err = fw.Close()
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			decoder.(Resetter).Reset(buf, nil)
-			data2, err := io.ReadAll(decoder)
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			if !bytes.Equal(data, data2) {
-				t.Fatal(msg + "not equal")
-			}
-			// Do it again...
-			msg = "level " + strconv.Itoa(level) + " (reset):"
-			buf.Reset()
-			fw.Reset(buf)
-			n, err = fw.Write(data)
-			if n != len(data) {
-				t.Fatal(msg + "short write")
-			}
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			err = fw.Close()
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			decoder.(Resetter).Reset(buf, nil)
-			data2, err = io.ReadAll(decoder)
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			if !bytes.Equal(data, data2) {
-				t.Fatal(msg + "not equal")
-			}
+			t.Run(msg, func(t *testing.T) {
+				buf.Reset()
+				fw := encs[level-startFuzz]
+				fw.Reset(buf)
+				n, err := fw.Write(data)
+				if n != len(data) {
+					t.Fatal(msg + "short write")
+				}
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				err = fw.Close()
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				decoder.(Resetter).Reset(buf, nil)
+				data2, err := io.ReadAll(decoder)
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				if !bytes.Equal(data, data2) {
+					t.Fatal(msg + "not equal")
+				}
+				// Do it again...
+				msg = "level " + strconv.Itoa(level) + " (reset):"
+				buf.Reset()
+				fw.Reset(buf)
+				n, err = fw.Write(data)
+				if n != len(data) {
+					t.Fatal(msg + "short write")
+				}
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				err = fw.Close()
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				decoder.(Resetter).Reset(buf, nil)
+				data2, err = io.ReadAll(decoder)
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				if !bytes.Equal(data, data2) {
+					t.Fatal(msg + "not equal")
+				}
+			})
 		}
-		if !stateless {
-			return
-		}
-		// Split into two and use history...
-		buf.Reset()
-		err := StatelessDeflate(buf, data[:len(data)/2], false, nil)
-		if err != nil {
-			t.Error(err)
-		}
-
-		// Use top half as dictionary...
-		dict := data[:len(data)/2]
-		err = StatelessDeflate(buf, data[len(data)/2:], true, dict)
-		if err != nil {
-			t.Error(err)
-		}
-
-		decoder.(Resetter).Reset(buf, nil)
-		data2, err := io.ReadAll(decoder)
-		if err != nil {
-			t.Error(err)
-		}
-		if !bytes.Equal(data, data2) {
-			//fmt.Printf("want:%x\ngot: %x\n", data1, data2)
-			t.Error("not equal")
-		}
+		if stateless {
+			t.Run("stateless", func(t *testing.T) {
+				// Split into two and use history...
+				buf.Reset()
+				err := StatelessDeflate(buf, data[:len(data)/2], false, nil)
+				if err != nil {
+					t.Error(err)
+				}
+
+				// Use top half as dictionary...
+				dict := data[:len(data)/2]
+				err = StatelessDeflate(buf, data[len(data)/2:], true, dict)
+				if err != nil {
+					t.Error(err)
+				}
+
+				decoder.(Resetter).Reset(buf, nil)
+				data2, err := io.ReadAll(decoder)
+				if err != nil {
+					t.Error(err)
+				}
+				if !bytes.Equal(data, data2) {
+					//fmt.Printf("want:%x\ngot: %x\n", data1, data2)
+					t.Error("not equal")
+				}
+			})
+		}
+		if fuzzWindow {
+			t.Run("window", func(t *testing.T) {
+				msg := "windowed"
+				buf.Reset()
+				fw, err := NewWriterWindow(buf, 1000)
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				fw.Reset(buf)
+				n, err := fw.Write(data)
+				if n != len(data) {
+					t.Fatal(msg + "short write")
+				}
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				err = fw.Close()
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				decoder.(Resetter).Reset(buf, nil)
+				data2, err := io.ReadAll(decoder)
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				if !bytes.Equal(data, data2) {
+					t.Fatal(msg + "not equal")
+				}
+				// Do it again...
+				msg = msg + " (reset):"
+				buf.Reset()
+				fw.Reset(buf)
+				n, err = fw.Write(data)
+				if n != len(data) {
+					t.Fatal(msg + "short write")
+				}
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				err = fw.Close()
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				decoder.(Resetter).Reset(buf, nil)
+				data2, err = io.ReadAll(decoder)
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				if !bytes.Equal(data, data2) {
+					t.Fatal(msg + "not equal")
+				}
+			})
+		}
 	})
 }
diff --git a/flate/level5.go b/flate/level5.go
index 83ef50ba45..1f61ec1829 100644
--- a/flate/level5.go
+++ b/flate/level5.go
@@ -308,3 +308,401 @@ emitRemainder:
 		emitLiteral(dst, src[nextEmit:])
 	}
 }
+
+// fastEncL5Window is a level 5 encoder,
+// but with a custom window size.
+type fastEncL5Window struct {
+	hist      []byte
+	cur       int32
+	maxOffset int32
+	table     [tableSize]tableEntry
+	bTable    [tableSize]tableEntryPrev
+}
+
+func (e *fastEncL5Window) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashShortBytes         = 4
+	)
+	maxMatchOffset := e.maxOffset
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+				v.Prev.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+				if v.Prev.offset <= minOff {
+					v.Prev.offset = 0
+				} else {
+					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+				}
+			}
+			e.bTable[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		const skipLog = 6
+		const doEvery = 1
+
+		nextS := s
+		var l int32
+		var t int32
+		for {
+			nextHashS := hashLen(cv, tableBits, hashShortBytes)
+			nextHashL := hash7(cv, tableBits)
+
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur}
+			e.table[nextHashS] = entry
+			eLong := &e.bTable[nextHashL]
+			eLong.Cur, eLong.Prev = entry, eLong.Cur
+
+			nextHashS = hashLen(next, tableBits, hashShortBytes)
+			nextHashL = hash7(next, tableBits)
+
+			t = lCandidate.Cur.offset - e.cur
+			if s-t < maxMatchOffset {
+				if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+					t2 := lCandidate.Prev.offset - e.cur
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
+						l = e.matchlen(s+4, t+4, src) + 4
+						ml1 := e.matchlen(s+4, t2+4, src) + 4
+						if ml1 > l {
+							t = t2
+							l = ml1
+							break
+						}
+					}
+					break
+				}
+				t = lCandidate.Prev.offset - e.cur
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+					break
+				}
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) {
+				// Found a 4 match...
+				l = e.matchlen(s+4, t+4, src) + 4
+				lCandidate = e.bTable[nextHashL]
+				// Store the next match
+
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+				eLong := &e.bTable[nextHashL]
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+				// If the next long is a candidate, use that...
+				t2 := lCandidate.Cur.offset - e.cur
+				if nextS-t2 < maxMatchOffset {
+					if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+					// If the previous long is a candidate, use that...
+					t2 = lCandidate.Prev.offset - e.cur
+					if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		if l == 0 {
+			// Extend the 4-byte match as long as possible.
+			l = e.matchlenLong(s+4, t+4, src) + 4
+		} else if l == maxMatchLength {
+			l += e.matchlenLong(s+l, t+l, src)
+		}
+
+		// Try to locate a better match by checking the end of best match...
+		if sAt := s + l; l < 30 && sAt < sLimit {
+			// Allow some bytes at the beginning to mismatch.
+			// Sweet spot is 2 or 3 bytes, depending on input.
+			// 3 is only a little better when it helps, and sometimes a lot worse.
+			// The skipped bytes are tested in "extend backwards",
+			// and are still picked up as part of the match if they match.
+			const skipBeginning = 2
+			eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset
+			t2 := eLong - e.cur - l + skipBeginning
+			s2 := s + skipBeginning
+			off := s2 - t2
+			if t2 >= 0 && off < maxMatchOffset && off > 0 {
+				if l2 := e.matchlenLong(s2, t2, src); l2 > l {
+					t = t2
+					l = l2
+					s = s2
+				}
+			}
+		}
+
+		// Extend backwards
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
+		}
+		if debugDeflate {
+			if t >= s {
+				panic(fmt.Sprintln("s-t", s, t))
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", s-t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		// Store every 3rd hash in-between.
+		if true {
+			const hashEvery = 3
+			i := s - l + 1
+			if i < s-1 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// Do a long entry at i+1
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				eLong = &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// We only have enough bits for a short entry at i+2
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
+
+				// Skip one - otherwise we risk hitting 's'
+				i += 4
+				for ; i < s-1; i += hashEvery {
+					cv := load6432(src, i)
+					t := tableEntry{offset: i + e.cur}
+					t2 := tableEntry{offset: t.offset + 1}
+					eLong := &e.bTable[hash7(cv, tableBits)]
+					eLong.Cur, eLong.Prev = t, eLong.Cur
+					e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2
+				}
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first update the hash table at s-1 and at s.
+		x := load6432(src, s-1)
+		o := e.cur + s - 1
+		prevHashS := hashLen(x, tableBits, hashShortBytes)
+		prevHashL := hash7(x, tableBits)
+		e.table[prevHashS] = tableEntry{offset: o}
+		eLong := &e.bTable[prevHashL]
+		eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur
+		cv = x >> 8
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
+
+// Reset the encoding table.
+func (e *fastEncL5Window) Reset() {
+	// We keep the same allocs, since we are compressing the same block sizes.
+	if cap(e.hist) < allocHistory {
+		e.hist = make([]byte, 0, allocHistory)
+	}
+
+	// We offset current position so everything will be out of reach.
+	// If we are above the buffer reset it will be cleared anyway, since len(hist) == 0.
+	if e.cur <= int32(bufferReset) {
+		e.cur += e.maxOffset + int32(len(e.hist))
+	}
+	e.hist = e.hist[:0]
+}
+
+func (e *fastEncL5Window) addBlock(src []byte) int32 {
+	// Check if we have space already.
+	maxMatchOffset := e.maxOffset
+
+	if len(e.hist)+len(src) > cap(e.hist) {
+		if cap(e.hist) == 0 {
+			e.hist = make([]byte, 0, allocHistory)
+		} else {
+			if cap(e.hist) < int(maxMatchOffset*2) {
+				panic("unexpected buffer size")
+			}
+			// Move down
+			offset := int32(len(e.hist)) - maxMatchOffset
+			copy(e.hist[0:maxMatchOffset], e.hist[offset:])
+			e.cur += offset
+			e.hist = e.hist[:maxMatchOffset]
+		}
+	}
+	s := int32(len(e.hist))
+	e.hist = append(e.hist, src...)
+	return s
+}
+
+// matchlen will return the match length between offsets s and t in src.
+// The maximum length returned is maxMatchLength - 4.
+// It is assumed that s > t, that t >= 0 and that s < len(src).
+func (e *fastEncL5Window) matchlen(s, t int32, src []byte) int32 {
+	if debugDecode {
+		if t >= s {
+			panic(fmt.Sprint("t >= s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > e.maxOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxOffset (", e.maxOffset, ")"))
+		}
+	}
+	s1 := int(s) + maxMatchLength - 4
+	if s1 > len(src) {
+		s1 = len(src)
+	}
+
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:s1], src[t:]))
+}
+
+// matchlenLong will return the match length between offsets s and t in src.
+// It is assumed that s > t, that t >= 0 and that s < len(src).
+func (e *fastEncL5Window) matchlenLong(s, t int32, src []byte) int32 {
+	if debugDeflate {
+		if t >= s {
+			panic(fmt.Sprint("t >= s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > e.maxOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxOffset (", e.maxOffset, ")"))
+		}
+	}
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:], src[t:]))
+}
diff --git a/flate/reader_test.go b/flate/reader_test.go
index bc83c1f1d9..37e9b912fe 100644
--- a/flate/reader_test.go
+++ b/flate/reader_test.go
@@ -81,6 +81,7 @@ const (
 	speed    = BestSpeed
 	default_ = DefaultCompression
 	compress = BestCompression
+	oneK     = -1024
 )
 
 func BenchmarkDecodeDigitsSpeed1e4(b *testing.B) { benchmarkDecode(b, digits, speed, 1e4) }
diff --git a/flate/writer_test.go b/flate/writer_test.go
index 766ab6b900..22d006ce34 100644
--- a/flate/writer_test.go
+++ b/flate/writer_test.go
@@ -217,6 +217,10 @@ func BenchmarkEncodeTwainSL1e4(b *testing.B) { benchmarkStatelessEncoder(
 func BenchmarkEncodeTwainSL1e5(b *testing.B) { benchmarkStatelessEncoder(b, twain, 1e5) }
 func BenchmarkEncodeTwainSL1e6(b *testing.B) { benchmarkStatelessEncoder(b, twain, 1e6) }
 
+func BenchmarkEncodeTwain1024Win1e4(b *testing.B) { benchmarkEncoder(b, twain, oneK, 1e4) }
+func BenchmarkEncodeTwain1024Win1e5(b *testing.B) { benchmarkEncoder(b, twain, oneK, 1e5) }
+func BenchmarkEncodeTwain1024Win1e6(b *testing.B) { benchmarkEncoder(b, twain, oneK, 1e6) }
+
 func benchmarkStatelessEncoder(b *testing.B, testfile, n int) {
 	b.SetBytes(int64(n))
 	buf0, err := os.ReadFile(testfiles[testfile])
diff --git a/gzip/gzip.go b/gzip/gzip.go
index 26203851bd..5bc720593e 100644
--- a/gzip/gzip.go
+++ b/gzip/gzip.go
@@ -74,6 +74,27 @@ func NewWriterLevel(w io.Writer, level int) (*Writer, error) {
 	return z, nil
 }
 
+// MinCustomWindowSize is the minimum window size that can be sent to NewWriterWindow.
+const MinCustomWindowSize = flate.MinCustomWindowSize
+
+// MaxCustomWindowSize is the maximum custom window size that can be sent to NewWriterWindow.
+const MaxCustomWindowSize = flate.MaxCustomWindowSize
+
+// NewWriterWindow returns a new Writer compressing data with a custom window size.
+// windowSize must be from MinCustomWindowSize to MaxCustomWindowSize.
+func NewWriterWindow(w io.Writer, windowSize int) (*Writer, error) {
+	if windowSize < MinCustomWindowSize {
+		return nil, errors.New("gzip: requested window size less than MinCustomWindowSize")
+	}
+	if windowSize > MaxCustomWindowSize {
+		return nil, errors.New("gzip: requested window size bigger than MaxCustomWindowSize")
+	}
+
+	z := new(Writer)
+	z.init(w, -windowSize)
+	return z, nil
+}
+
 func (z *Writer) init(w io.Writer, level int) {
 	compressor := z.compressor
 	if level != StatelessCompression {
diff --git a/gzip/gzip_test.go b/gzip/gzip_test.go
index e1ebb5a178..4c7992aa2b 100644
--- a/gzip/gzip_test.go
+++ b/gzip/gzip_test.go
@@ -7,6 +7,7 @@ package gzip
 import (
 	"bufio"
 	"bytes"
+	"fmt"
 	"io"
 	"math/rand"
 	"os"
@@ -252,7 +253,7 @@ func testFile(i, level int, t *testing.T) {
 
 	br := bytes.NewBuffer(testbuf)
 	var buf bytes.Buffer
-	w, err := NewWriterLevel(&buf, DefaultCompression)
+	w, err := NewWriterLevel(&buf, level)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -309,6 +310,56 @@ func TestFile200(t *testing.T) {
 	testFile(200, BestSpeed, t)
 }
 
+func TestFileWindow(t *testing.T) {
+	for sz := MinCustomWindowSize; sz <= MaxCustomWindowSize; sz *= 2 {
+		t.Run(fmt.Sprint(sz), func(t *testing.T) {
+			testFileWindow(1, sz, t)
+		})
+	}
+}
+
+func testFileWindow(i, window int, t *testing.T) {
+	dat, _ := os.ReadFile("testdata/test.json")
+	dl := len(dat)
+	if len(testbuf) != i*dl {
+		// Make results predictable
+		testbuf = make([]byte, i*dl)
+		for j := 0; j < i; j++ {
+			copy(testbuf[j*dl:j*dl+dl], dat)
+		}
+	}
+
+	br := bytes.NewBuffer(testbuf)
+	var buf bytes.Buffer
+	w, err := NewWriterWindow(&buf, window)
+	if err != nil {
+		t.Fatal(err)
+	}
+	n, err := io.Copy(w, br)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if int(n) != len(testbuf) {
+		t.Fatal("Short write:", n, "!=", len(testbuf))
+	}
+	err = w.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Logf("size: %d bytes", buf.Len())
+	r, err := NewReader(&buf)
+	if err != nil {
+		t.Fatal(err.Error())
+	}
+	decoded, err := io.ReadAll(r)
+	if err != nil {
+		t.Fatal(err.Error())
+	}
+	if !bytes.Equal(testbuf, decoded) {
+		t.Errorf("decoded content does not match.")
+	}
+}
+
 func testBigGzip(i int, t *testing.T) {
 	if len(testbuf) != i {
 		// Make results predictable
@@ -385,7 +436,7 @@ func TestDeterministicL7(t *testing.T) { testDeterm(7, t) }
 func TestDeterministicL8(t *testing.T) { testDeterm(8, t) }
 func TestDeterministicL9(t *testing.T) { testDeterm(9, t) }
 
-func testDeterm(i int, t *testing.T) {
+func testDeterm(level int, t *testing.T) {
 	var length = 500000
 	if testing.Short() {
 		length = 100000
@@ -398,7 +449,7 @@ func testDeterm(i int, t *testing.T) {
 
 	br := bytes.NewBuffer(t1)
 	var b1 bytes.Buffer
-	w, err := NewWriterLevel(&b1, i)
+	w, err := NewWriterLevel(&b1, level)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -419,7 +470,7 @@ func testDeterm(i int, t *testing.T) {
 
 	br2 := bytes.NewBuffer(t2)
 	var b2 bytes.Buffer
-	w2, err := NewWriterLevel(&b2, i)
+	w2, err := NewWriterLevel(&b2, level)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -445,7 +496,7 @@ func testDeterm(i int, t *testing.T) {
 	b2b := b2.Bytes()
 
 	if !bytes.Equal(b1b, b2b) {
-		t.Fatalf("Level %d did not produce deterministric result, len(a) = %d, len(b) = %d", i, len(b1b), len(b2b))
+		t.Fatalf("Level %d did not produce deterministic result, len(a) = %d, len(b) = %d", level, len(b1b), len(b2b))
 	}
 }
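
For anyone trying out the new API, here is a minimal round-trip sketch. It is not taken from the patch itself; it assumes the `github.com/klauspost/compress/gzip` import path, and the 1024-byte window is an arbitrary choice within [MinCustomWindowSize, MaxCustomWindowSize]:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"log"

	"github.com/klauspost/compress/gzip"
)

func main() {
	// Repetitive input makes the effect of the history window visible.
	payload := bytes.Repeat([]byte("some fairly repetitive payload. "), 1024)

	// Compress with a 1 KiB window instead of the default 32 KiB.
	var buf bytes.Buffer
	w, err := gzip.NewWriterWindow(&buf, 1024)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := w.Write(payload); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("compressed %d -> %d bytes\n", len(payload), buf.Len())

	// The output is standard gzip, so any gzip reader can decompress it.
	r, err := gzip.NewReader(&buf)
	if err != nil {
		log.Fatal(err)
	}
	got, err := io.ReadAll(r)
	if err != nil {
		log.Fatal(err)
	}
	if !bytes.Equal(got, payload) {
		log.Fatal("round trip mismatch")
	}
}
```

The trade-off is the one shown in the TestFileWindow numbers above: a smaller window bounds how much history the encoder keeps (and how far back matches can reach), at the cost of compression ratio.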