From b7e00cb587fb0cdfe29b27a6c9ce43d0cb5a74b3 Mon Sep 17 00:00:00 2001
From: Oliver Powell
Date: Mon, 30 Dec 2024 16:14:11 +0100
Subject: [PATCH] Add variable-sized data blob erasure coding.

---
 internal/common/constants.go                 |   1 +
 internal/common/constants_integration.go     |   3 +-
 internal/erasurecoding/erasurecoding.go      | 167 ++++++++++++++++
 internal/erasurecoding/erasurecoding_test.go | 180 ++++++++++++++++++
 .../erasurecoding/reedsolomon/reedsolomon.go |  35 ++--
 5 files changed, 366 insertions(+), 20 deletions(-)
 create mode 100644 internal/erasurecoding/erasurecoding_test.go

diff --git a/internal/common/constants.go b/internal/common/constants.go
index 9b5f969..8134ede 100644
--- a/internal/common/constants.go
+++ b/internal/common/constants.go
@@ -15,6 +15,7 @@ const (
 	MaxHistoricalTimeslotsForPreimageMeta = 3 // () Maximum number of historical timeslots for preimage metadata
 	SizeOfSegment = 4104 // WG = WP*WE = 4104: The size of a segment in octets.
 	MaxWorkPackageSize = 12 * 1 << 20 // WB = 12 MB: The maximum size of an encoded work-package in octets (including extrinsic data and import implications).
+	ErasureCodingChunkSize = 684 // WE = 684: The basic size of erasure-coded pieces in octets.
 	MaxAllocatedGasAccumulation = 100_000 // GA = 100,000: The gas allocated to invoke a work-report’s Accumulation logic.
 	WorkReportMaxSumOfDependencies = 8 // (J) The maximum sum of dependency items in a work-report.
 )
diff --git a/internal/common/constants_integration.go b/internal/common/constants_integration.go
index 1702966..43b5320 100644
--- a/internal/common/constants_integration.go
+++ b/internal/common/constants_integration.go
@@ -14,7 +14,8 @@ const (
 	MaxTicketExtrinsicSize = 16
 	MaxHistoricalTimeslotsForPreimageMeta = 3
 	SizeOfSegment = 4104
-	MaxWorkPackageSize = 12 * 1 << 20 // 12 MB: The maximum size of an encoded work-package in octets (including extrinsic data and import implications).
+	MaxWorkPackageSize = 12 * 1 << 20
+	ErasureCodingChunkSize = 684
 	MaxAllocatedGasAccumulation = 100_000
 	WorkReportMaxSumOfDependencies = 8
 )
diff --git a/internal/erasurecoding/erasurecoding.go b/internal/erasurecoding/erasurecoding.go
index ab1cdff..d63559a 100644
--- a/internal/erasurecoding/erasurecoding.go
+++ b/internal/erasurecoding/erasurecoding.go
@@ -1 +1,168 @@
 package erasurecoding
+
+import (
+	"bytes"
+	"errors"
+
+	"github.com/eigerco/strawberry/internal/common"
+	"github.com/eigerco/strawberry/internal/erasurecoding/reedsolomon"
+)
+
+// These parameters allow data to be retrieved even when only 1/3 of the
+// validators are available.
+const (
+	OriginalShards = 342 // Number of original data shards.
+	RecoveryShards = common.NumberOfValidators - OriginalShards // Number of recovery data shards.
+	ChunkShardSize = common.ErasureCodingChunkSize / OriginalShards // In bytes.
+)
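// An illustrative sketch of the arithmetic behind these constants, assuming
// common.NumberOfValidators = 1023 as in the graypaper:
//
//	recoveryShards := 1023 - 342 // = 681 recovery shards
//	chunkShardSize := 684 / 342  // = 2 bytes of each chunk per shard
//
// A decoder therefore needs any 342 of the 1023 shards, i.e. just over one
// third of them, matching the availability assumption stated above.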
+
+// Encode transforms input data into striped erasure-coded shards using
+// Reed-Solomon encoding. It splits the data into chunks of
+// common.ErasureCodingChunkSize bytes, then generates OriginalShards +
+// RecoveryShards total shards. If the data cannot be split evenly it is
+// padded with zeros. The resulting shards can tolerate the loss of up to
+// RecoveryShards shards while still being able to reconstruct the original
+// data.
+// Implements equation H.6 from the graypaper (v0.5.3).
+func Encode(data []byte) ([][]byte, error) {
+	if len(data) == 0 {
+		return nil, errors.New("data has length 0")
+	}
+	if len(data) > common.MaxWorkPackageSize {
+		return nil, errors.New("data length too long")
+	}
+
+	encoder, err := reedsolomon.New(OriginalShards, RecoveryShards)
+	if err != nil {
+		return nil, errors.New("error creating encoder")
+	}
+
+	chunks := unzip(common.ErasureCodingChunkSize, data)
+	if len(chunks) == 0 {
+		return nil, errors.New("couldn't unzip data")
+	}
+
+	finalShards := make([][]byte, OriginalShards+RecoveryShards)
+	for i := range finalShards {
+		finalShards[i] = make([]byte, len(chunks)*ChunkShardSize)
+	}
+	for i, c := range chunks {
+		shards, err := encoder.Chunk(c)
+		if err != nil {
+			return nil, err
+		}
+		err = encoder.Encode(shards)
+		if err != nil {
+			return nil, err
+		}
+
+		// Transpose and copy to the result in one go.
+		for j := range finalShards {
+			start := i * ChunkShardSize
+			copy(finalShards[j][start:start+ChunkShardSize], shards[j])
+		}
+	}
+
+	return finalShards, nil
+}
+
+// Decode reconstructs the original data from striped erasure-coded shards
+// using Reed-Solomon decoding. It requires at least OriginalShards shards to
+// successfully recover the data. Missing or corrupted shards can be passed
+// as nil or empty byte slices. The outSize parameter specifies the expected
+// length of the original data.
+// Implements equation H.7 from the graypaper (v0.5.3).
+func Decode(shards [][]byte, outSize int) ([]byte, error) {
+	if reedsolomon.ShardCount(shards) < OriginalShards {
+		return nil, errors.New("too few shards")
+	}
+
+	shardSize := reedsolomon.ShardSize(shards)
+	if shardSize%ChunkShardSize != 0 {
+		return nil, errors.New("invalid shard size")
+	}
+
+	if outSize > shardSize*OriginalShards {
+		return nil, errors.New("out size too long")
+	}
+
+	encoder, err := reedsolomon.New(OriginalShards, RecoveryShards)
+	if err != nil {
+		return nil, errors.New("error creating encoder")
+	}
+
+	noChunks := shardSize / ChunkShardSize
+	chunkShards := make([][][]byte, noChunks)
+	for i := 0; i < len(chunkShards); i++ {
+		chunkShards[i] = make([][]byte, len(shards))
+	}
+
+	for i, s := range shards {
+		if len(s) == 0 {
+			continue
+		}
+		for j := range chunkShards {
+			start := j * ChunkShardSize
+			chunkShards[j][i] = s[start : start+ChunkShardSize]
+		}
+	}
+
+	unzippedChunks := make([][]byte, noChunks)
+	for i, c := range chunkShards {
+		err := encoder.Decode(c)
+		if err != nil {
+			return nil, err
+		}
+		unzippedChunks[i] = bytes.Join(c[0:OriginalShards], []byte(""))
+	}
+
+	return lace(unzippedChunks, outSize), nil
+}
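// A minimal round-trip sketch of the two functions above, assuming this
// package is imported as "erasurecoding" and that "bytes" and "fmt" are
// available; errors are ignored here for brevity:
//
//	data := []byte("example blob")
//	shards, _ := erasurecoding.Encode(data)
//	// Lose the maximum tolerable number of shards; any OriginalShards
//	// survivors are sufficient.
//	for i := 0; i < erasurecoding.RecoveryShards; i++ {
//		shards[i] = nil
//	}
//	recovered, _ := erasurecoding.Decode(shards, len(data))
//	fmt.Println(bytes.Equal(recovered, data)) // prints: true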
+
+// unzip stripes data into n-sized chunks by distributing bytes column-wise.
+// For input data [1,2,3,4,5,6] and n=3, it produces the chunks
+// [[1,3,5], [2,4,6]]. If the input data length is not perfectly divisible by
+// n, the final chunk is padded with zeros; for example, data [1,2,3,4,5] and
+// n=3 produces [[1,3,5], [2,4,0]]. This transformation prepares data for
+// erasure coding by ensuring even distribution across shards. Returns nil if
+// n <= 0.
+// Implements equation H.4 from the graypaper (v0.5.3).
+func unzip(n int, data []byte) [][]byte {
+	if n <= 0 {
+		return nil
+	}
+
+	k := (len(data) + n - 1) / n
+
+	result := make([][]byte, k)
+	for i := range result {
+		result[i] = make([]byte, n)
+	}
+
+	for i, b := range data {
+		result[i%k][i/k] = b
+	}
+
+	return result
+}
+
+// lace combines multiple chunks back into a single byte slice by reading
+// column-wise. It performs the inverse operation of unzip. For input chunks
+// [[1,3,5], [2,4,6]] and outSize=6, it produces [1,2,3,4,5,6]. The outSize
+// parameter determines the length of the final output slice, which should
+// match the length of the original data (this removes any padding that was
+// added by unzip).
+// Implements equation H.5 from the graypaper (v0.5.3).
+func lace(chunks [][]byte, outSize int) []byte {
+	if len(chunks) == 0 {
+		return nil
+	}
+
+	k := len(chunks)
+	result := make([]byte, outSize)
+	for i := range result {
+		result[i] = chunks[i%k][i/k]
+	}
+
+	return result
+}
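// A worked example of the unzip/lace pair above (a sketch; both functions are
// unexported, so this only compiles inside package erasurecoding):
//
//	chunks := unzip(3, []byte{1, 2, 3, 4, 5, 6})
//	// k = ceil(6/3) = 2 chunks of 3 bytes; byte i lands in chunks[i%k][i/k],
//	// so chunks == [][]byte{{1, 3, 5}, {2, 4, 6}}.
//	out := lace(chunks, 6)
//	// lace reads the stripes back column-wise: out == []byte{1, 2, 3, 4, 5, 6}.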
diff --git a/internal/erasurecoding/erasurecoding_test.go b/internal/erasurecoding/erasurecoding_test.go
new file mode 100644
index 0000000..17a2aa2
--- /dev/null
+++ b/internal/erasurecoding/erasurecoding_test.go
@@ -0,0 +1,180 @@
+package erasurecoding
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/eigerco/strawberry/internal/common"
+	"github.com/eigerco/strawberry/internal/testutils"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/exp/rand"
+)
+
+func TestUnzipLace(t *testing.T) {
+	testCases := []struct {
+		name string
+		data []byte
+		n    int
+		want []byte
+	}{
+		{
+			name: "perfectlyDivisible",
+			data: []byte{1, 2, 3, 4, 5, 6},
+			n:    2,
+			want: []byte{1, 2, 3, 4, 5, 6},
+		},
+		{
+			name: "withPadding",
+			data: []byte{1, 2, 3, 4, 5},
+			n:    2,
+			want: []byte{1, 2, 3, 4, 5},
+		},
+		{
+			name: "singles",
+			data: []byte{1, 2, 3},
+			n:    3,
+			want: []byte{1, 2, 3},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			chunks := unzip(tc.n, tc.data)
+			data := lace(chunks, len(tc.data))
+
+			require.Equal(t, tc.want, data)
+		})
+	}
+}
+
+func TestEncodeDecodeRoundTripRandom(t *testing.T) {
+	dataSizes := []int{
+		1,
+		684,
+		1368,
+		2052,
+		4096,
+		4104,
+	}
+
+	seed := uint64(time.Now().UnixNano())
+	t.Logf("using random seed: %d", seed)
+	rng := rand.New(rand.NewSource(seed))
+
+	for _, size := range dataSizes {
+		t.Run(fmt.Sprintf("size%d", size), func(t *testing.T) {
+			// Generate some random data.
+			data := make([]byte, size)
+			_, err := rng.Read(data)
+			require.NoError(t, err)
+
+			// Encode the data.
+			shards, err := Encode(data)
+			require.NoError(t, err)
+			require.Equal(t, OriginalShards+RecoveryShards, len(shards))
+
+			// Randomly remove RecoveryShards shards.
+			for _, i := range rng.Perm(len(shards))[:RecoveryShards] {
+				shards[i] = nil
+			}
+
+			// Decode the data with the remaining shards.
+			dataOut, err := Decode(shards, size)
+			if err != nil {
+				t.Fatalf("Decode failed: %v", err)
+			}
+
+			require.Equal(t, data, dataOut)
+		})
+	}
+}
+
+func TestEncodeDecodeInitialTestVector(t *testing.T) {
+	jsonData, err := os.ReadFile("reedsolomon/vectors/initial.json")
+	require.NoError(t, err)
+
+	tv := TestVector{}
+	err = json.Unmarshal(jsonData, &tv)
+	require.NoError(t, err)
+	data := testutils.MustFromHex(t, tv.Data)
+
+	result, err := Encode(data)
+	require.NoError(t, err)
+
+	decodedData, err := Decode(result, len(data))
+	require.NoError(t, err)
+
+	require.Equal(t, data, decodedData)
+}
+
+func TestEncodeFailureCases(t *testing.T) {
+	tests := []struct {
+		name  string
+		input []byte
+	}{
+		{
+			name:  "emptyData",
+			input: []byte{},
+		},
+		{
+			name:  "dataTooLarge",
+			input: make([]byte, common.MaxWorkPackageSize+1),
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := Encode(tt.input)
+			require.Error(t, err)
+		})
+	}
+}
+
+func TestDecodeFailureCases(t *testing.T) {
+	tests := []struct {
+		name   string
+		shards [][]byte
+	}{
+		{
+			name:   "tooFewShards",
+			shards: testShards(OriginalShards-1, ChunkShardSize),
+		},
+		{
+			name:   "invalidShardSize",
+			shards: testShards(OriginalShards+RecoveryShards, ChunkShardSize+1),
+		},
+		{
+			name:   "nilShards",
+			shards: nil,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := Decode(tt.shards, 100)
+			require.Error(t, err)
+		})
+	}
+}
+
+// testShards creates count test shards of the given size in bytes.
+func testShards(count, size int) [][]byte {
+	shards := make([][]byte, count)
+	for i := range shards {
+		shards[i] = make([]byte, size)
+	}
+	return shards
+}
+
+type TestVector struct {
+	Data    string `json:"data"`
+	Segment struct {
+		Segments []struct {
+			SegmentEC []string `json:"segment_ec"`
+		} `json:"segments"`
+	} `json:"segment"`
+}
diff --git a/internal/erasurecoding/reedsolomon/reedsolomon.go b/internal/erasurecoding/reedsolomon/reedsolomon.go
index 7977a55..eb05087 100644
--- a/internal/erasurecoding/reedsolomon/reedsolomon.go
+++ b/internal/erasurecoding/reedsolomon/reedsolomon.go
@@ -51,7 +51,7 @@ type Encoder struct {
 	recoveryShardsCount int
 }
 
-// Create a new reed solomon enocder with the given original shards count and
+// Create a new Reed-Solomon encoder with the given original shards count and
 // recovery shards count.
 func New(originalShardsCount, recoveryShardsCount int) (*Encoder, error) {
 	if recoveryShardsCount > math.MaxInt-originalShardsCount {
@@ -84,14 +84,13 @@ func (r *Encoder) Chunk(data []byte) ([][]byte, error) {
 	shardSize := (len(data) + r.originalShardsCount - 1) / r.originalShardsCount
 	shards := make([][]byte, r.originalShardsCount+r.recoveryShardsCount)
 
-	// Handle all full-sized chunks
+	// Handle all full-sized chunks.
 	for i := 0; i < r.originalShardsCount-1; i++ {
 		start := i * shardSize
-		end := start + shardSize
-		shards[i] = data[start:end]
+		shards[i] = data[start : start+shardSize]
 	}
 
-	// Handle last chunk with padding
+	// Handle the last chunk with padding.
 	start := (r.originalShardsCount - 1) * shardSize
 	padded := make([]byte, shardSize)
 	copy(padded, data[start:])
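// The Chunk arithmetic above, made concrete (a sketch assuming the parameters
// the erasurecoding package passes in, 342 original and 681 recovery shards):
//
//	shardSize := (684 + 342 - 1) / 342 // = 2: a 684-byte chunk fills 342 two-byte shards exactly
//	shardSize = (683 + 342 - 1) / 342  // = 2 as well: the final shard then carries one zero padding byte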
@@ -106,7 +105,7 @@
 // This can be used to implement C ∶ ⟦Y2⟧342 → ⟦Y2⟧1023 in the graypaper. (Appendix H, v0.5.2-4)
 func (r *Encoder) Encode(
 	shards [][]byte) error {
-	if shardCount(shards[:r.originalShardsCount]) != r.originalShardsCount {
+	if ShardCount(shards[:r.originalShardsCount]) != r.originalShardsCount {
 		return errors.New("too few original shards")
 	}
 
@@ -114,7 +113,7 @@
 		return errors.New("not enough space for recovery shards")
 	}
 
-	shardSize := shardSize(shards)
+	shardSize := ShardSize(shards)
 	if shardSize == 0 || shardSize > MaxShardSize {
 		return errors.New("invalid shard size")
 	}
@@ -144,8 +143,7 @@
 
 	for i := 0; i < r.recoveryShardsCount; i++ {
 		start := i * shardSize
-		end := start + shardSize
-		shards[i+r.originalShardsCount] = recoveryShardsOut[start:end]
+		shards[i+r.originalShardsCount] = recoveryShardsOut[start : start+shardSize]
 	}
 	return nil
 }
@@ -155,10 +153,10 @@ func (r *Encoder) Encode(
 // or a length of zero before decoding.
 // This can be used to implement R ∶ ℘⟨⎧⎩Y2, N1023⎫⎭⟩342 → ⟦Y2⟧342 in the graypaper. (Appendix H, v0.5.2-4)
 func (r *Encoder) Decode(shards [][]byte) error {
-	if shardCount(shards) < r.originalShardsCount {
+	if ShardCount(shards) < r.originalShardsCount {
 		return errors.New("too few shards")
 	}
-	shardSize := shardSize(shards)
+	shardSize := ShardSize(shards)
 	if shardSize == 0 || shardSize > MaxShardSize {
 		return errors.New("invalid shard size")
 	}
@@ -187,7 +185,7 @@
 		}
 	}
 
-	shardCountOriginal := shardCount(shards[:r.originalShardsCount])
+	shardCountOriginal := ShardCount(shards[:r.originalShardsCount])
 	// Shards we already have aren't restored.
 	restoredShardsCount := r.originalShardsCount - shardCountOriginal
 	restoredShards := make([]byte, restoredShardsCount*shardSize)
@@ -212,16 +210,15 @@ func (r *Encoder) Decode(shards [][]byte) error {
 
 	for i := 0; i < restoredShardsCount; i++ {
 		start := i * shardSize
-		end := start + shardSize
 		index := int(restoredShardsIndexes[i])
-		shards[index] = restoredShards[start:end]
+		shards[index] = restoredShards[start : start+shardSize]
 	}
 
 	return nil
 }
 
-// Returns the first non nil or length zero shard length.
-func shardSize(shards [][]byte) int {
+// ShardSize returns the length of the first non-nil, non-empty shard, or 0.
+func ShardSize(shards [][]byte) int {
 	for _, shard := range shards {
 		if len(shard) != 0 {
 			return len(shard)
@@ -231,7 +228,7 @@ func shardSize(shards [][]byte) int {
 }
 
-// Returns the count of non nil or length zero shards.
-func shardCount(shards [][]byte) int {
+// ShardCount returns the number of non-nil, non-empty shards.
+func ShardCount(shards [][]byte) int {
 	count := 0
 	for _, shard := range shards {
 		if len(shard) != 0 {
@@ -242,21 +239,21 @@ func shardCount(shards [][]byte) int {
 }
 
 func init() {
-	// Load the Rust shared library in the init function
+	// Resolve the path of the Rust shared library.
 	libPath, err := getErasurecodingLibaryPath()
 	if err != nil {
 		fmt.Println("Failed to load erasure coding library path:", err)
 		os.Exit(1)
 	}
 
-	// Load the Rust shared library
+	// Load the Rust shared library.
 	lib, err := purego.Dlopen(libPath, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if err != nil {
 		fmt.Println("Failed to load erasure coding library:", err)
 		os.Exit(1)
 	}
 
-	// Register the Rust FFI functions with Go using purego
+	// Register the Rust FFI functions with Go using purego.
 	purego.RegisterLibFunc(&reedSolomonEncode, lib, "reed_solomon_encode")
 	purego.RegisterLibFunc(&reedSolomonDecode, lib, "reed_solomon_decode")
 }
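// For reference, a minimal sketch of the lower-level reedsolomon API this
// patch touches, given some data []byte and with error handling elided; the
// shard counts are the ones the erasurecoding package uses:
//
//	enc, _ := reedsolomon.New(342, 681)
//	shards, _ := enc.Chunk(data) // 1023-slot slice, first 342 slots filled
//	_ = enc.Encode(shards)       // fills shards[342:] with recovery shards
//	shards[0] = nil              // simulate a lost shard
//	_ = enc.Decode(shards)       // restores the missing original shard in place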