Add variable sized data blob erasure coding.

eigerco · Dec 30, 2024 · b7e00cb · b7e00cb
1 parent f6ef8e1
commit b7e00cb
Show file tree

Hide file tree

Showing 5 changed files with 366 additions and 20 deletions.
diff --git a/internal/common/constants.go b/internal/common/constants.go
@@ -15,6 +15,7 @@ const (
 	MaxHistoricalTimeslotsForPreimageMeta        = 3                                // () Maximum number of historical timeslots for preimage metadata
 	SizeOfSegment                                = 4104                             // WG = WP*WE = 4104: The size of a segment in octets.
 	MaxWorkPackageSize                           = 12 * 1 << 20                     // WB = 12 MB: The maximum size of an encoded work-package in octets (including extrinsic data and import implications).
+	ErasureCodingChunkSize                       = 684                              // WE = 684: The basic size of erasure-coded pieces in octets.
 	MaxAllocatedGasAccumulation                  = 100_000                          // GA = 100,000: The gas allocated to invoke a work-report’s Accumulation logic.
 	WorkReportMaxSumOfDependencies               = 8                                // (J) The maximum sum of dependency items in a work-report.
 )
diff --git a/internal/common/constants_integration.go b/internal/common/constants_integration.go
@@ -14,7 +14,8 @@ const (
 	MaxTicketExtrinsicSize                       = 16
 	MaxHistoricalTimeslotsForPreimageMeta        = 3
 	SizeOfSegment                                = 4104
-	MaxWorkPackageSize                           = 12 * 1 << 20 // 12 MB: The maximum size of an encoded work-package in octets (including extrinsic data and import implications).
+	MaxWorkPackageSize                           = 12 * 1 << 20
+	ErasureCodingChunkSize                       = 684
 	MaxAllocatedGasAccumulation                  = 100_000
 	WorkReportMaxSumOfDependencies               = 8
 )
diff --git a/internal/erasurecoding/erasurecoding.go b/internal/erasurecoding/erasurecoding.go
@@ -1 +1,168 @@
 package erasurecoding
+
+import (
+	"bytes"
+	"errors"
+
+	"github.com/eigerco/strawberry/internal/common"
+	"github.com/eigerco/strawberry/internal/erasurecoding/reedsolomon"
+)
+
+// These parameters allow data to be retrieved even when only 1/3 of the
+// validators are available.
+const (
+	OriginalShards = 342                                            // Number of original data shards.
+	RecoveryShards = common.NumberOfValidators - OriginalShards     // Number of recovery data shards.
+	ChunkShardSize = common.ErasureCodingChunkSize / OriginalShards // In bytes.
+)
+
+// Encode transforms input data into striped erasure-coded shards using
+// Reed-Solomon encoding. It splits the data into chunks of ChunkSize bytes,
+// then generates OriginalShards + RecoveryShards total shards. If data cannot
+// be evenly split it will be padded with zeros. The resulting shards can
+// tolerate the loss of up to RecoveryShards number of shards while still being
+// able to reconstruct the original data.
+// Implements equation H.6 from the graypaper (v0.5.3).
+func Encode(data []byte) ([][]byte, error) {
+	if len(data) == 0 {
+		return nil, errors.New("data has length 0")
+	}
+	if len(data) > common.MaxWorkPackageSize {
+		return nil, errors.New("data length too long")
+	}
+
+	encoder, err := reedsolomon.New(OriginalShards, RecoveryShards)
+	if err != nil {
+		return nil, errors.New("error creating encoder")
+	}
+
+	chunks := unzip(common.ErasureCodingChunkSize, data)
+	if len(chunks) == 0 {
+		return nil, errors.New("couldn't unzip data")
+	}
+
+	finalShards := make([][]byte, OriginalShards+RecoveryShards)
+	for i := range finalShards {
+		finalShards[i] = make([]byte, len(chunks)*ChunkShardSize)
+	}
+	for i, c := range chunks {
+		shards, err := encoder.Chunk(c)
+		if err != nil {
+			return nil, err
+		}
+		err = encoder.Encode(shards)
+		if err != nil {
+			return nil, err
+		}
+
+		// Transpose and copy to result in one go.
+		for j := range finalShards {
+			start := i * ChunkShardSize
+			copy(finalShards[j][start:start+ChunkShardSize], shards[j])
+		}
+	}
+
+	return finalShards, nil
+}
+
+// Decode reconstructs the original data from striped erasure-coded shards using
+// Reed-Solomon decoding. It requires at least OriginalShards number of shards
+// to successfully recover the data. Missing or corrupted shards can be passed
+// as nil or empty byte slices. The outSize parameter specifies the expected
+// length of the original data.
+// Implements equation H.7 from the graypaper (v0.5.3).
+func Decode(shards [][]byte, outSize int) ([]byte, error) {
+	if reedsolomon.ShardCount(shards) < OriginalShards {
+		return nil, errors.New("too few shards")
+	}
+
+	shardSize := reedsolomon.ShardSize(shards)
+	if shardSize%ChunkShardSize != 0 {
+		return nil, errors.New("invalid shard size")
+	}
+
+	if outSize > shardSize*OriginalShards {
+		return nil, errors.New("out size too long")
+	}
+
+	encoder, err := reedsolomon.New(OriginalShards, RecoveryShards)
+	if err != nil {
+		return nil, errors.New("error creating encoder")
+	}
+
+	noChunks := shardSize / ChunkShardSize
+	chunkShards := make([][][]byte, noChunks)
+	for i := 0; i < len(chunkShards); i++ {
+		chunkShards[i] = make([][]byte, len(shards))
+	}
+
+	for i, s := range shards {
+		if len(s) == 0 {
+			continue
+		}
+		for j := range chunkShards {
+			start := j * ChunkShardSize
+			chunkShards[j][i] = s[start : start+ChunkShardSize]
+		}
+	}
+
+	unzippedChunks := make([][]byte, noChunks)
+	for i, c := range chunkShards {
+		err := encoder.Decode(c)
+		if err != nil {
+			return nil, err
+		}
+		unzippedChunks[i] = bytes.Join(c[0:OriginalShards], []byte(""))
+	}
+
+	return lace(unzippedChunks, outSize), nil
+}
+
+// unzip stripes data into n-sized chunks by distributing bytes column-wise.
+// For input data [1,2,3,4,5,6] and n=2, it produces chunks: [[1,3,5], [2,4,6]]
+// If the input data length is not perfectly divisible by n, the final chunk
+// will be padded with zeros. For example, data [1,2,3,4,5] and n=2 produces:
+// [[1,3,5], [2,4,0]] This transformation prepares data for erasure coding by
+// ensuring even distribution across shards. Returns nil if n <= 0.
+// Implements equation H.4 from the graypaper (v0.5.3).
+func unzip(n int, data []byte) [][]byte {
+
+	if n <= 0 {
+		return nil
+	}
+
+	k := (len(data) + n - 1) / n
+
+	result := make([][]byte, k)
+
+	for i := range result {
+		result[i] = make([]byte, n)
+	}
+
+	for i, b := range data {
+		result[i%k][i/k] = b
+	}
+
+	return result
+}
+
+// lace combines multiple chunks back into a single byte slice by reading
+// column-wise. It performs the inverse operation of unzip. For input chunks
+// [[1,3,5], [2,4,6]] and outSize=6, it produces: [1,2,3,4,5,6]. The outSize
+// parameter determines the length of the final output slice which should match
+// the length of the original data (this removes any padding that was added by
+// zip).
+// Implements equation H.5 from the graypaper (v0.5.3).
+func lace(chunks [][]byte, outSize int) []byte {
+	if len(chunks) == 0 {
+		return nil
+	}
+
+	k := len(chunks)
+	result := make([]byte, outSize)
+	for i := range result {
+		result[i] = chunks[i%k][i/k]
+	}
+
+	return result
+}
diff --git a/internal/erasurecoding/erasurecoding_test.go b/internal/erasurecoding/erasurecoding_test.go
@@ -0,0 +1,180 @@
+package erasurecoding
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/eigerco/strawberry/internal/common"
+	"github.com/eigerco/strawberry/internal/testutils"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/exp/rand"
+)
+
+func TestUnzipLace(t *testing.T) {
+	testCases := []struct {
+		name string
+		data []byte
+		n    int
+		want []byte
+	}{
+		{
+			name: "perfectlyDivsible",
+			data: []byte{1, 2, 3, 4, 5, 6},
+			n:    2,
+			want: []byte{1, 2, 3, 4, 5, 6},
+		},
+		{
+			name: "withPadding",
+			data: []byte{1, 2, 3, 4, 5},
+			n:    2,
+			want: []byte{1, 2, 3, 4, 5},
+		},
+		{
+			name: "singles",
+			data: []byte{1, 2, 3},
+			n:    3,
+			want: []byte{1, 2, 3},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			chunks := unzip(tc.n, tc.data)
+			data := lace(chunks, len(tc.data))
+
+			require.Equal(t, data, tc.want)
+		})
+	}
+}
+
+func TestEncodeDecodeRoundTripRandom(t *testing.T) {
+	dataSizes := []int{
+		1,
+		684,
+		1368,
+		2052,
+		4096,
+		4104,
+	}
+
+	seed := uint64(time.Now().UnixNano())
+	t.Logf("using random seed: %d", seed)
+	rng := rand.New(rand.NewSource(seed))
+
+	for _, size := range dataSizes {
+		t.Run(fmt.Sprintf("size%d", size), func(t *testing.T) {
+			// Generate some random data.
+			data := make([]byte, size)
+			_, err := rng.Read(data)
+			require.NoError(t, err)
+
+			// Encode the data
+			shards, err := Encode(data)
+			require.NoError(t, err)
+			require.Equal(t, len(shards), OriginalShards+RecoveryShards)
+
+			// Randomly remove RecoveryShards shards.
+			indices := rng.Perm(RecoveryShards)
+			for i := range indices {
+				shards[i] = nil
+			}
+
+			// Decode the data with the remaining shards.
+			dataOut, err := Decode(shards, size)
+			if err != nil {
+				t.Fatalf("Decode failed: %v", err)
+			}
+
+			require.Equal(t, data, dataOut)
+		})
+	}
+}
+
+func TestEncodeDecodeInitialTestVector(t *testing.T) {
+	jsonData, err := os.ReadFile("reedsolomon/vectors/initial.json")
+	require.NoError(t, err)
+
+	tv := TestVector{}
+	err = json.Unmarshal(jsonData, &tv)
+	require.NoError(t, err)
+	data := testutils.MustFromHex(t, tv.Data)
+
+	result, err := Encode(data)
+	require.NoError(t, err)
+
+	decodedData, err := Decode(result, len(data))
+	require.NoError(t, err)
+
+	require.Equal(t, decodedData, data)
+}
+func TestEncodeFailureCases(t *testing.T) {
+	tests := []struct {
+		name  string
+		input []byte
+	}{
+		{
+			name:  "emptyData",
+			input: []byte{},
+		},
+		{
+			name:  "dataTooLarge",
+			input: make([]byte, common.MaxWorkPackageSize+1),
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := Encode(tt.input)
+			require.Error(t, err)
+		})
+	}
+}
+
+func TestDecodeFailureCases(t *testing.T) {
+	tests := []struct {
+		name    string
+		shards  [][]byte
+		outSize int
+	}{
+		{
+			name:   "tooFewShards",
+			shards: testShards(OriginalShards-1, ChunkShardSize),
+		},
+		{
+			name:   "invalidShardSize",
+			shards: testShards(OriginalShards+RecoveryShards, ChunkShardSize+1),
+		},
+		{
+			name:   "nilShards",
+			shards: nil,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := Decode(tt.shards, 100)
+			require.Error(t, err)
+		})
+	}
+}
+
+// Helper function to create test shards with specific size.
+func testShards(count, size int) [][]byte {
+	shards := make([][]byte, count)
+	for i := range shards {
+		shards[i] = make([]byte, size)
+	}
+	return shards
+}
+
+type TestVector struct {
+	Data    string `json:"data"`
+	Segment struct {
+		Segments []struct {
+			SegmentEC []string `json:"segment_ec"`
+		} `json:"segments"`
+	} `json:"segment"`
+}