From b7e00cb587fb0cdfe29b27a6c9ce43d0cb5a74b3 Mon Sep 17 00:00:00 2001
From: Oliver Powell
Date: Mon, 30 Dec 2024 16:14:11 +0100
Subject: [PATCH] Add variable-sized data blob erasure coding.

---
 internal/common/constants.go                 |   1 +
 internal/common/constants_integration.go     |   3 +-
 internal/erasurecoding/erasurecoding.go      | 167 ++++++++++++++++
 internal/erasurecoding/erasurecoding_test.go | 180 ++++++++++++++++++
 .../erasurecoding/reedsolomon/reedsolomon.go |  35 ++--
 5 files changed, 366 insertions(+), 20 deletions(-)
 create mode 100644 internal/erasurecoding/erasurecoding_test.go

diff --git a/internal/common/constants.go b/internal/common/constants.go
index 9b5f969..8134ede 100644
--- a/internal/common/constants.go
+++ b/internal/common/constants.go
@@ -15,6 +15,7 @@ const (
 	MaxHistoricalTimeslotsForPreimageMeta = 3 // () Maximum number of historical timeslots for preimage metadata
 	SizeOfSegment = 4104 // WG = WP*WE = 4104: The size of a segment in octets.
 	MaxWorkPackageSize = 12 * 1 << 20 // WB = 12 MB: The maximum size of an encoded work-package in octets (including extrinsic data and import implications).
+	ErasureCodingChunkSize = 684 // WE = 684: The basic size of erasure-coded pieces in octets.
 	MaxAllocatedGasAccumulation = 100_000 // GA = 100,000: The gas allocated to invoke a work-report’s Accumulation logic.
 	WorkReportMaxSumOfDependencies = 8 // (J) The maximum sum of dependency items in a work-report.
 )
diff --git a/internal/common/constants_integration.go b/internal/common/constants_integration.go
index 1702966..43b5320 100644
--- a/internal/common/constants_integration.go
+++ b/internal/common/constants_integration.go
@@ -14,7 +14,8 @@ const (
 	MaxTicketExtrinsicSize = 16
 	MaxHistoricalTimeslotsForPreimageMeta = 3
 	SizeOfSegment = 4104
-	MaxWorkPackageSize = 12 * 1 << 20 // 12 MB: The maximum size of an encoded work-package in octets (including extrinsic data and import implications).
+	MaxWorkPackageSize = 12 * 1 << 20
+	ErasureCodingChunkSize = 684
 	MaxAllocatedGasAccumulation = 100_000
 	WorkReportMaxSumOfDependencies = 8
 )
diff --git a/internal/erasurecoding/erasurecoding.go b/internal/erasurecoding/erasurecoding.go
index ab1cdff..d63559a 100644
--- a/internal/erasurecoding/erasurecoding.go
+++ b/internal/erasurecoding/erasurecoding.go
@@ -1 +1,168 @@
 package erasurecoding
+
+import (
+	"bytes"
+	"errors"
+
+	"github.com/eigerco/strawberry/internal/common"
+	"github.com/eigerco/strawberry/internal/erasurecoding/reedsolomon"
+)
+
+// These parameters allow data to be retrieved even when only 1/3 of the
+// validators are available.
+const (
+	OriginalShards = 342 // Number of original data shards.
+	RecoveryShards = common.NumberOfValidators - OriginalShards // Number of recovery data shards.
+	ChunkShardSize = common.ErasureCodingChunkSize / OriginalShards // In bytes.
+)
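// An illustrative sketch of the arithmetic behind these constants, assuming
// common.NumberOfValidators = 1023 as in the graypaper:
//
//	recoveryShards := 1023 - 342 // = 681 recovery shards
//	chunkShardSize := 684 / 342  // = 2 bytes of each chunk per shard
//
// A decoder therefore needs any 342 of the 1023 shards, i.e. just over one
// third of them, matching the availability assumption stated above.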
+
+// Encode transforms input data into striped erasure-coded shards using
+// Reed-Solomon encoding. It splits the data into chunks of
+// common.ErasureCodingChunkSize bytes, then generates OriginalShards +
+// RecoveryShards total shards. If the data cannot be split evenly it is
+// padded with zeros. The resulting shards can tolerate the loss of up to
+// RecoveryShards shards while still being able to reconstruct the original
+// data.
+// Implements equation H.6 from the graypaper (v0.5.3).
+func Encode(data []byte) ([][]byte, error) {
+	if len(data) == 0 {
+		return nil, errors.New("data has length 0")
+	}
+	if len(data) > common.MaxWorkPackageSize {
+		return nil, errors.New("data length too long")
+	}
+
+	encoder, err := reedsolomon.New(OriginalShards, RecoveryShards)
+	if err != nil {
+		return nil, errors.New("error creating encoder")
+	}
+
+	chunks := unzip(common.ErasureCodingChunkSize, data)
+	if len(chunks) == 0 {
+		return nil, errors.New("couldn't unzip data")
+	}
+
+	finalShards := make([][]byte, OriginalShards+RecoveryShards)
+	for i := range finalShards {
+		finalShards[i] = make([]byte, len(chunks)*ChunkShardSize)
+	}
+	for i, c := range chunks {
+		shards, err := encoder.Chunk(c)
+		if err != nil {
+			return nil, err
+		}
+		err = encoder.Encode(shards)
+		if err != nil {
+			return nil, err
+		}
+
+		// Transpose and copy to the result in one go.
+		for j := range finalShards {
+			start := i * ChunkShardSize
+			copy(finalShards[j][start:start+ChunkShardSize], shards[j])
+		}
+	}
+
+	return finalShards, nil
+}
+
+// Decode reconstructs the original data from striped erasure-coded shards
+// using Reed-Solomon decoding. It requires at least OriginalShards shards to
+// successfully recover the data. Missing or corrupted shards can be passed
+// as nil or empty byte slices. The outSize parameter specifies the expected
+// length of the original data.
+// Implements equation H.7 from the graypaper (v0.5.3).
+func Decode(shards [][]byte, outSize int) ([]byte, error) {
+	if reedsolomon.ShardCount(shards) < OriginalShards {
+		return nil, errors.New("too few shards")
+	}
+
+	shardSize := reedsolomon.ShardSize(shards)
+	if shardSize%ChunkShardSize != 0 {
+		return nil, errors.New("invalid shard size")
+	}
+
+	if outSize > shardSize*OriginalShards {
+		return nil, errors.New("out size too long")
+	}
+
+	encoder, err := reedsolomon.New(OriginalShards, RecoveryShards)
+	if err != nil {
+		return nil, errors.New("error creating encoder")
+	}
+
+	noChunks := shardSize / ChunkShardSize
+	chunkShards := make([][][]byte, noChunks)
+	for i := 0; i < len(chunkShards); i++ {
+		chunkShards[i] = make([][]byte, len(shards))
+	}
+
+	for i, s := range shards {
+		if len(s) == 0 {
+			continue
+		}
+		for j := range chunkShards {
+			start := j * ChunkShardSize
+			chunkShards[j][i] = s[start : start+ChunkShardSize]
+		}
+	}
+
+	unzippedChunks := make([][]byte, noChunks)
+	for i, c := range chunkShards {
+		err := encoder.Decode(c)
+		if err != nil {
+			return nil, err
+		}
+		unzippedChunks[i] = bytes.Join(c[0:OriginalShards], []byte(""))
+	}
+
+	return lace(unzippedChunks, outSize), nil
+}
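// A minimal round-trip sketch of the two functions above, assuming this
// package is imported as "erasurecoding" and that "bytes" and "fmt" are
// available; errors are ignored here for brevity:
//
//	data := []byte("example blob")
//	shards, _ := erasurecoding.Encode(data)
//	// Lose the maximum tolerable number of shards; any OriginalShards
//	// survivors are sufficient.
//	for i := 0; i < erasurecoding.RecoveryShards; i++ {
//		shards[i] = nil
//	}
//	recovered, _ := erasurecoding.Decode(shards, len(data))
//	fmt.Println(bytes.Equal(recovered, data)) // prints: true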
+
+// unzip stripes data into n-sized chunks by distributing bytes column-wise.
+// For input data [1,2,3,4,5,6] and n=3, it produces the chunks
+// [[1,3,5], [2,4,6]]. If the input data length is not perfectly divisible by
+// n, the final chunk is padded with zeros; for example, data [1,2,3,4,5] and
+// n=3 produces [[1,3,5], [2,4,0]]. This transformation prepares data for
+// erasure coding by ensuring even distribution across shards. Returns nil if
+// n <= 0.
+// Implements equation H.4 from the graypaper (v0.5.3).
+func unzip(n int, data []byte) [][]byte {
+	if n <= 0 {
+		return nil
+	}
+
+	k := (len(data) + n - 1) / n
+
+	result := make([][]byte, k)
+	for i := range result {
+		result[i] = make([]byte, n)
+	}
+
+	for i, b := range data {
+		result[i%k][i/k] = b
+	}
+
+	return result
+}
+
+// lace combines multiple chunks back into a single byte slice by reading
+// column-wise. It performs the inverse operation of unzip. For input chunks
+// [[1,3,5], [2,4,6]] and outSize=6, it produces [1,2,3,4,5,6]. The outSize
+// parameter determines the length of the final output slice, which should
+// match the length of the original data (this removes any padding that was
+// added by unzip).
+// Implements equation H.5 from the graypaper (v0.5.3).
+func lace(chunks [][]byte, outSize int) []byte {
+	if len(chunks) == 0 {
+		return nil
+	}
+
+	k := len(chunks)
+	result := make([]byte, outSize)
+	for i := range result {
+		result[i] = chunks[i%k][i/k]
+	}
+
+	return result
+}
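// A worked example of the unzip/lace pair above (a sketch; both functions are
// unexported, so this only compiles inside package erasurecoding):
//
//	chunks := unzip(3, []byte{1, 2, 3, 4, 5, 6})
//	// k = ceil(6/3) = 2 chunks of 3 bytes; byte i lands in chunks[i%k][i/k],
//	// so chunks == [][]byte{{1, 3, 5}, {2, 4, 6}}.
//	out := lace(chunks, 6)
//	// lace reads the stripes back column-wise: out == []byte{1, 2, 3, 4, 5, 6}.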
diff --git a/internal/erasurecoding/erasurecoding_test.go b/internal/erasurecoding/erasurecoding_test.go
new file mode 100644
index 0000000..17a2aa2
--- /dev/null
+++ b/internal/erasurecoding/erasurecoding_test.go
@@ -0,0 +1,180 @@
+package erasurecoding
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/eigerco/strawberry/internal/common"
+	"github.com/eigerco/strawberry/internal/testutils"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/exp/rand"
+)
+
+func TestUnzipLace(t *testing.T) {
+	testCases := []struct {
+		name string
+		data []byte
+		n    int
+		want []byte
+	}{
+		{
+			name: "perfectlyDivisible",
+			data: []byte{1, 2, 3, 4, 5, 6},
+			n:    2,
+			want: []byte{1, 2, 3, 4, 5, 6},
+		},
+		{
+			name: "withPadding",
+			data: []byte{1, 2, 3, 4, 5},
+			n:    2,
+			want: []byte{1, 2, 3, 4, 5},
+		},
+		{
+			name: "singles",
+			data: []byte{1, 2, 3},
+			n:    3,
+			want: []byte{1, 2, 3},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			chunks := unzip(tc.n, tc.data)
+			data := lace(chunks, len(tc.data))
+
+			require.Equal(t, tc.want, data)
+		})
+	}
+}
+
+func TestEncodeDecodeRoundTripRandom(t *testing.T) {
+	dataSizes := []int{
+		1,
+		684,
+		1368,
+		2052,
+		4096,
+		4104,
+	}
+
+	seed := uint64(time.Now().UnixNano())
+	t.Logf("using random seed: %d", seed)
+	rng := rand.New(rand.NewSource(seed))
+
+	for _, size := range dataSizes {
+		t.Run(fmt.Sprintf("size%d", size), func(t *testing.T) {
+			// Generate some random data.
+			data := make([]byte, size)
+			_, err := rng.Read(data)
+			require.NoError(t, err)
+
+			// Encode the data.
+			shards, err := Encode(data)
+			require.NoError(t, err)
+			require.Equal(t, OriginalShards+RecoveryShards, len(shards))
+
+			// Randomly remove RecoveryShards shards.
+			for _, i := range rng.Perm(len(shards))[:RecoveryShards] {
+				shards[i] = nil
+			}
+
+			// Decode the data with the remaining shards.
+			dataOut, err := Decode(shards, size)
+			if err != nil {
+				t.Fatalf("Decode failed: %v", err)
+			}
+
+			require.Equal(t, data, dataOut)
+		})
+	}
+}
+
+func TestEncodeDecodeInitialTestVector(t *testing.T) {
+	jsonData, err := os.ReadFile("reedsolomon/vectors/initial.json")
+	require.NoError(t, err)
+
+	tv := TestVector{}
+	err = json.Unmarshal(jsonData, &tv)
+	require.NoError(t, err)
+	data := testutils.MustFromHex(t, tv.Data)
+
+	result, err := Encode(data)
+	require.NoError(t, err)
+
+	decodedData, err := Decode(result, len(data))
+	require.NoError(t, err)
+
+	require.Equal(t, data, decodedData)
+}
+
+func TestEncodeFailureCases(t *testing.T) {
+	tests := []struct {
+		name  string
+		input []byte
+	}{
+		{
+			name:  "emptyData",
+			input: []byte{},
+		},
+		{
+			name:  "dataTooLarge",
+			input: make([]byte, common.MaxWorkPackageSize+1),
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := Encode(tt.input)
+			require.Error(t, err)
+		})
+	}
+}
+
+func TestDecodeFailureCases(t *testing.T) {
+	tests := []struct {
+		name   string
+		shards [][]byte
+	}{
+		{
+			name:   "tooFewShards",
+			shards: testShards(OriginalShards-1, ChunkShardSize),
+		},
+		{
+			name:   "invalidShardSize",
+			shards: testShards(OriginalShards+RecoveryShards, ChunkShardSize+1),
+		},
+		{
+			name:   "nilShards",
+			shards: nil,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := Decode(tt.shards, 100)
+			require.Error(t, err)
+		})
+	}
+}
+
+// testShards creates count test shards of the given size in bytes.
+func testShards(count, size int) [][]byte {
+	shards := make([][]byte, count)
+	for i := range shards {
+		shards[i] = make([]byte, size)
+	}
+	return shards
+}
+
+type TestVector struct {
+	Data    string `json:"data"`
+	Segment struct {
+		Segments []struct {
+			SegmentEC []string `json:"segment_ec"`
+		} `json:"segments"`
+	} `json:"segment"`
+}
diff --git a/internal/erasurecoding/reedsolomon/reedsolomon.go b/internal/erasurecoding/reedsolomon/reedsolomon.go
index 7977a55..eb05087 100644
--- a/internal/erasurecoding/reedsolomon/reedsolomon.go
+++ b/internal/erasurecoding/reedsolomon/reedsolomon.go
@@ -51,7 +51,7 @@ type Encoder struct {
 	recoveryShardsCount int
 }
 
-// Create a new reed solomon enocder with the given original shards count and
+// Create a new Reed-Solomon encoder with the given original shards count and
 // recovery shards count.
 func New(originalShardsCount, recoveryShardsCount int) (*Encoder, error) {
 	if recoveryShardsCount > math.MaxInt-originalShardsCount {
@@ -84,14 +84,13 @@ func (r *Encoder) Chunk(data []byte) ([][]byte, error) {
 	shardSize := (len(data) + r.originalShardsCount - 1) / r.originalShardsCount
 	shards := make([][]byte, r.originalShardsCount+r.recoveryShardsCount)
 
-	// Handle all full-sized chunks
+	// Handle all full-sized chunks.
 	for i := 0; i < r.originalShardsCount-1; i++ {
 		start := i * shardSize
-		end := start + shardSize
-		shards[i] = data[start:end]
+		shards[i] = data[start : start+shardSize]
 	}
 
-	// Handle last chunk with padding
+	// Handle the last chunk with padding.
 	start := (r.originalShardsCount - 1) * shardSize
 	padded := make([]byte, shardSize)
 	copy(padded, data[start:])
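// The Chunk arithmetic above, made concrete (a sketch assuming the parameters
// the erasurecoding package passes in, 342 original and 681 recovery shards):
//
//	shardSize := (684 + 342 - 1) / 342 // = 2: a 684-byte chunk fills 342 two-byte shards exactly
//	shardSize = (683 + 342 - 1) / 342  // = 2 as well: the final shard then carries one zero padding byte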
@@ -106,7 +105,7 @@
 // This can be used to implement C ∶ ⟦Y2⟧342 → ⟦Y2⟧1023 in the graypaper. (Appendix H, v0.5.2-4)
 func (r *Encoder) Encode(
 	shards [][]byte) error {
-	if shardCount(shards[:r.originalShardsCount]) != r.originalShardsCount {
+	if ShardCount(shards[:r.originalShardsCount]) != r.originalShardsCount {
 		return errors.New("too few original shards")
 	}
 
@@ -114,7 +113,7 @@
 		return errors.New("not enough space for recovery shards")
 	}
 
-	shardSize := shardSize(shards)
+	shardSize := ShardSize(shards)
 	if shardSize == 0 || shardSize > MaxShardSize {
 		return errors.New("invalid shard size")
 	}
@@ -144,8 +143,7 @@
 
 	for i := 0; i < r.recoveryShardsCount; i++ {
 		start := i * shardSize
-		end := start + shardSize
-		shards[i+r.originalShardsCount] = recoveryShardsOut[start:end]
+		shards[i+r.originalShardsCount] = recoveryShardsOut[start : start+shardSize]
 	}
 	return nil
 }
@@ -155,10 +153,10 @@ func (r *Encoder) Encode(
 // or a length of zero before decoding.
 // This can be used to implement R ∶ ℘⟨⎧⎩Y2, N1023⎫⎭⟩342 → ⟦Y2⟧342 in the graypaper. (Appendix H, v0.5.2-4)
 func (r *Encoder) Decode(shards [][]byte) error {
-	if shardCount(shards) < r.originalShardsCount {
+	if ShardCount(shards) < r.originalShardsCount {
 		return errors.New("too few shards")
 	}
-	shardSize := shardSize(shards)
+	shardSize := ShardSize(shards)
 	if shardSize == 0 || shardSize > MaxShardSize {
 		return errors.New("invalid shard size")
 	}
@@ -187,7 +185,7 @@
 		}
 	}
 
-	shardCountOriginal := shardCount(shards[:r.originalShardsCount])
+	shardCountOriginal := ShardCount(shards[:r.originalShardsCount])
 	// Shards we already have aren't restored.
 	restoredShardsCount := r.originalShardsCount - shardCountOriginal
 	restoredShards := make([]byte, restoredShardsCount*shardSize)
@@ -212,16 +210,15 @@ func (r *Encoder) Decode(shards [][]byte) error {
 
 	for i := 0; i < restoredShardsCount; i++ {
 		start := i * shardSize
-		end := start + shardSize
 		index := int(restoredShardsIndexes[i])
-		shards[index] = restoredShards[start:end]
+		shards[index] = restoredShards[start : start+shardSize]
 	}
 
 	return nil
 }
 
-// Returns the first non nil or length zero shard length.
-func shardSize(shards [][]byte) int {
+// ShardSize returns the length of the first non-nil, non-empty shard, or 0.
+func ShardSize(shards [][]byte) int {
 	for _, shard := range shards {
 		if len(shard) != 0 {
 			return len(shard)
@@ -231,7 +228,7 @@ func shardSize(shards [][]byte) int {
 }
 
-// Returns the count of non nil or length zero shards.
-func shardCount(shards [][]byte) int {
+// ShardCount returns the number of non-nil, non-empty shards.
+func ShardCount(shards [][]byte) int {
 	count := 0
 	for _, shard := range shards {
 		if len(shard) != 0 {
@@ -242,21 +239,21 @@ func shardCount(shards [][]byte) int {
 }
 
 func init() {
-	// Load the Rust shared library in the init function
+	// Resolve the path of the Rust shared library.
 	libPath, err := getErasurecodingLibaryPath()
 	if err != nil {
 		fmt.Println("Failed to load erasure coding library path:", err)
 		os.Exit(1)
 	}
 
-	// Load the Rust shared library
+	// Load the Rust shared library.
 	lib, err := purego.Dlopen(libPath, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if err != nil {
 		fmt.Println("Failed to load erasure coding library:", err)
 		os.Exit(1)
 	}
 
-	// Register the Rust FFI functions with Go using purego
+	// Register the Rust FFI functions with Go using purego.
 	purego.RegisterLibFunc(&reedSolomonEncode, lib, "reed_solomon_encode")
 	purego.RegisterLibFunc(&reedSolomonDecode, lib, "reed_solomon_decode")
 }
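// For reference, a minimal sketch of the lower-level reedsolomon API this
// patch touches, given some data []byte and with error handling elided; the
// shard counts are the ones the erasurecoding package uses:
//
//	enc, _ := reedsolomon.New(342, 681)
//	shards, _ := enc.Chunk(data) // 1023-slot slice, first 342 slots filled
//	_ = enc.Encode(shards)       // fills shards[342:] with recovery shards
//	shards[0] = nil              // simulate a lost shard
//	_ = enc.Decode(shards)       // restores the missing original shard in place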