Add subsystem benchmarks for availability-distribution and `bitfiel…

…d-distribution` (availability write) (paritytech#2970) Introduce a new test objective: `DataAvailabilityWrite`. The new benchmark measures the network and CPU usage of `availability-distribution`, `bitfield-distribution` and `availability-store` subsystems from the perspective of a validator node during the process when candidates are made available. Additionally, I refactored the networking emulation to support bandwidth accounting and limits of incoming and outgoing requests. Screenshot of successful run <img width="1293" alt="Screenshot 2024-01-17 at 19 17 44" src="https://github.com/paritytech/polkadot-sdk/assets/54316454/fde11280-e25b-4dc3-9dc9-d4b9752f9b7a"> --------- Signed-off-by: Andrei Sandu <andrei-mihail@parity.io>
gilescope · Jan 25, 2024 · 47e46d1 · 47e46d1
1 parent 73fd8cd
commit 47e46d1
Show file tree

Hide file tree

Showing 22 changed files with 1,965 additions and 799 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -159,6 +159,7 @@ members = [
 	"polkadot/node/gum/proc-macro",
 	"polkadot/node/jaeger",
 	"polkadot/node/malus",
+	"polkadot/node/subsystem-bench",
 	"polkadot/node/metrics",
 	"polkadot/node/network/approval-distribution",
 	"polkadot/node/network/availability-distribution",

diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml
@@ -22,12 +22,16 @@ polkadot-node-subsystem-types = { path = "../subsystem-types" }
 polkadot-node-primitives = { path = "../primitives" }
 polkadot-primitives = { path = "../../primitives" }
 polkadot-node-network-protocol = { path = "../network/protocol" }
-polkadot-availability-recovery = { path = "../network/availability-recovery", features = ["subsystem-benchmarks"] }
+polkadot-availability-recovery = { path = "../network/availability-recovery", features=["subsystem-benchmarks"]}
+polkadot-availability-distribution = { path = "../network/availability-distribution"}
+polkadot-node-core-av-store = { path = "../core/av-store"}
+polkadot-node-core-chain-api = { path = "../core/chain-api"}
+polkadot-availability-bitfield-distribution = { path = "../network/bitfield-distribution"}
 color-eyre = { version = "0.6.1", default-features = false }
-polkadot-overseer = { path = "../overseer" }
+polkadot-overseer =  { path = "../overseer" }
 colored = "2.0.4"
 assert_matches = "1.5"
-async-trait = "0.1.74"
+async-trait = "0.1.57"
 sp-keystore = { path = "../../../substrate/primitives/keystore" }
 sc-keystore = { path = "../../../substrate/client/keystore" }
 sp-core = { path = "../../../substrate/primitives/core" }
@@ -39,14 +43,20 @@ polkadot-erasure-coding = { package = "polkadot-erasure-coding", path = "../../e
 log = "0.4.17"
 env_logger = "0.9.0"
 rand = "0.8.5"
-parity-scale-codec = { version = "3.6.1", features = ["derive", "std"] }
+# `rand` only supports uniform distribution, we need normal distribution for latency.
+rand_distr = "0.4.3"
+bitvec="1.0.1"
+kvdb-memorydb = "0.13.0"
+
+parity-scale-codec = { version = "3.6.1", features = ["std", "derive"] }
 tokio = "1.24.2"
 clap-num = "1.0.2"
 polkadot-node-subsystem-test-helpers = { path = "../subsystem-test-helpers" }
 sp-keyring = { path = "../../../substrate/primitives/keyring" }
 sp-application-crypto = { path = "../../../substrate/primitives/application-crypto" }
 sc-network = { path = "../../../substrate/client/network" }
 sc-service = { path = "../../../substrate/client/service" }
+sp-consensus = { path = "../../../substrate/primitives/consensus/common" }
 polkadot-node-metrics = { path = "../metrics" }
 itertools = "0.11.0"
 polkadot-primitives-test-helpers = { path = "../../primitives/test-helpers" }

diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md
@@ -1,6 +1,6 @@
 # Subsystem benchmark client
 
-Run parachain consensus stress and performance tests on your development machine.
+Run parachain consensus stress and performance tests on your development machine.  
 
 ## Motivation
 
@@ -111,30 +111,28 @@ Commands:
 ```
 
 Note: `test-sequence` is a special test objective that wraps up an arbitrary number of test objectives. It is typically
-used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml).
+ used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml).
 
 ### Standard test options
-
+  
 ```
-Options:
-    --network <NETWORK>                              The type of network to be emulated [default: ideal] [possible
-                                                     values: ideal, healthy, degraded]
-    --n-cores <N_CORES>                              Number of cores to fetch availability for [default: 100]
-    --n-validators <N_VALIDATORS>                    Number of validators to fetch chunks from [default: 500]
-    --min-pov-size <MIN_POV_SIZE>                    The minimum pov size in KiB [default: 5120]
-    --max-pov-size <MAX_POV_SIZE>                    The maximum pov size bytes [default: 5120]
--n, --num-blocks <NUM_BLOCKS>                        The number of blocks the test is going to run [default: 1]
--p, --peer-bandwidth <PEER_BANDWIDTH>                The bandwidth of simulated remote peers in KiB
--b, --bandwidth <BANDWIDTH>                          The bandwidth of our simulated node in KiB
-    --peer-error <PEER_ERROR>                        Simulated conection error ratio [0-100]
-    --peer-min-latency <PEER_MIN_LATENCY>            Minimum remote peer latency in milliseconds [0-5000]
-    --peer-max-latency <PEER_MAX_LATENCY>            Maximum remote peer latency in milliseconds [0-5000]
-    --profile                                        Enable CPU Profiling with Pyroscope
-    --pyroscope-url <PYROSCOPE_URL>                  Pyroscope Server URL [default: http://localhost:4040]
-    --pyroscope-sample-rate <PYROSCOPE_SAMPLE_RATE>  Pyroscope Sample Rate [default: 113]
-    --cache-misses                                   Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind
-                                                     must be in the PATH
--h, --help                                           Print help
+      --network <NETWORK>                              The type of network to be emulated [default: ideal] [possible values: ideal, healthy,
+                                                       degraded]
+      --n-cores <N_CORES>                              Number of cores to fetch availability for [default: 100]
+      --n-validators <N_VALIDATORS>                    Number of validators to fetch chunks from [default: 500]
+      --min-pov-size <MIN_POV_SIZE>                    The minimum pov size in KiB [default: 5120]
+      --max-pov-size <MAX_POV_SIZE>                    The maximum pov size bytes [default: 5120]
+  -n, --num-blocks <NUM_BLOCKS>                        The number of blocks the test is going to run [default: 1]
+  -p, --peer-bandwidth <PEER_BANDWIDTH>                The bandwidth of emulated remote peers in KiB
+  -b, --bandwidth <BANDWIDTH>                          The bandwidth of our node in KiB
+      --connectivity <CONNECTIVITY>                    Emulated peer connection ratio [0-100]
+      --peer-mean-latency <PEER_MEAN_LATENCY>          Mean remote peer latency in milliseconds [0-5000]
+      --peer-latency-std-dev <PEER_LATENCY_STD_DEV>    Remote peer latency standard deviation
+      --profile                                        Enable CPU Profiling with Pyroscope
+      --pyroscope-url <PYROSCOPE_URL>                  Pyroscope Server URL [default: http://localhost:4040]
+      --pyroscope-sample-rate <PYROSCOPE_SAMPLE_RATE>  Pyroscope Sample Rate [default: 113]
+      --cache-misses                                   Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind must be in the PATH
+  -h, --help                                           Print help
 ```
 
 These apply to all test objectives, except `test-sequence` which relies on the values being specified in a file.
@@ -152,8 +150,8 @@ Benchmark availability recovery strategies
 Usage: subsystem-bench data-availability-read [OPTIONS]
 
 Options:
-  -f, --fetch-from-backers  Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU
-                            as we don't need to re-construct from chunks. Tipically this is only faster if nodes
+  -f, --fetch-from-backers  Turbo boost AD Read by fetching the full availability data from backers first. Saves CPU 
+                            as we don't need to re-construct from chunks. Typically this is only faster if nodes 
                             have enough bandwidth
   -h, --help                Print help
 ```
@@ -181,9 +179,9 @@ Let's run an availabilty read test which will recover availability for 10 cores
 node validator network.
 
 ```
- target/testnet/subsystem-bench --n-cores 10 data-availability-read
-[2023-11-28T09:01:59Z INFO  subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120,
-                                                            error = 0, latency = None
+ target/testnet/subsystem-bench --n-cores 10 data-availability-read 
+[2023-11-28T09:01:59Z INFO  subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120, 
+                                                            latency = None
 [2023-11-28T09:01:59Z INFO  subsystem-bench::availability] Generating template candidate index=0 pov_size=5242880
 [2023-11-28T09:01:59Z INFO  subsystem-bench::availability] Created test environment.
 [2023-11-28T09:01:59Z INFO  subsystem-bench::availability] Pre-generating 10 candidates.
@@ -196,8 +194,8 @@ node validator network.
 [2023-11-28T09:02:07Z INFO  subsystem_bench::availability] All blocks processed in 6001ms
 [2023-11-28T09:02:07Z INFO  subsystem_bench::availability] Throughput: 51200 KiB/block
 [2023-11-28T09:02:07Z INFO  subsystem_bench::availability] Block time: 6001 ms
-[2023-11-28T09:02:07Z INFO  subsystem_bench::availability]
-
+[2023-11-28T09:02:07Z INFO  subsystem_bench::availability] 
+    
     Total received from network: 66 MiB
     Total sent to network: 58 KiB
     Total subsystem CPU usage 4.16s

diff --git a/polkadot/node/subsystem-bench/examples/availability_read.yaml b/polkadot/node/subsystem-bench/examples/availability_read.yaml
@@ -1,57 +1,45 @@
 TestConfiguration:
 # Test 1
 - objective: !DataAvailabilityRead
-    fetch_from_backers: false
+    fetch_from_backers: true
   n_validators: 300
   n_cores: 20
   min_pov_size: 5120
   max_pov_size: 5120
   peer_bandwidth: 52428800
   bandwidth: 52428800
   latency:
-    min_latency:
-      secs: 0
-      nanos: 1000000
-    max_latency:
-      secs: 0
-      nanos: 100000000
-  error: 3
+    mean_latency_ms: 100
+    std_dev: 1
   num_blocks: 3
+  connectivity: 90
 
 # Test 2
 - objective: !DataAvailabilityRead
-    fetch_from_backers: false
+    fetch_from_backers: true
   n_validators: 500
   n_cores: 20
   min_pov_size: 5120
   max_pov_size: 5120
   peer_bandwidth: 52428800
   bandwidth: 52428800
   latency:
-    min_latency:
-      secs: 0
-      nanos: 1000000
-    max_latency:
-      secs: 0
-      nanos: 100000000
-  error: 3
+    mean_latency_ms: 100
+    std_dev: 1
   num_blocks: 3
+  connectivity: 90
 
 # Test 3
 - objective: !DataAvailabilityRead
-    fetch_from_backers: false
+    fetch_from_backers: true
   n_validators: 1000
   n_cores: 20
   min_pov_size: 5120
   max_pov_size: 5120
   peer_bandwidth: 52428800
   bandwidth: 52428800
   latency:
-    min_latency:
-      secs: 0
-      nanos: 1000000
-    max_latency:
-      secs: 0
-      nanos: 100000000
-  error: 3
+    mean_latency_ms: 100
+    std_dev: 1
   num_blocks: 3
+  connectivity: 90
diff --git a/polkadot/node/subsystem-bench/examples/availability_write.yaml b/polkadot/node/subsystem-bench/examples/availability_write.yaml
@@ -0,0 +1,15 @@
+TestConfiguration:
+# Test 1kV, 200 cores, max Pov
+- objective: DataAvailabilityWrite
+  n_validators: 1000
+  n_cores: 200
+  max_validators_per_core: 5
+  min_pov_size: 5120
+  max_pov_size: 5120
+  peer_bandwidth: 52428800
+  bandwidth: 52428800
+  latency:
+    mean_latency_ms: 30
+    std_dev: 2.0
+  connectivity: 75
+  num_blocks: 3
diff --git a/polkadot/node/subsystem-bench/src/availability/av_store_helpers.rs b/polkadot/node/subsystem-bench/src/availability/av_store_helpers.rs
@@ -0,0 +1,57 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Polkadot.
+
+// Polkadot is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Polkadot is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
+
+use super::*;
+
+use polkadot_node_metrics::metrics::Metrics;
+
+use polkadot_node_core_av_store::Config;
+use polkadot_node_subsystem_util::database::Database;
+
+use polkadot_node_core_av_store::AvailabilityStoreSubsystem;
+
+mod columns {
+	pub const DATA: u32 = 0;
+	pub const META: u32 = 1;
+	pub const NUM_COLUMNS: u32 = 2;
+}
+
+const TEST_CONFIG: Config = Config { col_data: columns::DATA, col_meta: columns::META };
+
+struct DumbOracle;
+
+impl sp_consensus::SyncOracle for DumbOracle {
+	fn is_major_syncing(&self) -> bool {
+		false
+	}
+
+	fn is_offline(&self) -> bool {
+		unimplemented!("oh no!")
+	}
+}
+
+pub fn new_av_store(dependencies: &TestEnvironmentDependencies) -> AvailabilityStoreSubsystem {
+	let metrics = Metrics::try_register(&dependencies.registry).unwrap();
+
+	AvailabilityStoreSubsystem::new(test_store(), TEST_CONFIG, Box::new(DumbOracle), metrics)
+}
+
+fn test_store() -> Arc<dyn Database> {
+	let db = kvdb_memorydb::create(columns::NUM_COLUMNS);
+	let db =
+		polkadot_node_subsystem_util::database::kvdb_impl::DbAdapter::new(db, &[columns::META]);
+	Arc::new(db)
+}