Skip to content

Commit

Permalink
Add subsystem benchmarks for availability-distribution and `bitfiel…
Browse files Browse the repository at this point in the history
…d-distribution` (availability write) (paritytech#2970)

Introduce a new test objective : `DataAvailabilityWrite`.

The new benchmark measures the network and cpu usage of
`availability-distribution`, `bitfield-distribution` and
`availability-store` subsystems from the perspective of a validator node
during the process when candidates are made available.

Additionally I refactored the networking emulation to support bandwidth
accounting and limits of incoming and outgoing requests.

Screenshot of successful run


<img width="1293" alt="Screenshot 2024-01-17 at 19 17 44"
src="https://github.com/paritytech/polkadot-sdk/assets/54316454/fde11280-e25b-4dc3-9dc9-d4b9752f9b7a">

---------

Signed-off-by: Andrei Sandu <andrei-mihail@parity.io>
  • Loading branch information
sandreim authored Jan 25, 2024
1 parent 73fd8cd commit 47e46d1
Show file tree
Hide file tree
Showing 22 changed files with 1,965 additions and 799 deletions.
16 changes: 12 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ members = [
"polkadot/node/gum/proc-macro",
"polkadot/node/jaeger",
"polkadot/node/malus",
"polkadot/node/subsystem-bench",
"polkadot/node/metrics",
"polkadot/node/network/approval-distribution",
"polkadot/node/network/availability-distribution",
Expand Down
18 changes: 14 additions & 4 deletions polkadot/node/subsystem-bench/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,16 @@ polkadot-node-subsystem-types = { path = "../subsystem-types" }
polkadot-node-primitives = { path = "../primitives" }
polkadot-primitives = { path = "../../primitives" }
polkadot-node-network-protocol = { path = "../network/protocol" }
polkadot-availability-recovery = { path = "../network/availability-recovery", features = ["subsystem-benchmarks"] }
polkadot-availability-recovery = { path = "../network/availability-recovery", features=["subsystem-benchmarks"]}
polkadot-availability-distribution = { path = "../network/availability-distribution"}
polkadot-node-core-av-store = { path = "../core/av-store"}
polkadot-node-core-chain-api = { path = "../core/chain-api"}
polkadot-availability-bitfield-distribution = { path = "../network/bitfield-distribution"}
color-eyre = { version = "0.6.1", default-features = false }
polkadot-overseer = { path = "../overseer" }
polkadot-overseer = { path = "../overseer" }
colored = "2.0.4"
assert_matches = "1.5"
async-trait = "0.1.74"
async-trait = "0.1.57"
sp-keystore = { path = "../../../substrate/primitives/keystore" }
sc-keystore = { path = "../../../substrate/client/keystore" }
sp-core = { path = "../../../substrate/primitives/core" }
Expand All @@ -39,14 +43,20 @@ polkadot-erasure-coding = { package = "polkadot-erasure-coding", path = "../../e
log = "0.4.17"
env_logger = "0.9.0"
rand = "0.8.5"
parity-scale-codec = { version = "3.6.1", features = ["derive", "std"] }
# `rand` only supports uniform distribution, we need normal distribution for latency.
rand_distr = "0.4.3"
bitvec="1.0.1"
kvdb-memorydb = "0.13.0"

parity-scale-codec = { version = "3.6.1", features = ["std", "derive"] }
tokio = "1.24.2"
clap-num = "1.0.2"
polkadot-node-subsystem-test-helpers = { path = "../subsystem-test-helpers" }
sp-keyring = { path = "../../../substrate/primitives/keyring" }
sp-application-crypto = { path = "../../../substrate/primitives/application-crypto" }
sc-network = { path = "../../../substrate/client/network" }
sc-service = { path = "../../../substrate/client/service" }
sp-consensus = { path = "../../../substrate/primitives/consensus/common" }
polkadot-node-metrics = { path = "../metrics" }
itertools = "0.11.0"
polkadot-primitives-test-helpers = { path = "../../primitives/test-helpers" }
Expand Down
56 changes: 27 additions & 29 deletions polkadot/node/subsystem-bench/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Subsystem benchmark client

Run parachain consensus stress and performance tests on your development machine.
Run parachain consensus stress and performance tests on your development machine.

## Motivation

Expand Down Expand Up @@ -111,30 +111,28 @@ Commands:
```

Note: `test-sequence` is a special test objective that wraps up an arbitrary number of test objectives. It is typically
used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml).
used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml).

### Standard test options

```
Options:
--network <NETWORK> The type of network to be emulated [default: ideal] [possible
values: ideal, healthy, degraded]
--n-cores <N_CORES> Number of cores to fetch availability for [default: 100]
--n-validators <N_VALIDATORS> Number of validators to fetch chunks from [default: 500]
--min-pov-size <MIN_POV_SIZE> The minimum pov size in KiB [default: 5120]
--max-pov-size <MAX_POV_SIZE> The maximum pov size bytes [default: 5120]
-n, --num-blocks <NUM_BLOCKS> The number of blocks the test is going to run [default: 1]
-p, --peer-bandwidth <PEER_BANDWIDTH> The bandwidth of simulated remote peers in KiB
-b, --bandwidth <BANDWIDTH> The bandwidth of our simulated node in KiB
--peer-error <PEER_ERROR> Simulated connection error ratio [0-100]
--peer-min-latency <PEER_MIN_LATENCY> Minimum remote peer latency in milliseconds [0-5000]
--peer-max-latency <PEER_MAX_LATENCY> Maximum remote peer latency in milliseconds [0-5000]
--profile Enable CPU Profiling with Pyroscope
--pyroscope-url <PYROSCOPE_URL> Pyroscope Server URL [default: http://localhost:4040]
--pyroscope-sample-rate <PYROSCOPE_SAMPLE_RATE> Pyroscope Sample Rate [default: 113]
--cache-misses Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind
must be in the PATH
-h, --help Print help
--network <NETWORK> The type of network to be emulated [default: ideal] [possible values: ideal, healthy,
degraded]
--n-cores <N_CORES> Number of cores to fetch availability for [default: 100]
--n-validators <N_VALIDATORS> Number of validators to fetch chunks from [default: 500]
--min-pov-size <MIN_POV_SIZE> The minimum pov size in KiB [default: 5120]
--max-pov-size <MAX_POV_SIZE> The maximum pov size in KiB [default: 5120]
-n, --num-blocks <NUM_BLOCKS> The number of blocks the test is going to run [default: 1]
-p, --peer-bandwidth <PEER_BANDWIDTH> The bandwidth of emulated remote peers in KiB
-b, --bandwidth <BANDWIDTH> The bandwidth of our node in KiB
--connectivity <CONNECTIVITY> Emulated peer connection ratio [0-100]
--peer-mean-latency <PEER_MEAN_LATENCY> Mean remote peer latency in milliseconds [0-5000]
--peer-latency-std-dev <PEER_LATENCY_STD_DEV> Remote peer latency standard deviation
--profile Enable CPU Profiling with Pyroscope
--pyroscope-url <PYROSCOPE_URL> Pyroscope Server URL [default: http://localhost:4040]
--pyroscope-sample-rate <PYROSCOPE_SAMPLE_RATE> Pyroscope Sample Rate [default: 113]
--cache-misses Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind must be in the PATH
-h, --help Print help
```

These apply to all test objectives, except `test-sequence` which relies on the values being specified in a file.
Expand All @@ -152,8 +150,8 @@ Benchmark availability recovery strategies
Usage: subsystem-bench data-availability-read [OPTIONS]
Options:
-f, --fetch-from-backers Turbo boost AD Read by fetching the full availability data from backers first. Saves CPU
as we don't need to re-construct from chunks. Typically this is only faster if nodes
-f, --fetch-from-backers Turbo boost AD Read by fetching the full availability data from backers first. Saves CPU
as we don't need to re-construct from chunks. Typically this is only faster if nodes
have enough bandwidth
-h, --help Print help
```
Expand Down Expand Up @@ -181,9 +179,9 @@ Let's run an availability read test which will recover availability for 10 cores
node validator network.

```
target/testnet/subsystem-bench --n-cores 10 data-availability-read
[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120,
error = 0, latency = None
target/testnet/subsystem-bench --n-cores 10 data-availability-read
[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120,
latency = None
[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Generating template candidate index=0 pov_size=5242880
[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Created test environment.
[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Pre-generating 10 candidates.
Expand All @@ -196,8 +194,8 @@ node validator network.
[2023-11-28T09:02:07Z INFO subsystem_bench::availability] All blocks processed in 6001ms
[2023-11-28T09:02:07Z INFO subsystem_bench::availability] Throughput: 51200 KiB/block
[2023-11-28T09:02:07Z INFO subsystem_bench::availability] Block time: 6001 ms
[2023-11-28T09:02:07Z INFO subsystem_bench::availability]
[2023-11-28T09:02:07Z INFO subsystem_bench::availability]
Total received from network: 66 MiB
Total sent to network: 58 KiB
Total subsystem CPU usage 4.16s
Expand Down
36 changes: 12 additions & 24 deletions polkadot/node/subsystem-bench/examples/availability_read.yaml
Original file line number Diff line number Diff line change
@@ -1,57 +1,45 @@
TestConfiguration:
# Test 1
- objective: !DataAvailabilityRead
fetch_from_backers: false
fetch_from_backers: true
n_validators: 300
n_cores: 20
min_pov_size: 5120
max_pov_size: 5120
peer_bandwidth: 52428800
bandwidth: 52428800
latency:
min_latency:
secs: 0
nanos: 1000000
max_latency:
secs: 0
nanos: 100000000
error: 3
mean_latency_ms: 100
std_dev: 1
num_blocks: 3
connectivity: 90

# Test 2
- objective: !DataAvailabilityRead
fetch_from_backers: false
fetch_from_backers: true
n_validators: 500
n_cores: 20
min_pov_size: 5120
max_pov_size: 5120
peer_bandwidth: 52428800
bandwidth: 52428800
latency:
min_latency:
secs: 0
nanos: 1000000
max_latency:
secs: 0
nanos: 100000000
error: 3
mean_latency_ms: 100
std_dev: 1
num_blocks: 3
connectivity: 90

# Test 3
- objective: !DataAvailabilityRead
fetch_from_backers: false
fetch_from_backers: true
n_validators: 1000
n_cores: 20
min_pov_size: 5120
max_pov_size: 5120
peer_bandwidth: 52428800
bandwidth: 52428800
latency:
min_latency:
secs: 0
nanos: 1000000
max_latency:
secs: 0
nanos: 100000000
error: 3
mean_latency_ms: 100
std_dev: 1
num_blocks: 3
connectivity: 90
15 changes: 15 additions & 0 deletions polkadot/node/subsystem-bench/examples/availability_write.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
TestConfiguration:
# Test 1kV, 200 cores, max Pov
- objective: DataAvailabilityWrite
n_validators: 1000
n_cores: 200
max_validators_per_core: 5
min_pov_size: 5120
max_pov_size: 5120
peer_bandwidth: 52428800
bandwidth: 52428800
latency:
mean_latency_ms: 30
std_dev: 2.0
connectivity: 75
num_blocks: 3
57 changes: 57 additions & 0 deletions polkadot/node/subsystem-bench/src/availability/av_store_helpers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.

// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.

use super::*;

use polkadot_node_metrics::metrics::Metrics;

use polkadot_node_core_av_store::Config;
use polkadot_node_subsystem_util::database::Database;

use polkadot_node_core_av_store::AvailabilityStoreSubsystem;

/// Column indices of the in-memory key-value database used by the benchmark's availability store.
mod columns {
    /// Total number of columns the in-memory database is created with.
    pub const NUM_COLUMNS: u32 = 2;
    /// Index of the column referenced as `col_data` in the store configuration.
    pub const DATA: u32 = 0;
    /// Index of the column referenced as `col_meta` in the store configuration.
    pub const META: u32 = 1;
}

/// Availability store configuration that maps `col_data` / `col_meta` onto the indices declared
/// in `columns`; passed to `AvailabilityStoreSubsystem::new` by `new_av_store`.
const TEST_CONFIG: Config = Config { col_data: columns::DATA, col_meta: columns::META };

/// Stub sync oracle handed to the availability store in benchmarks.
///
/// It always reports that the node is not major syncing. Querying the offline state is not
/// supported and panics.
struct DumbOracle;

impl sp_consensus::SyncOracle for DumbOracle {
    /// Not supported by this stub: panics with "oh no!" if it is ever called.
    fn is_offline(&self) -> bool {
        unimplemented!("oh no!")
    }

    /// Always `false`: the emulated node is never considered to be major syncing.
    fn is_major_syncing(&self) -> bool {
        false
    }
}

/// Creates an `AvailabilityStoreSubsystem` backed by a fresh in-memory database.
///
/// The subsystem's metrics are registered against `dependencies.registry`, and a [`DumbOracle`]
/// is used as the sync oracle.
///
/// # Panics
///
/// Panics if the metrics cannot be registered in the provided registry.
pub fn new_av_store(dependencies: &TestEnvironmentDependencies) -> AvailabilityStoreSubsystem {
    // Register metrics first so a registration failure aborts before anything else is built.
    let metrics = Metrics::try_register(&dependencies.registry).unwrap();
    let store = test_store();

    AvailabilityStoreSubsystem::new(store, TEST_CONFIG, Box::new(DumbOracle), metrics)
}

/// Builds the in-memory database that backs the benchmark's availability store.
///
/// The database is created with `columns::NUM_COLUMNS` columns and wrapped in a `DbAdapter` so it
/// can be used through the `Database` trait object.
fn test_store() -> Arc<dyn Database> {
    use polkadot_node_subsystem_util::database::kvdb_impl::DbAdapter;

    let in_memory = kvdb_memorydb::create(columns::NUM_COLUMNS);
    // NOTE(review): the second argument presumably lists the columns that need ordered
    // iteration — confirm against `DbAdapter::new`.
    Arc::new(DbAdapter::new(in_memory, &[columns::META]))
}
Loading

0 comments on commit 47e46d1

Please sign in to comment.