chore: make system metric as f64 #10052
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Memory Check | ||
on: | ||
# tests must run for a PR to be valid and pass merge queue muster | ||
# on main, we want to know that all commits are passing at a glance, any deviation should help bisecting errors | ||
# the merge run checks should show on master and enable this clear test/passing history | ||
merge_group: | ||
branches: [main, alpha*, beta*, rc*] | ||
pull_request: | ||
branches: ["*"] | ||
env: | ||
ANT_DATA_PATH: /home/runner/.local/share/autonomi | ||
CLIENT_DATA_PATH: /home/runner/.local/share/autonomi/client | ||
NODE_DATA_PATH: /home/runner/.local/share/autonomi/node | ||
RESTART_TEST_NODE_DATA_PATH: /home/runner/.local/share/autonomi/restart_node | ||
jobs: | ||
memory-check: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- name: Checkout code | ||
uses: actions/checkout@v4 | ||
- name: Check we're on the right commit | ||
run: git log -1 --oneline | ||
- name: Install Rust | ||
uses: dtolnay/rust-toolchain@stable | ||
- uses: Swatinem/rust-cache@v2 | ||
continue-on-error: true | ||
- name: install ripgrep | ||
shell: bash | ||
run: sudo apt-get install -y ripgrep | ||
- name: Build binaries | ||
run: cargo build --release --features local --bin antnode --bin ant | ||
timeout-minutes: 30 | ||
- name: Start a local network | ||
uses: maidsafe/ant-local-testnet-action@main | ||
with: | ||
action: start | ||
enable-evm-testnet: true | ||
node-path: target/release/antnode | ||
platform: ubuntu-latest | ||
build: true | ||
- name: Check ANT_PEERS was set | ||
shell: bash | ||
run: echo "The ANT_PEERS variable has been set to $ANT_PEERS" | ||
- name: Start a node instance to be restarted | ||
run: | | ||
mkdir -p $RESTART_TEST_NODE_DATA_PATH | ||
./target/release/antnode \ | ||
--root-dir $RESTART_TEST_NODE_DATA_PATH --log-output-dest $RESTART_TEST_NODE_DATA_PATH --local --rewards-address "0x03B770D9cD32077cC0bF330c13C114a87643B124" & | ||
sleep 10 | ||
env: | ||
ANT_LOG: "all" | ||
- name: Download 95mb file to be uploaded with the safe client | ||
shell: bash | ||
run: wget https://sn-node.s3.eu-west-2.amazonaws.com/the-test-data.zip | ||
- name: export default secret key | ||
run: echo "SECRET_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80" >> $GITHUB_ENV | ||
shell: bash | ||
- name: File upload | ||
run: ./target/release/ant --log-output-dest=data-dir file upload --public "./the-test-data.zip" > ./upload_output 2>&1 | ||
env: | ||
ANT_LOG: "v" | ||
timeout-minutes: 15 | ||
- name: showing the upload terminal output | ||
run: cat upload_output | ||
shell: bash | ||
if: always() | ||
- name: parse address | ||
run: | | ||
UPLOAD_ADDRESS=$(rg "At address: ([0-9a-f]*)" -o -r '$1' ./upload_output) | ||
echo "UPLOAD_ADDRESS=$UPLOAD_ADDRESS" >> $GITHUB_ENV | ||
shell: bash | ||
# Uploading same file using different client shall not incur any payment neither uploads | ||
# Note rg will throw an error directly in case of failed to find a matching pattern. | ||
- name: Start a different client to upload the same file | ||
run: | | ||
pwd | ||
ls -l $ANT_DATA_PATH | ||
mv $CLIENT_DATA_PATH $ANT_DATA_PATH/client_first | ||
ls -l $ANT_DATA_PATH | ||
ls -l $ANT_DATA_PATH/client_first | ||
ls -l $ANT_DATA_PATH/client_first/logs | ||
mkdir $ANT_DATA_PATH/client | ||
ls -l $ANT_DATA_PATH | ||
cp ./the-test-data.zip ./the-test-data_1.zip | ||
./target/release/ant --log-output-dest data-dir file_TYPE upload "" > ./second_upload 2>&1 | ||
enrelease-candidatev: | ||
ANT_LOG: "all" | ||
timeout-minutes: 25 | ||
- name: showing the second upload terminal output | ||
run: cat second_upload | ||
shell: bash | ||
if: always() | ||
- name: Stop the restart node | ||
run: kill $(cat $RESTART_TEST_NODE_DATA_PATH/antnode.pid) | ||
- name: Start the restart node again | ||
run: | | ||
./target/release/antnode \ | ||
--root-dir-type PARESTART_TEST_NODE_DATA_PATH \ | ||
--log-output-dest $RESTART_TEST_NODE_DATA_PATH \ | ||
--local \ | ||
--rewards-address "0x03B770D9cD32077cC0bF330c13C114a87643B124" & | ||
sleep 10 | ||
env: | ||
ANT_LOG: "all" | ||
# Records are encrypted, and seeds will change after restart | ||
# Currently, there will be `Existing record found`, but NO `Existing record loaded` | ||
# Due to the failure on decryption (as different seed used) | ||
- name: Assert we've reloaded some chunks | ||
run: rg "Existing record found" $RESTART_TEST_NODE_DATA_PATH | ||
- name: Wait at least 1min for replication to happen # it is throttled to once/30s. | ||
run: sleep 60 | ||
- name: Verify data replication using rg | ||
shell: bash | ||
timeout-minutes: 1 | ||
# get the counts, then the specific line, and then the digit count only | ||
# then check we have an expected level of replication | ||
run: | | ||
sending_list_count=$(rg "Sending a replication list" $NODE_DATA_PATH -c --stats | \ | ||
rg "(\d+) matches" | rg "\d+" -o) | ||
echo "Sent $sending_list_count replication lists" | ||
received_list_count=$(rg "Received replication list from" $NODE_DATA_PATH -c --stats | \ | ||
rg "(\d+) matches" | rg "\d+" -o) | ||
echo "Received $received_list_count replication lists" | ||
fetching_attempt_count=$(rg "FetchingKeysForReplication" $NODE_DATA_PATH -c --stats | \ | ||
rg "(\d+) matches" | rg "\d+" -o) | ||
echo "Carried out $fetching_attempt_count fetching attempts" | ||
if: always() | ||
- name: File Download | ||
run: > | ||
./target/release/ant | ||
--log-output-dest=data-dir file download ${{ env.UPLOAD_ADDRESS }} ./downloaded_resources | ||
env: | ||
ANT_LOG: "v" | ||
timeout-minutes: 2 | ||
- name: Check nodes running | ||
shell: bash | ||
timeout-minutes: 1 | ||
continue-on-error: true | ||
run: pgrep antnode | wc -l | ||
if: always() | ||
- name: Stop the local network and upload logs | ||
if: always() | ||
uses: maidsafe/ant-local-testnet-action@main | ||
with: | ||
action: stop | ||
log_file_prefix: safe_test_logs_memcheck | ||
platform: ubuntu-latest | ||
build: true | ||
- name: Check node memory usage | ||
shell: bash | ||
# The resources file and churning chunk_size we upload may change, and with it mem consumption. | ||
# This is set to a value high enough to allow for some variation depending on | ||
# resources and node location in the network, but hopefully low enough to catch | ||
# any wild memory issues | ||
# Any changes to this value should be carefully considered and tested! | ||
# As we have a bootstrap node acting as an access point for churning nodes and client, | ||
# The memory usage here will be significantly higher here than in the benchmark test, | ||
# where we don't have a bootstrap node. | ||
run: | | ||
node_peak_mem_limit_mb="300" # mb | ||
peak_mem_usage=$( | ||
rg '"memory_used_mb":[^,]*' $NODE_DATA_PATH/*/logs/* -o --no-line-number --no-filename | | ||
awk -F':' '/"memory_used_mb":/{print $2}' | | ||
sort -n | | ||
tail -n 1 | ||
) | ||
echo "Node memory usage: $peak_mem_usage MB" | ||
if (( $(echo "$peak_mem_usage > $node_peak_mem_limit_mb" | bc -l) )); then | ||
echo "Node memory usage exceeded threshold: $peak_mem_usage MB" | ||
exit 1 | ||
fi | ||
if: always() | ||
- name: Check client memory usage | ||
shell: bash | ||
# limits here are lower that benchmark tests as there is less going on. | ||
run: | | ||
client_peak_mem_limit_mb="1024" # mb | ||
client_avg_mem_limit_mb="512" # mb | ||
peak_mem_usage=$( | ||
rg '"memory_used_mb":[^,]*' $CLIENT_DATA_PATH/logs --glob ant.* -o --no-line-number --no-filename | | ||
awk -F':' '/"memory_used_mb":/{print $2}' | | ||
sort -n | | ||
tail -n 1 | ||
) | ||
echo "Peak memory usage: $peak_mem_usage MB" | ||
if (( $(echo "$peak_mem_usage > $client_peak_mem_limit_mb" | bc -l) )); then | ||
echo "Client peak memory usage exceeded threshold: $client_peak_mem_limit_mb MB" | ||
exit 1 | ||
fi | ||
total_mem=$( | ||
rg '"memory_used_mb":[^,]*' $CLIENT_DATA_PATH/logs --glob ant.* -o --no-line-number --no-filename | | ||
awk -F':' '/"memory_used_mb":/ {sum += $2} END {printf "%.0f\n", sum}' | ||
) | ||
num_of_times=$( | ||
rg "\"memory_used_mb\"" $CLIENT_DATA_PATH/logs --glob ant.* -c --stats | | ||
rg "(\d+) matches" | | ||
rg "\d+" -o | ||
) | ||
echo "num_of_times: $num_of_times" | ||
echo "Total memory is: $total_mem" | ||
average_mem=$(($total_mem/$(($num_of_times)))) | ||
echo "Average memory is: $average_mem" | ||
if (( $(echo "$average_mem > $client_avg_mem_limit_mb" | bc -l) )); then | ||
echo "Client average memory usage exceeded threshold: $client_avg_mem_limit_mb MB" | ||
exit 1 | ||
fi | ||
# Logging of handling time is on Trace level, | ||
# meanwhile the local_network startup tool sets the logging level on Debug. | ||
# | ||
# - name: Check node swarm_driver handling statistics | ||
# shell: bash | ||
# # With the latest improvements, swarm_driver will be in high chance | ||
# # has no super long handling (longer than 1s). | ||
# # As the `rg` cmd will fail the shell directly if no entry find, | ||
# # hence not covering it. | ||
# # Be aware that if do need to looking for handlings longer than second, it shall be: | ||
# # rg "SwarmCmd handled in [^m,µ,n]*s:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -c --stats | ||
# run: | | ||
# num_of_times=$( | ||
# rg "SwarmCmd handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -c --stats | | ||
# rg "(\d+) matches" | | ||
# rg "\d+" -o | ||
# ) | ||
# echo "Number of long cmd handling times: $num_of_times" | ||
# total_long_handling_ms=$( | ||
# rg "SwarmCmd handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -o --no-line-number --no-filename | | ||
# awk -F' |ms:' '{sum += $4} END {printf "%.0f\n", sum}' | ||
# ) | ||
# echo "Total cmd long handling time is: $total_long_handling_ms ms" | ||
# average_handling_ms=$(($total_long_handling_ms/$(($num_of_times)))) | ||
# echo "Average cmd long handling time is: $average_handling_ms ms" | ||
# total_long_handling=$(($total_long_handling_ms)) | ||
# total_num_of_times=$(($num_of_times)) | ||
# num_of_times=$( | ||
# rg "SwarmEvent handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -c --stats | | ||
# rg "(\d+) matches" | | ||
# rg "\d+" -o | ||
# ) | ||
# echo "Number of long event handling times: $num_of_times" | ||
# total_long_handling_ms=$( | ||
# rg "SwarmEvent handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -o --no-line-number --no-filename | | ||
# awk -F' |ms:' '{sum += $4} END {printf "%.0f\n", sum}' | ||
# ) | ||
# echo "Total event long handling time is: $total_long_handling_ms ms" | ||
# average_handling_ms=$(($total_long_handling_ms/$(($num_of_times)))) | ||
# echo "Average event long handling time is: $average_handling_ms ms" | ||
# total_long_handling=$(($total_long_handling_ms+$total_long_handling)) | ||
# total_num_of_times=$(($num_of_times+$total_num_of_times)) | ||
# average_handling_ms=$(($total_long_handling/$(($total_num_of_times)))) | ||
# echo "Total swarm_driver long handling times is: $total_num_of_times" | ||
# echo "Total swarm_driver long handling duration is: $total_long_handling ms" | ||
# echo "Total average swarm_driver long handling duration is: $average_handling_ms ms" | ||
- name: Move restart_node log to the working directory | ||
run: | | ||
ls -l $RESTART_TEST_NODE_DATA_PATH | ||
mv $RESTART_TEST_NODE_DATA_PATH/antnode.log ./restart_node.log | ||
continue-on-error: true | ||
if: always() | ||
timeout-minutes: 1 | ||
- name: Upload restart_node log | ||
uses: actions/upload-artifact@main | ||
with: | ||
name: memory_check_restart_node_log | ||
path: restart_node.log | ||
continue-on-error: true | ||
if: always() |