Skip to content

fix(node): avoid unnecessary dial back #10019

fix(node): avoid unnecessary dial back

fix(node): avoid unnecessary dial back #10019

Workflow file for this run

name: Memory Check
on:
# tests must run for a PR to be valid and pass merge queue muster
# on main, we want to know that all commits are passing at a glance, any deviation should help bisecting errors
# the merge run checks should show on master and enable this clear test/passing history
merge_group:
branches: [main, alpha*, beta*, rc*]
pull_request:
branches: ["*"]
env:
ANT_DATA_PATH: /home/runner/.local/share/autonomi
CLIENT_DATA_PATH: /home/runner/.local/share/autonomi/client
NODE_DATA_PATH: /home/runner/.local/share/autonomi/node
RESTART_TEST_NODE_DATA_PATH: /home/runner/.local/share/autonomi/restart_node
jobs:
memory-check:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Check we're on the right commit
run: git log -1 --oneline
- name: Install Rust
uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
continue-on-error: true
- name: install ripgrep
shell: bash
run: sudo apt-get install -y ripgrep
- name: Build binaries
run: cargo build --release --features local --bin antnode --bin ant
timeout-minutes: 30
- name: Start a local network
uses: maidsafe/ant-local-testnet-action@main
with:
action: start
enable-evm-testnet: true
node-path: target/release/antnode
platform: ubuntu-latest
build: true
- name: Check ANT_PEERS was set
shell: bash
run: echo "The ANT_PEERS variable has been set to $ANT_PEERS"
- name: Start a node instance to be restarted
run: |
mkdir -p $RESTART_TEST_NODE_DATA_PATH
./target/release/antnode \
--root-dir $RESTART_TEST_NODE_DATA_PATH --log-output-dest $RESTART_TEST_NODE_DATA_PATH --local --rewards-address "0x03B770D9cD32077cC0bF330c13C114a87643B124" &
sleep 10
env:
ANT_LOG: "all"
- name: Download 95mb file to be uploaded with the safe client
shell: bash
run: wget https://sn-node.s3.eu-west-2.amazonaws.com/the-test-data.zip
- name: export default secret key
run: echo "SECRET_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80" >> $GITHUB_ENV
shell: bash
- name: File upload
run: ./target/release/ant --log-output-dest=data-dir file upload --public "./the-test-data.zip" > ./upload_output 2>&1
env:
ANT_LOG: "v"
timeout-minutes: 15
- name: showing the upload terminal output
run: cat upload_output
shell: bash
if: always()
- name: parse address
run: |
UPLOAD_ADDRESS=$(rg "At address: ([0-9a-f]*)" -o -r '$1' ./upload_output)
echo "UPLOAD_ADDRESS=$UPLOAD_ADDRESS" >> $GITHUB_ENV
shell: bash
# Uploading same file using different client shall not incur any payment neither uploads
# Note rg will throw an error directly in case of failed to find a matching pattern.
- name: Start a different client to upload the same file
run: |
pwd
ls -l $ANT_DATA_PATH
mv $CLIENT_DATA_PATH $ANT_DATA_PATH/client_first
ls -l $ANT_DATA_PATH
ls -l $ANT_DATA_PATH/client_first
ls -l $ANT_DATA_PATH/client_first/logs
mkdir $ANT_DATA_PATH/client
ls -l $ANT_DATA_PATH
cp ./the-test-data.zip ./the-test-data_1.zip
./target/release/ant --log-output-dest data-dir file_TYPE upload "" > ./second_upload 2>&1
enrelease-candidatev:

Check failure on line 103 in .github/workflows/memcheck.yml

View workflow run for this annotation

GitHub Actions / Memory Check

Invalid workflow file

The workflow is not valid. .github/workflows/memcheck.yml (Line: 103, Col: 9): Unexpected value 'enrelease-candidatev'
ANT_LOG: "all"
timeout-minutes: 25
- name: showing the second upload terminal output
run: cat second_upload
shell: bash
if: always()
- name: Stop the restart node
run: kill $(cat $RESTART_TEST_NODE_DATA_PATH/antnode.pid)
- name: Start the restart node again
run: |
./target/release/antnode \
--root-dir-type PARESTART_TEST_NODE_DATA_PATH \
--log-output-dest $RESTART_TEST_NODE_DATA_PATH \
--local \
--rewards-address "0x03B770D9cD32077cC0bF330c13C114a87643B124" &
sleep 10
env:
ANT_LOG: "all"
# Records are encrypted, and seeds will change after restart
# Currently, there will be `Existing record found`, but NO `Existing record loaded`
# Due to the failure on decryption (as different seed used)
- name: Assert we've reloaded some chunks
run: rg "Existing record found" $RESTART_TEST_NODE_DATA_PATH
- name: Wait at least 1min for replication to happen # it is throttled to once/30s.
run: sleep 60
- name: Verify data replication using rg
shell: bash
timeout-minutes: 1
# get the counts, then the specific line, and then the digit count only
# then check we have an expected level of replication
run: |
sending_list_count=$(rg "Sending a replication list" $NODE_DATA_PATH -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "Sent $sending_list_count replication lists"
received_list_count=$(rg "Received replication list from" $NODE_DATA_PATH -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "Received $received_list_count replication lists"
fetching_attempt_count=$(rg "FetchingKeysForReplication" $NODE_DATA_PATH -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "Carried out $fetching_attempt_count fetching attempts"
if: always()
- name: File Download
run: >
./target/release/ant
--log-output-dest=data-dir file download ${{ env.UPLOAD_ADDRESS }} ./downloaded_resources
env:
ANT_LOG: "v"
timeout-minutes: 2
- name: Check nodes running
shell: bash
timeout-minutes: 1
continue-on-error: true
run: pgrep antnode | wc -l
if: always()
- name: Stop the local network and upload logs
if: always()
uses: maidsafe/ant-local-testnet-action@main
with:
action: stop
log_file_prefix: safe_test_logs_memcheck
platform: ubuntu-latest
build: true
- name: Check node memory usage
shell: bash
# The resources file and churning chunk_size we upload may change, and with it mem consumption.
# This is set to a value high enough to allow for some variation depending on
# resources and node location in the network, but hopefully low enough to catch
# any wild memory issues
# Any changes to this value should be carefully considered and tested!
# As we have a bootstrap node acting as an access point for churning nodes and client,
# The memory usage here will be significantly higher here than in the benchmark test,
# where we don't have a bootstrap node.
run: |
node_peak_mem_limit_mb="300" # mb
peak_mem_usage=$(
rg '"memory_used_mb":[^,]*' $NODE_DATA_PATH/*/logs/* -o --no-line-number --no-filename |
awk -F':' '/"memory_used_mb":/{print $2}' |
sort -n |
tail -n 1
)
echo "Node memory usage: $peak_mem_usage MB"
if (( $(echo "$peak_mem_usage > $node_peak_mem_limit_mb" | bc -l) )); then
echo "Node memory usage exceeded threshold: $peak_mem_usage MB"
exit 1
fi
if: always()
- name: Check client memory usage
shell: bash
# limits here are lower that benchmark tests as there is less going on.
run: |
client_peak_mem_limit_mb="1024" # mb
client_avg_mem_limit_mb="512" # mb
peak_mem_usage=$(
rg '"memory_used_mb":[^,]*' $CLIENT_DATA_PATH/logs --glob ant.* -o --no-line-number --no-filename |
awk -F':' '/"memory_used_mb":/{print $2}' |
sort -n |
tail -n 1
)
echo "Peak memory usage: $peak_mem_usage MB"
if (( $(echo "$peak_mem_usage > $client_peak_mem_limit_mb" | bc -l) )); then
echo "Client peak memory usage exceeded threshold: $client_peak_mem_limit_mb MB"
exit 1
fi
total_mem=$(
rg '"memory_used_mb":[^,]*' $CLIENT_DATA_PATH/logs --glob ant.* -o --no-line-number --no-filename |
awk -F':' '/"memory_used_mb":/ {sum += $2} END {printf "%.0f\n", sum}'
)
num_of_times=$(
rg "\"memory_used_mb\"" $CLIENT_DATA_PATH/logs --glob ant.* -c --stats |
rg "(\d+) matches" |
rg "\d+" -o
)
echo "num_of_times: $num_of_times"
echo "Total memory is: $total_mem"
average_mem=$(($total_mem/$(($num_of_times))))
echo "Average memory is: $average_mem"
if (( $(echo "$average_mem > $client_avg_mem_limit_mb" | bc -l) )); then
echo "Client average memory usage exceeded threshold: $client_avg_mem_limit_mb MB"
exit 1
fi
# Logging of handling time is on Trace level,
# meanwhile the local_network startup tool sets the logging level on Debug.
#
# - name: Check node swarm_driver handling statistics
# shell: bash
# # With the latest improvements, swarm_driver will be in high chance
# # has no super long handling (longer than 1s).
# # As the `rg` cmd will fail the shell directly if no entry find,
# # hence not covering it.
# # Be aware that if do need to looking for handlings longer than second, it shall be:
# # rg "SwarmCmd handled in [^m,µ,n]*s:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -c --stats
# run: |
# num_of_times=$(
# rg "SwarmCmd handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -c --stats |
# rg "(\d+) matches" |
# rg "\d+" -o
# )
# echo "Number of long cmd handling times: $num_of_times"
# total_long_handling_ms=$(
# rg "SwarmCmd handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -o --no-line-number --no-filename |
# awk -F' |ms:' '{sum += $4} END {printf "%.0f\n", sum}'
# )
# echo "Total cmd long handling time is: $total_long_handling_ms ms"
# average_handling_ms=$(($total_long_handling_ms/$(($num_of_times))))
# echo "Average cmd long handling time is: $average_handling_ms ms"
# total_long_handling=$(($total_long_handling_ms))
# total_num_of_times=$(($num_of_times))
# num_of_times=$(
# rg "SwarmEvent handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -c --stats |
# rg "(\d+) matches" |
# rg "\d+" -o
# )
# echo "Number of long event handling times: $num_of_times"
# total_long_handling_ms=$(
# rg "SwarmEvent handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -o --no-line-number --no-filename |
# awk -F' |ms:' '{sum += $4} END {printf "%.0f\n", sum}'
# )
# echo "Total event long handling time is: $total_long_handling_ms ms"
# average_handling_ms=$(($total_long_handling_ms/$(($num_of_times))))
# echo "Average event long handling time is: $average_handling_ms ms"
# total_long_handling=$(($total_long_handling_ms+$total_long_handling))
# total_num_of_times=$(($num_of_times+$total_num_of_times))
# average_handling_ms=$(($total_long_handling/$(($total_num_of_times))))
# echo "Total swarm_driver long handling times is: $total_num_of_times"
# echo "Total swarm_driver long handling duration is: $total_long_handling ms"
# echo "Total average swarm_driver long handling duration is: $average_handling_ms ms"
- name: Move restart_node log to the working directory
run: |
ls -l $RESTART_TEST_NODE_DATA_PATH
mv $RESTART_TEST_NODE_DATA_PATH/antnode.log ./restart_node.log
continue-on-error: true
if: always()
timeout-minutes: 1
- name: Upload restart_node log
uses: actions/upload-artifact@main
with:
name: memory_check_restart_node_log
path: restart_node.log
continue-on-error: true
if: always()