Skip to content

Commit

Permalink
WIP: profile workloads
Browse files Browse the repository at this point in the history
  • Loading branch information
fmaste committed Jan 6, 2025
1 parent 68b944b commit 4f593fa
Show file tree
Hide file tree
Showing 16 changed files with 689 additions and 590 deletions.
6 changes: 4 additions & 2 deletions nix/workbench/backend/backend.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ usage_backend() {
wait-pools-stopped RUNDIR
Wait until all pools are stopped
wait-workloads-stopped RUNDIR
Wait until all workloads are stopped
stop-cluster RUNDIR
cleanup-cluster RUNDIR
Wipe cluster state to pristine
Expand All @@ -50,16 +52,16 @@ case "${op}" in
start-tracers ) backend_$WB_BACKEND "$@";;
start-nodes ) backend_$WB_BACKEND "$@";;
start-generator ) backend_$WB_BACKEND "$@";;
start-workloads ) backend_$WB_BACKEND "$@";;
start-healthchecks ) backend_$WB_BACKEND "$@";;
start-latencies ) backend_$WB_BACKEND "$@";;
# Fine grained
start-node ) backend_$WB_BACKEND "$@";;
stop-node ) backend_$WB_BACKEND "$@";;
wait-node ) backend_$WB_BACKEND "$@";;
wait-node-stopped ) backend_$WB_BACKEND "$@";;
get-node-socket-path ) backend_$WB_BACKEND "$@";;
wait-pools-stopped ) backend_$WB_BACKEND "$@";;
wait-latencies-stopped ) backend_$WB_BACKEND "$@";;
wait-workloads-stopped ) backend_$WB_BACKEND "$@";;
# Stop functions
stop-all ) backend_$WB_BACKEND "$@";;
fetch-logs ) backend_$WB_BACKEND "$@";;
Expand Down
21 changes: 10 additions & 11 deletions nix/workbench/backend/nomad-job.nix
Original file line number Diff line number Diff line change
Expand Up @@ -902,28 +902,27 @@ let
}
])
++
# healthcheck
[
## healthcheck start.sh script.
# workloads
(builtins.map (workload:
## workload start.sh script.
{
env = false;
destination = "local/${stateDir}/healthcheck/start.sh";
data = escapeTemplate
profileData.healthcheck-service.start.value;
destination = "local/${stateDir}/workloads/${workload.name}/start.sh";
data = escapeTemplate workload.start.value;
change_mode = "noop";
error_on_missing_key = true;
perms = "744"; # Only for every "start.sh" script. Default: "644"
}
]
) profileData.workloads-service)
++
# latency
# healthcheck
[
## Latency start.sh script.
## healthcheck start.sh script.
{
env = false;
destination = "local/${stateDir}/latency/start.sh";
destination = "local/${stateDir}/healthcheck/start.sh";
data = escapeTemplate
profileData.latency-service.start.value;
profileData.healthcheck-service.start.value;
change_mode = "noop";
error_on_missing_key = true;
perms = "744"; # Only for every "start.sh" script. Default: "644"
Expand Down
557 changes: 295 additions & 262 deletions nix/workbench/backend/nomad.sh

Large diffs are not rendered by default.

40 changes: 22 additions & 18 deletions nix/workbench/backend/nomad/cloud.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,13 @@ backend_nomadcloud() {
backend_nomad wait-pools-stopped 60 "$@"
;;

wait-latencies-stopped )
wait-workloads-stopped )
# It passes the sleep time (in seconds) required argument.
# This time is different between local and cloud backends to avoid
# unnecessary Nomad specific traffic (~99% happens waiting for node-0, the
# first one it waits to stop inside a loop) and at the same time be less
# sensitive to network failures.
backend_nomad wait-latencies-stopped 60 "$@"
backend_nomad wait-workloads-stopped 60 "$@"
;;

fetch-logs )
Expand Down Expand Up @@ -146,12 +146,12 @@ backend_nomadcloud() {
backend_nomad start-generator "$@"
;;

start-healthchecks )
backend_nomad start-healthchecks "$@"
start-workloads )
backend_nomad start-workloads "$@"
;;

start-latencies )
backend_nomad start-latencies "$@"
start-healthchecks )
backend_nomad start-healthchecks "$@"
;;

start-node )
Expand Down Expand Up @@ -998,18 +998,6 @@ fetch-logs-ssh-node() {
local ssh_config_path ssh_command
ssh_config_path="$(wb nomad ssh config)"
ssh_command="ssh -F ${ssh_config_path} -p 32000 -l nobody"
# Download latency(ies) logs. ################################################
##############################################################################
msg "$(blue "Fetching") $(yellow "program \"latency\"") run files from $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
if ! rsync -e "${ssh_command}" -au \
-f'- start.sh' \
"${public_ipv4}":/local/run/current/latency/ \
"${dir}"/latency/"${node}"/
then
node_ok="false"
touch "${dir}"/nomad/"${node}"/download_failed
msg "$(red Error fetching) $(yellow "program \"latency\"") $(red "run files from") $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
fi
# Download healthcheck(s) logs. ##############################################
##############################################################################
msg "$(blue "Fetching") $(yellow "program \"healthcheck\"") run files from $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
Expand All @@ -1022,6 +1010,22 @@ fetch-logs-ssh-node() {
touch "${dir}"/nomad/"${node}"/download_failed
msg "$(red Error fetching) $(yellow "program \"healthcheck\"") $(red "run files from") $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
fi
# Download workload(s) logs. #################################################
##############################################################################
# For every workload
for workload in $(jq_tolist '.workloads | map(.name)' "$dir"/profile.json)
do
msg "$(blue "Fetching") $(yellow "program \"${workload}\" workload") run files from $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
if ! rsync -e "${ssh_command}" -au \
-f'- start.sh' \
"${public_ipv4}":/local/run/current/workloads/"${workload}"/ \
"${dir}"/workloads/"${workload}"/"${node}"/
then
node_ok="false"
touch "${dir}"/nomad/"${node}"/download_failed
msg "$(red Error fetching) $(yellow "program \"${workload}\" workload") $(red "run files from") $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
fi
done
# Download generator logs. ###################################################
##############################################################################
if test "${node}" = "explorer"
Expand Down
12 changes: 6 additions & 6 deletions nix/workbench/backend/nomad/exec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,13 @@ backend_nomadexec() {
backend_nomad wait-pools-stopped 1 "$@"
;;

wait-latencies-stopped )
wait-workloads-stopped )
# It passes the sleep time (in seconds) required argument.
# This time is different between local and cloud backends to avoid
# unnecessary Nomad specific traffic (~99% happens waiting for node-0, the
# first one it waits to stop inside a loop) and at the same time be less
# sensitive to network failures.
backend_nomad wait-latencies-stopped 1 "$@"
backend_nomad wait-workloads-stopped 1 "$@"
;;

# All or clean up everything!
Expand Down Expand Up @@ -107,12 +107,12 @@ backend_nomadexec() {
backend_nomad start-generator "$@"
;;

start-healthchecks )
backend_nomad start-healthchecks "$@"
start-workloads )
backend_nomad start-workloads "$@"
;;

start-latencies )
backend_nomad start-latencies "$@"
start-healthchecks )
backend_nomad start-healthchecks "$@"
;;

start-node )
Expand Down
19 changes: 13 additions & 6 deletions nix/workbench/backend/supervisor-conf.nix
Original file line number Diff line number Diff line change
Expand Up @@ -184,14 +184,18 @@ let
startsecs = 5;
};
}



//
{
"program:latency" = {
(builtins.listToAttrs (builtins.map (workload: {
name = "program:${workload.name}";
value = {
# "command" below assumes "directory" is set accordingly.
directory = "${stateDir}/latency";
directory = "${stateDir}/workloads/${workload.name}";
command = "${command}";
stdout_logfile = "${stateDir}/latency/stdout";
stderr_logfile = "${stateDir}/latency/stderr";
stdout_logfile = "${stateDir}/workloads/${workload.name}/stdout";
stderr_logfile = "${stateDir}/workloads/${workload.name}/stderr";
# Set these values to 0 to indicate an unlimited log size / no rotation.
stdout_logfile_maxbytes = 0;
stderr_logfile_maxbytes = 0;
Expand All @@ -204,7 +208,10 @@ let
# Seconds it needs to stay running to consider the start successful
startsecs = 5;
};
}
}) profileData.workloads))



//
lib.attrsets.optionalAttrs withSsh
{
Expand Down
70 changes: 68 additions & 2 deletions nix/workbench/backend/supervisor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ case "$op" in

local svcs=$dir/profile/node-services.json
local gtor=$dir/profile/generator-service.json
local work=$dir/profile/workloads-service.json
local trac=$dir/profile/tracer-service.json
local hche=$dir/profile/healthcheck-service.json

Expand All @@ -76,6 +77,15 @@ case "$op" in
cp $(jq '."plutus-redeemer"' -r $gtor) "$gen_dir"/plutus-redeemer.json
cp $(jq '."plutus-datum"' -r $gtor) "$gen_dir"/plutus-datum.json

local work_dir="$dir"/workloads
mkdir -p "$work_dir"
for workload in $(jq_tolist 'map(.name)' "$work")
do
mkdir -p "$work_dir"/"${workload}"
cp $(jq "map(select(.name == \"${workload}\"))[0] | .start" -r $work) \
"$work_dir"/"${workload}"/start.sh
done

local trac_dir="$dir"/tracer
mkdir -p "$trac_dir"
cp $(jq '."start"' -r $trac) "$trac_dir"/start.sh
Expand All @@ -84,8 +94,6 @@ case "$op" in
local hche_dir="$dir"/healthcheck
mkdir -p "$hche_dir"
cp $(jq '."start"' -r $hche) "$hche_dir"/start.sh

mkdir -p "$dir"/latency
;;

deploy-genesis )
Expand Down Expand Up @@ -274,6 +282,30 @@ EOF
fi
backend_supervisor save-child-pids "$dir";;

start-workloads )
  # Start every workload named in the run's profile as a supervisord program.
  #
  # Arguments: RUN-DIR - run directory; "$dir"/profile.json is queried with
  #            '.workloads | map(.name)' to obtain the workload names.
  # On the first workload that "supervisorctl start" fails for, the files
  # "$dir"/workloads/<name>/stdout and .../stderr are dumped and the failure
  # is reported through "fatal".
  local usage="USAGE: wb backend $op RUN-DIR"
  local dir=${1:?$usage}; shift

  # No flags are accepted: any "--*" argument is reported as unknown.
  while test $# -gt 0
  do case "$1" in
       --* ) msg "FATAL: unknown flag '$1'"; usage_supervisor;;
       * ) break;; esac; shift; done

  # Keep the loop variable out of the enclosing function's scope.
  local workload
  # NOTE(review): relies on word-splitting of the jq_tolist output, so
  # workload names are assumed to contain no whitespace -- confirm.
  for workload in $(jq_tolist '.workloads | map(.name)' "$dir"/profile.json)
  do
    if ! supervisorctl start "${workload}"
    then progress "supervisor" "$(red fatal: failed to start) $(white "${workload} workload")"
         # The whole report goes to stderr so that the headers and the log
         # contents stay together, and in order, when streams are redirected.
         echo "$(red "${workload}" workload stdout) ----------------------" >&2
         cat "$dir"/workloads/"${workload}"/stdout >&2
         echo "$(red "${workload}" workload stderr) ----------------------" >&2
         cat "$dir"/workloads/"${workload}"/stderr >&2
         echo "$(white -------------------------------------------------)" >&2
         fatal "could not start $(white "${workload} workload")"
    fi
  done
  backend_supervisor save-child-pids "$dir";;

wait-node-stopped )
local usage="USAGE: wb backend $op RUN-DIR NODE"
local dir=${1:?$usage}; shift
Expand Down Expand Up @@ -322,6 +354,40 @@ EOF
fi
;;

wait-workloads-stopped )
  # Wait, one workload at a time, until "supervisorctl status" stops
  # succeeding for every workload listed in "$dir"/profile.json, or until
  # the "$dir"/flag/cluster-stopping flag file appears, whichever is first.
  #
  # Arguments: RUN-DIR - run directory holding profile.json and flag/.
  # Progress (an elapsed-seconds counter rewritten in place with backspaces)
  # is written to stderr. If no stop was requested by the time all workloads
  # are done, this arm creates the cluster-stopping flag itself.
  local usage="USAGE: wb backend $op RUN-DIR"
  local dir=${1:?$usage}; shift

  local start_time=$(date +%s)
  msg_ne "supervisor: waiting until all workloads are stopped: 000000"
  # Keep the loop variable out of the enclosing function's scope.
  local workload
  for workload in $(jq_tolist '.workloads | map(.name)' "$dir"/profile.json)
  do
    # NOTE(review): assumes "supervisorctl status NAME" exits non-zero once
    # the program is no longer running -- confirm for the Supervisor version
    # in use.
    while \
      ! test -f "${dir}"/flag/cluster-stopping \
      && \
      supervisorctl status "${workload}" > /dev/null
    do
      # Erase the previous 6-character counter and print the new one.
      echo -ne "\b\b\b\b\b\b"
      printf "%6d" "$(($(date +%s) - start_time))"
      sleep 1
    done
    if ! test -f "${dir}"/flag/cluster-stopping
    then
      # This workload finished on its own: replace the counter with its name
      # and a fresh counter placeholder for the next workload.
      echo -ne "\b\b\b\b\b\b"
      echo -n "${workload} 000000"
    fi
  done >&2
  # Erase the last counter on the same stream it was written to (stderr).
  echo -ne "\b\b\b\b\b\b" >&2
  local elapsed=$(($(date +%s) - start_time))
  if test -f "${dir}"/flag/cluster-stopping
  then
    echo " Termination requested -- after $(yellow ${elapsed})s" >&2
  else
    touch "${dir}"/flag/cluster-stopping
    echo " All workloads exited -- after $(yellow ${elapsed})s" >&2
  fi
  ;;

stop-all )
local usage="USAGE: wb backend $op RUN-DIR"
local dir=${1:?$usage}; shift
Expand Down
2 changes: 2 additions & 0 deletions nix/workbench/profile/prof0-defaults.jq
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ def era_defaults($era):
}
}

, workloads: []

, node:
{ rts_flags_override: []
, heap_limit: null ## optional: heap limit in MB (translates to RTS flag -M)
Expand Down
Loading

0 comments on commit 4f593fa

Please sign in to comment.