Skip to content

Commit

Permalink
run an independent job for the head node
Browse files: browse the repository at this point in the history
  • Loading branch information
thayeral committed Nov 4, 2024
1 parent 193b4f8 commit d565d1e
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 26 deletions.
12 changes: 6 additions & 6 deletions src/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,21 +394,21 @@ def main(args=None):
else:
if args.nodes > 1:
gpu_workers = args.nodes * args.gpus
else:
gpu_workers = args.gpus

if args.nodes > 1:
cpu_workers = args.nodes * args.cpus
else:
gpu_workers = args.gpus
cpu_workers = args.cpus

if args.span:
sjob += f" -n {cpu_workers}"
sjob += f' -R "span[ptile={args.cpus}]"'
else:
sjob += f" -n {args.cpus}"
if args.nodes == 1:
sjob += f" -n {args.cpus}"
else:
sjob += f" -n 2"

if args.gpus > 0:
if args.gpus > 0 and args.nodes == 1:
if args.partition == 'gpu_a100':
sjob += f' -gpu "num={args.gpus}:nvlink=yes"'
else:
Expand Down
50 changes: 30 additions & 20 deletions src/ray_lsf_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ dashboard_port=$(getfreeport)
echo "Dashboard will use port: $dashboard_port"
export dashboard_port

############################## START HEAD NODE
############################## GET IPs

head_node=$(cat $LSB_DJOB_HOSTFILE | uniq | head -n1 | awk '{print $1;}')
head_node_ip=$(getent hosts $head_node | awk '{ print $1 }')
Expand All @@ -66,32 +66,42 @@ export head_node
export head_node_ip
export cluster_address

apptainer exec --userns --nv --bind $bind --bind $outdir:$tmpdir $env ./ray_start_cluster.sh -i $head_node_ip -p $port -d $dashboard_port -c $cpus -g $gpus -t $tmpdir &
sleep 10
############################## START CLUSTER

############################## ADD WORKER NODES
if [ "$nodes" -gt 1 ]; then
echo "More than one node is available."

worker_ids=()
num_workers=$((nodes - 1))
for i in $(seq 1 $num_workers)
do
mkdir -p "${outdir}/ray_worker_${i}"
echo "Adding worker: ${outdir}/ray_worker_${i}"
job="bsub -cwd "$(pwd)" -q $LSB_QUEUE -J "${outdir}/ray_worker_${i}" -n $cpus -gpu "num=$gpus:mode=shared" -o "${outdir}/ray_worker_${i}.log" apptainer exec --userns --nv --bind $bind --bind $outdir/ray_worker_${i}:$tmpdir $env ./ray_start_worker.sh -a $cluster_address -c $cpus -g $gpus -t $tmpdir"
echo $job
$job
echo "Starting head node without any GPUs"
apptainer exec --userns --nv --bind $bind --bind $outdir:$tmpdir $env ./ray_start_cluster.sh -i $head_node_ip -p $port -d $dashboard_port -c $cpus -g 0 -t $tmpdir &
sleep 10


jid=$(bjobs -J "${outdir}/ray_worker_${i}" | awk 'NR==2 {print $1;}')
while [ -z "$jid" ]
worker_ids=()
for i in $(seq 1 $nodes)
do
sleep 1
mkdir -p "${outdir}/ray_worker_${i}"
echo "Adding worker: ${outdir}/ray_worker_${i}"
job="bsub -cwd "$(pwd)" -q $LSB_QUEUE -J "${outdir}/ray_worker_${i}" -n $cpus -gpu "num=$gpus:mode=shared" -o "${outdir}/ray_worker_${i}.log" apptainer exec --userns --nv --bind $bind --bind $outdir/ray_worker_${i}:$tmpdir $env ./ray_start_worker.sh -a $cluster_address -c $cpus -g $gpus -t $tmpdir"
echo $job
$job


jid=$(bjobs -J "${outdir}/ray_worker_${i}" | awk 'NR==2 {print $1;}')
while [ -z "$jid" ]
do
sleep 1
jid=$(bjobs -J "${outdir}/ray_worker_${i}" | awk 'NR==2 {print $1;}')
done

worker_ids+=($jid)
echo "Running ray_worker_${i} @ ${jid}"
done

worker_ids+=($jid)
echo "Running ray_worker_${i} @ ${jid}"
done
else
echo "Only one node is available."
apptainer exec --userns --nv --bind $bind --bind $outdir:$tmpdir $env ./ray_start_cluster.sh -i $head_node_ip -p $port -d $dashboard_port -c $cpus -g $gpus -t $tmpdir &
sleep 10
fi


############################## CHECK STATUS

Expand Down

0 comments on commit d565d1e

Please sign in to comment.