diff --git a/run_llama_train.sh b/run_llama_train.sh
index d264c2e2..2749b01d 100755
--- a/run_llama_train.sh
+++ b/run_llama_train.sh
@@ -4,14 +4,26 @@ set -ex
 
 TRAINER_DIR=${1:-/home/$USER/local/torchtrain}
 
-MODEL="debugmodel"
-NGPU=8
-MP=4
+# use envs as local overrides for convenience
+# e.g.
+# LOG_RANK=0,1 NGPU=4 SP=2 ./run_llama_train.sh
+
+MODEL=${MODEL:-"debugmodel"}
+NGPU=${NGPU:-"8"}
+PP=${PP:-"1"}
+SP=${SP:-"1"}
+DP=${DP:-"-1"}
+
+# by default log just rank 0 output,
+LOG_RANK=${LOG_RANK:-0}
+
 # Change this string to a meaningful one to enable checkpoint
-CHECKPOINT_FOLDER=""
+CHECKPOINT_FOLDER=${CHECKPOINT_FOLDER:-""}
 # Please adjust this to a longer interval period. The unit of measurement is in steps.
-CHECKPOINT_INTERVAL=5
+CHECKPOINT_INTERVAL=${CHECKPOINT_INTERVAL:-5}
 
 torchrun --nproc_per_node=${NGPU} \
+--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
 train.py --steps 10 --compile \
+--pp_degree ${PP} --sp_degree ${SP} --dp_degree ${DP} \
 --checkpoint-folder=${CHECKPOINT_FOLDER} --checkpoint-interval=${CHECKPOINT_INTERVAL}
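
For reference (not part of the patch), a rough sketch of how the ${VAR:-default} overrides introduced above are meant to be used from the shell; the variable names and flags are taken directly from the diff, and the note about DP=-1 is an assumption rather than something stated in the patch:

# launch on 4 GPUs with 2-way sequence parallel, logging output from ranks 0 and 1
LOG_RANK=0,1 NGPU=4 SP=2 ./run_llama_train.sh

# with no overrides, every variable falls back to its default:
# NGPU=8, PP=1, SP=1, DP=-1 (DP=-1 presumably lets train.py infer the data-parallel degree)
./run_llama_train.sh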