Skip to content

Commit

Permalink
add some flags to sub runner, control memory, max pct steps + scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Niccolo-Ajroldi committed Nov 16, 2024
1 parent adc5ea9 commit 68735ea
Show file tree
Hide file tree
Showing 8 changed files with 334 additions and 0 deletions.
42 changes: 42 additions & 0 deletions script/jax/a100/condor.sub
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# HTCondor submit description: JAX run on 8 GPUs of type A100-SXM4-80GB.
# Queues $(num_jobs) jobs; every job runs $(EXE) with the positional
# arguments assembled in the "Command line" section.

# --- Paths ---
LOGS_DIR = /fast/najroldi/logs/algoperf
EXE      = /home/najroldi/algorithmic-efficiency/script/jax/a100/auto_run_array.sh

# --- Number of jobs created by the queue statement at the bottom ---
num_jobs = 8

# --- Per-job settings ---
# The job's process index is forwarded as the first argument.
# NOTE(review): presumably the executable maps this index to a workload -- confirm in auto_run_array.sh.
workload_or_id    = $(Process)
# Alternative: pin one workload by name instead of using the index.
# workload_or_id  = imagenet_vit
framework         = jax
submission        = prize_qualification_baselines/external_tuning/jax_nadamw_full_budget.py
search_space      = prize_qualification_baselines/external_tuning/tuning_search_space.json
name              = a100_yesTF32_04
study             = 1
num_tuning_trials = 1
rng_seed          = 96

# --- Command line ---
# Values are passed positionally to the executable, in exactly this order.
executable = $(EXE)
arguments  = $(workload_or_id) $(framework) $(submission) $(search_space) $(name) $(study) $(num_tuning_trials) $(rng_seed)

# --- Resources ---
# request_memory carries no unit suffix; HTCondor reads a bare number as MiB.
request_gpus   = 8
request_cpus   = 24
request_memory = 1000000
requirements   = (TARGET.CUDADeviceName == "NVIDIA A100-SXM4-80GB")
# Alternative: match the 40GB variant instead.
# requirements = (TARGET.CUDADeviceName == "NVIDIA A100-SXM4-40GB")

# --- Log files (one set per job, keyed by cluster and process id) ---
output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
error  = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
log    = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log

queue $(num_jobs)
41 changes: 41 additions & 0 deletions script/jax/v100/condor.sub
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# HTCondor submit description: JAX run on 8 GPUs of type Tesla V100-SXM2-32GB.
# Queues $(num_jobs) jobs; every job runs $(EXE) with the positional
# arguments assembled in the "Command line" section.

# --- Paths ---
LOGS_DIR = /fast/najroldi/logs/algoperf
EXE      = /home/najroldi/algorithmic-efficiency/script/jax/v100/auto_run_array.sh

# --- Number of jobs created by the queue statement at the bottom ---
num_jobs = 8

# --- Per-job settings ---
# The job's process index is forwarded as the first argument.
# NOTE(review): presumably the executable maps this index to a workload -- confirm in auto_run_array.sh.
workload_or_id    = $(Process)
# Alternative: pin one workload by name instead of using the index.
# workload_or_id  = imagenet_vit
framework         = jax
submission        = prize_qualification_baselines/external_tuning/jax_nadamw_full_budget.py
search_space      = prize_qualification_baselines/external_tuning/tuning_search_space.json
name              = v100_04
study             = 1
num_tuning_trials = 1
rng_seed          = 96

# --- Command line ---
# Values are passed positionally to the executable, in exactly this order.
executable = $(EXE)
arguments  = $(workload_or_id) $(framework) $(submission) $(search_space) $(name) $(study) $(num_tuning_trials) $(rng_seed)

# --- Resources ---
# request_memory carries no unit suffix; HTCondor reads a bare number as MiB.
request_gpus   = 8
request_cpus   = 24
request_memory = 700000
requirements   = (TARGET.CUDADeviceName == "Tesla V100-SXM2-32GB")

# --- Log files (one set per job, keyed by cluster and process id) ---
output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
error  = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
log    = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log

queue $(num_jobs)
45 changes: 45 additions & 0 deletions script/pytorch/2xa100/condor.sub
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# HTCondor submit description: PyTorch run on 2 GPUs of type A100-SXM4-40GB.
# Queues $(num_jobs) jobs; every job runs $(EXE) with the positional
# arguments assembled in the "Command line" section.

# --- Paths ---
LOGS_DIR = /fast/najroldi/logs/algoperf
EXE      = /home/najroldi/algorithmic-efficiency/script/pytorch/2xa100/auto_run_array.sh

# --- Number of jobs created by the queue statement at the bottom ---
num_jobs = 8

# --- Per-job settings ---
# The job's process index is forwarded as the first argument.
# NOTE(review): presumably the executable maps this index to a workload -- confirm in auto_run_array.sh.
workload_or_id    = $(Process)
# Alternative: pin one workload by name instead of using the index.
# workload_or_id  = criteo1tb
framework         = pytorch
submission        = prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py
search_space      = prize_qualification_baselines/external_tuning/tuning_search_space.json
name              = a10040GB_x2_noTF32_01
study             = 1
num_tuning_trials = 1
rng_seed          = 96
# Flags forwarded as the 9th and 10th arguments.
# NOTE(review): 0 presumably means "disabled" for both (the run name says noTF32) -- confirm in auto_run_array.sh.
allow_tf_32       = 0
halve_cuda_mem    = 0

# --- Command line ---
# Values are passed positionally to the executable, in exactly this order.
executable = $(EXE)
arguments  = $(workload_or_id) $(framework) $(submission) $(search_space) $(name) $(study) $(num_tuning_trials) $(rng_seed) $(allow_tf_32) $(halve_cuda_mem)

# --- Resources ---
# request_memory carries no unit suffix; HTCondor reads a bare number as MiB.
request_gpus   = 2
request_cpus   = 12
request_memory = 250000
requirements   = (TARGET.CUDADeviceName == "NVIDIA A100-SXM4-40GB")

# --- Log files (one set per job, keyed by cluster and process id) ---
output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
error  = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
log    = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log

queue $(num_jobs)
45 changes: 45 additions & 0 deletions script/pytorch/4xa100/condor.sub
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# HTCondor submit description: PyTorch run on 4 GPUs of type A100-SXM4-40GB.
# Queues $(num_jobs) jobs; every job runs $(EXE) with the positional
# arguments assembled in the "Command line" section.

# --- Paths ---
LOGS_DIR = /fast/najroldi/logs/algoperf
EXE      = /home/najroldi/algorithmic-efficiency/script/pytorch/4xa100/auto_run_array.sh

# --- Number of jobs created by the queue statement at the bottom ---
num_jobs = 8

# --- Per-job settings ---
# The job's process index is forwarded as the first argument.
# NOTE(review): presumably the executable maps this index to a workload -- confirm in auto_run_array.sh.
workload_or_id    = $(Process)
# Alternative: pin one workload by name instead of using the index.
# workload_or_id  = imagenet_resnet
framework         = pytorch
submission        = prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py
search_space      = prize_qualification_baselines/external_tuning/tuning_search_space.json
name              = a100_x4_yesTF32_10
study             = 1
num_tuning_trials = 1
rng_seed          = 96
# Extra knobs forwarded as the 9th and 10th arguments.
# NOTE(review): 1 presumably enables TF32 (the run name says yesTF32) -- confirm in auto_run_array.sh.
allow_tf_32       = 1
eval_num_workers  = 4

# --- Command line ---
# Values are passed positionally to the executable, in exactly this order.
executable = $(EXE)
arguments  = $(workload_or_id) $(framework) $(submission) $(search_space) $(name) $(study) $(num_tuning_trials) $(rng_seed) $(allow_tf_32) $(eval_num_workers)

# --- Resources ---
# request_memory carries no unit suffix; HTCondor reads a bare number as MiB.
request_gpus   = 4
request_cpus   = 36
request_memory = 700000
requirements   = (TARGET.CUDADeviceName == "NVIDIA A100-SXM4-40GB")

# --- Log files (one set per job, keyed by cluster and process id) ---
output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
error  = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
log    = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log

queue $(num_jobs)
43 changes: 43 additions & 0 deletions script/pytorch/8xa100/condor.sub
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# HTCondor submit description: PyTorch run on 8 GPUs of type A100-SXM4-40GB.
# Queues $(num_jobs) jobs; every job runs $(EXE) with the positional
# arguments assembled in the "Command line" section.

# --- Paths ---
LOGS_DIR = /fast/najroldi/logs/algoperf
EXE      = /home/najroldi/algorithmic-efficiency/script/pytorch/8xa100/auto_run_array.sh

# --- Number of jobs created by the queue statement at the bottom ---
num_jobs = 8

# --- Per-job settings ---
# The job's process index is forwarded as the first argument.
# NOTE(review): presumably the executable maps this index to a workload -- confirm in auto_run_array.sh.
workload_or_id    = $(Process)
# Alternative: pin one workload by name instead of using the index.
# workload_or_id  = imagenet_vit
framework         = pytorch
submission        = prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py
search_space      = prize_qualification_baselines/external_tuning/tuning_search_space.json
name              = a100_x8_noTF32_10
study             = 1
num_tuning_trials = 1
rng_seed          = 96
# Flag forwarded as the 9th argument.
# NOTE(review): 0 presumably disables TF32 (the run name says noTF32) -- confirm in auto_run_array.sh.
allow_tf_32       = 0

# --- Command line ---
# Values are passed positionally to the executable, in exactly this order.
executable = $(EXE)
arguments  = $(workload_or_id) $(framework) $(submission) $(search_space) $(name) $(study) $(num_tuning_trials) $(rng_seed) $(allow_tf_32)

# --- Resources ---
# request_memory carries no unit suffix; HTCondor reads a bare number as MiB.
request_gpus   = 8
request_cpus   = 36
request_memory = 700000
requirements   = (TARGET.CUDADeviceName == "NVIDIA A100-SXM4-40GB")

# --- Log files (one set per job, keyed by cluster and process id) ---
output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
error  = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
log    = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log

queue $(num_jobs)
41 changes: 41 additions & 0 deletions script/pytorch/v100/condor.sub
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# HTCondor submit description: PyTorch run on 8 GPUs of type Tesla V100-SXM2-32GB.
# Queues $(num_jobs) jobs; every job runs $(EXE) with the positional
# arguments assembled in the "Command line" section.

# --- Paths ---
LOGS_DIR = /fast/najroldi/logs/algoperf
EXE      = /home/najroldi/algorithmic-efficiency/script/pytorch/v100/auto_run_array.sh

# --- Number of jobs created by the queue statement at the bottom ---
num_jobs = 8

# --- Per-job settings ---
# The job's process index is forwarded as the first argument.
# NOTE(review): presumably the executable maps this index to a workload -- confirm in auto_run_array.sh.
workload_or_id    = $(Process)
# Alternative: pin one workload by name instead of using the index.
# workload_or_id  = imagenet_resnet
framework         = pytorch
submission        = prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py
search_space      = prize_qualification_baselines/external_tuning/tuning_search_space.json
name              = v100_10
study             = 1
num_tuning_trials = 1
rng_seed          = 96

# --- Command line ---
# Values are passed positionally to the executable, in exactly this order.
executable = $(EXE)
arguments  = $(workload_or_id) $(framework) $(submission) $(search_space) $(name) $(study) $(num_tuning_trials) $(rng_seed)

# --- Resources ---
# request_memory carries no unit suffix; HTCondor reads a bare number as MiB.
request_gpus   = 8
request_cpus   = 36
request_memory = 700000
requirements   = (TARGET.CUDADeviceName == "Tesla V100-SXM2-32GB")

# --- Log files (one set per job, keyed by cluster and process id) ---
output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
error  = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
log    = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log

queue $(num_jobs)
45 changes: 45 additions & 0 deletions script/pytorch/v100_test/condor.sub
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# HTCondor submit description: single PyTorch test job on 8 GPUs of type
# Tesla V100-SXM2-32GB. Queues $(num_jobs) job(s); each runs $(EXE) with the
# positional arguments assembled in the "Command line" section.

# --- Paths ---
LOGS_DIR = /fast/najroldi/logs/algoperf
EXE      = /home/najroldi/algorithmic-efficiency/script/pytorch/v100_test/auto_run_array.sh

# --- Number of jobs created by the queue statement at the bottom ---
num_jobs = 1

# --- Per-job settings ---
# The workload is pinned by name here rather than derived from the process index.
# Alternative: forward the job's process index instead.
# workload_or_id  = $(Process)
workload_or_id    = imagenet_resnet
framework         = pytorch
submission        = prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py
search_space      = prize_qualification_baselines/external_tuning/tuning_search_space.json
name              = v100_test_05
study             = 1
num_tuning_trials = 1
rng_seed          = 96
# Extra knobs forwarded as the 9th and 10th arguments.
# NOTE(review): how the executable uses them is not visible here -- confirm in auto_run_array.sh.
eval_workers      = 2
omp_threads       = 2

# --- Command line ---
# Values are passed positionally to the executable, in exactly this order.
executable = $(EXE)
arguments  = $(workload_or_id) $(framework) $(submission) $(search_space) $(name) $(study) $(num_tuning_trials) $(rng_seed) $(eval_workers) $(omp_threads)

# --- Resources ---
# request_memory carries no unit suffix; HTCondor reads a bare number as MiB.
request_gpus   = 8
request_cpus   = 36
request_memory = 700000
requirements   = (TARGET.CUDADeviceName == "Tesla V100-SXM2-32GB")

# --- Log files (one set per job, keyed by cluster and process id) ---
output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
error  = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
log    = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log

queue $(num_jobs)
Loading

0 comments on commit 68735ea

Please sign in to comment.