diff --git a/script/jax/a100/condor.sub b/script/jax/a100/condor.sub
new file mode 100644
index 000000000..1745a814b
--- /dev/null
+++ b/script/jax/a100/condor.sub
@@ -0,0 +1,42 @@
+# Define variables
+LOGS_DIR=/fast/najroldi/logs/algoperf
+EXE=/home/najroldi/algorithmic-efficiency/script/jax/a100/auto_run_array.sh
+
+num_jobs=8
+
+# Job specific vars
+workload_or_id=$(Process)
+# workload_or_id=imagenet_vit
+framework=jax
+submission=prize_qualification_baselines/external_tuning/jax_nadamw_full_budget.py
+search_space=prize_qualification_baselines/external_tuning/tuning_search_space.json
+name=a100_yesTF32_04
+study=1
+num_tuning_trials=1
+rng_seed=96
+
+# Args
+executable = $(EXE)
+arguments = \
+    $(workload_or_id) \
+    $(framework) \
+    $(submission) \
+    $(search_space) \
+    $(name) \
+    $(study) \
+    $(num_tuning_trials) \
+    $(rng_seed)
+
+# Logs
+error = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
+output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
+log = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log
+
+# Specs
+request_memory = 1000000
+request_cpus = 24
+request_gpus = 8
+requirements = (TARGET.CUDADeviceName == "NVIDIA A100-SXM4-80GB")
+# requirements = (TARGET.CUDADeviceName == "NVIDIA A100-SXM4-40GB")
+
+queue $(num_jobs)
\ No newline at end of file
diff --git a/script/jax/v100/condor.sub b/script/jax/v100/condor.sub
new file mode 100644
index 000000000..a172967ce
--- /dev/null
+++ b/script/jax/v100/condor.sub
@@ -0,0 +1,41 @@
+# Define variables
+LOGS_DIR=/fast/najroldi/logs/algoperf
+EXE=/home/najroldi/algorithmic-efficiency/script/jax/v100/auto_run_array.sh
+
+num_jobs=8
+
+# Job specific vars
+workload_or_id=$(Process)
+# workload_or_id=imagenet_vit
+framework=jax
+submission=prize_qualification_baselines/external_tuning/jax_nadamw_full_budget.py
+search_space=prize_qualification_baselines/external_tuning/tuning_search_space.json
+name=v100_04
+study=1
+num_tuning_trials=1
+rng_seed=96
+
+# Args
+executable = $(EXE)
+arguments = \
+    $(workload_or_id) \
+    $(framework) \
+    $(submission) \
+    $(search_space) \
+    $(name) \
+    $(study) \
+    $(num_tuning_trials) \
+    $(rng_seed)
+
+# Logs
+error = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
+output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
+log = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log
+
+# Specs
+request_memory = 700000
+request_cpus = 24
+request_gpus = 8
+requirements = (TARGET.CUDADeviceName == "Tesla V100-SXM2-32GB")
+
+queue $(num_jobs)
\ No newline at end of file
diff --git a/script/pytorch/2xa100/condor.sub b/script/pytorch/2xa100/condor.sub
new file mode 100644
index 000000000..e6b1c8455
--- /dev/null
+++ b/script/pytorch/2xa100/condor.sub
@@ -0,0 +1,45 @@
+# Define variables
+LOGS_DIR=/fast/najroldi/logs/algoperf
+EXE=/home/najroldi/algorithmic-efficiency/script/pytorch/2xa100/auto_run_array.sh
+
+num_jobs=8
+
+# Job specific vars
+workload_or_id=$(Process)
+# workload_or_id=criteo1tb
+framework=pytorch
+submission=prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py
+search_space=prize_qualification_baselines/external_tuning/tuning_search_space.json
+name=a10040GB_x2_noTF32_01
+study=1
+num_tuning_trials=1
+rng_seed=96
+allow_tf_32=0
+halve_cuda_mem=0
+
+# Args
+executable = $(EXE)
+arguments = \
+    $(workload_or_id) \
+    $(framework) \
+    $(submission) \
+    $(search_space) \
+    $(name) \
+    $(study) \
+    $(num_tuning_trials) \
+    $(rng_seed) \
+    $(allow_tf_32) \
+    $(halve_cuda_mem)
+
+# Logs
+error = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
+output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
+log = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log
+
+# Specs
+request_memory = 250000
+request_cpus = 12
+request_gpus = 2
+requirements = (TARGET.CUDADeviceName == "NVIDIA A100-SXM4-40GB")
+
+queue $(num_jobs)
\ No newline at end of file
diff --git a/script/pytorch/4xa100/condor.sub b/script/pytorch/4xa100/condor.sub
new file mode 100644
index 000000000..ff2e3c39b
--- /dev/null
+++ b/script/pytorch/4xa100/condor.sub
@@ -0,0 +1,45 @@
+# Define variables
+LOGS_DIR=/fast/najroldi/logs/algoperf
+EXE=/home/najroldi/algorithmic-efficiency/script/pytorch/4xa100/auto_run_array.sh
+
+num_jobs=8
+
+# Job specific vars
+workload_or_id=$(Process)
+# workload_or_id=imagenet_resnet
+framework=pytorch
+submission=prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py
+search_space=prize_qualification_baselines/external_tuning/tuning_search_space.json
+name=a100_x4_yesTF32_10
+study=1
+num_tuning_trials=1
+rng_seed=96
+allow_tf_32=1
+eval_num_workers=4
+
+# Args
+executable = $(EXE)
+arguments = \
+    $(workload_or_id) \
+    $(framework) \
+    $(submission) \
+    $(search_space) \
+    $(name) \
+    $(study) \
+    $(num_tuning_trials) \
+    $(rng_seed) \
+    $(allow_tf_32) \
+    $(eval_num_workers)
+
+# Logs
+error = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
+output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
+log = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log
+
+# Specs
+request_memory = 700000
+request_cpus = 36
+request_gpus = 4
+requirements = (TARGET.CUDADeviceName == "NVIDIA A100-SXM4-40GB")
+
+queue $(num_jobs)
\ No newline at end of file
diff --git a/script/pytorch/8xa100/condor.sub b/script/pytorch/8xa100/condor.sub
new file mode 100644
index 000000000..4eb11b415
--- /dev/null
+++ b/script/pytorch/8xa100/condor.sub
@@ -0,0 +1,43 @@
+# Define variables
+LOGS_DIR=/fast/najroldi/logs/algoperf
+EXE=/home/najroldi/algorithmic-efficiency/script/pytorch/8xa100/auto_run_array.sh
+
+num_jobs=8
+
+# Job specific vars
+workload_or_id=$(Process)
+# workload_or_id=imagenet_vit
+framework=pytorch
+submission=prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py
+search_space=prize_qualification_baselines/external_tuning/tuning_search_space.json
+name=a100_x8_noTF32_10
+study=1
+num_tuning_trials=1
+rng_seed=96
+allow_tf_32=0
+
+# Args
+executable = $(EXE)
+arguments = \
+    $(workload_or_id) \
+    $(framework) \
+    $(submission) \
+    $(search_space) \
+    $(name) \
+    $(study) \
+    $(num_tuning_trials) \
+    $(rng_seed) \
+    $(allow_tf_32)
+
+# Logs
+error = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
+output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
+log = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log
+
+# Specs
+request_memory = 700000
+request_cpus = 36
+request_gpus = 8
+requirements = (TARGET.CUDADeviceName == "NVIDIA A100-SXM4-40GB")
+
+queue $(num_jobs)
\ No newline at end of file
diff --git a/script/pytorch/v100/condor.sub b/script/pytorch/v100/condor.sub
new file mode 100644
index 000000000..2d14bcd2f
--- /dev/null
+++ b/script/pytorch/v100/condor.sub
@@ -0,0 +1,41 @@
+# Define variables
+LOGS_DIR=/fast/najroldi/logs/algoperf
+EXE=/home/najroldi/algorithmic-efficiency/script/pytorch/v100/auto_run_array.sh
+
+num_jobs=8
+
+# Job specific vars
+workload_or_id=$(Process)
+# workload_or_id=imagenet_resnet
+framework=pytorch
+submission=prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py
+search_space=prize_qualification_baselines/external_tuning/tuning_search_space.json
+name=v100_10
+study=1
+num_tuning_trials=1
+rng_seed=96
+
+# Args
+executable = $(EXE)
+arguments = \
+    $(workload_or_id) \
+    $(framework) \
+    $(submission) \
+    $(search_space) \
+    $(name) \
+    $(study) \
+    $(num_tuning_trials) \
+    $(rng_seed)
+
+# Logs
+error = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
+output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
+log = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log
+
+# Specs
+request_memory = 700000
+request_cpus = 36
+request_gpus = 8
+requirements = (TARGET.CUDADeviceName == "Tesla V100-SXM2-32GB")
+
+queue $(num_jobs)
\ No newline at end of file
diff --git a/script/pytorch/v100_test/condor.sub b/script/pytorch/v100_test/condor.sub
new file mode 100644
index 000000000..7803590d1
--- /dev/null
+++ b/script/pytorch/v100_test/condor.sub
@@ -0,0 +1,45 @@
+# Define variables
+LOGS_DIR=/fast/najroldi/logs/algoperf
+EXE=/home/najroldi/algorithmic-efficiency/script/pytorch/v100_test/auto_run_array.sh
+
+num_jobs=1
+
+# Job specific vars
+# workload_or_id=$(Process)
+workload_or_id=imagenet_resnet
+framework=pytorch
+submission=prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py
+search_space=prize_qualification_baselines/external_tuning/tuning_search_space.json
+name=v100_test_05
+study=1
+num_tuning_trials=1
+rng_seed=96
+eval_workers=2
+omp_threads=2
+
+# Args
+executable = $(EXE)
+arguments = \
+    $(workload_or_id) \
+    $(framework) \
+    $(submission) \
+    $(search_space) \
+    $(name) \
+    $(study) \
+    $(num_tuning_trials) \
+    $(rng_seed) \
+    $(eval_workers) \
+    $(omp_threads)
+
+# Logs
+error = $(LOGS_DIR)/err/job.$(Cluster).$(Process).err
+output = $(LOGS_DIR)/out/job.$(Cluster).$(Process).out
+log = $(LOGS_DIR)/log/job.$(Cluster).$(Process).log
+
+# Specs
+request_memory = 700000
+request_cpus = 36
+request_gpus = 8
+requirements = (TARGET.CUDADeviceName == "Tesla V100-SXM2-32GB")
+
+queue $(num_jobs)
\ No newline at end of file
diff --git a/submission_runner.py b/submission_runner.py
index 551173bf5..60e736e9d 100644
--- a/submission_runner.py
+++ b/submission_runner.py
@@ -133,6 +133,9 @@
 flags.DEFINE_integer('max_global_steps',
                      None,
                      'Maximum number of update steps.')
+flags.DEFINE_float('max_pct_of_global_steps',
+                   None,
+                   'Maximum fraction of the workload step hint to train for.')
 flags.DEFINE_boolean(
     'overwrite',
     False,
@@ -160,6 +163,14 @@
     'Number of workers for ImageNet PyTorch evaluation data loaders.'
     'WARNING: Setting pytorch_eval_num_workers != 0, will result '
     'in incorrect evals currently, see issues/732.')
+flags.DEFINE_boolean(
+    'halve_CUDA_mem',
+    False,
+    'Restrict each process to half of the available GPU memory.')
+flags.DEFINE_boolean(
+    'allow_tf32',
+    False,
+    'Allow TF32 matmuls and cuDNN kernels on Ampere GPUs.')
 FLAGS = flags.FLAGS
 USE_PYTORCH_DDP, RANK, DEVICE, N_GPUS = pytorch_setup()
 
@@ -205,6 +216,7 @@ def train_once(
     rng: spec.RandomState,
     profiler: Profiler,
     max_global_steps: int = None,
+    max_pct_of_global_steps: float = None,
     log_dir: Optional[str] = None,
     save_checkpoints: Optional[bool] = True
 ) -> Tuple[spec.Timing, Dict[str, Any]]:
@@ -365,6 +377,10 @@
     global_step += 1
     if (max_global_steps is not None) and (global_step == max_global_steps):
       train_state['training_complete'] = True
+    # (nico): train for a fixed pct of step_hint
+    if (max_pct_of_global_steps is not None) and \
+        (global_step / workload.step_hint >= max_pct_of_global_steps):
+      train_state['training_complete'] = True
 
     train_step_end_time = get_time()
 
@@ -498,6 +514,7 @@ def score_submission_on_workload(workload: spec.Workload,
                                  tuning_ruleset: str,
                                  profiler: Optional[Profiler] = None,
                                  max_global_steps: Optional[int] = None,
+                                 max_pct_of_global_steps: Optional[float] = None,
                                  imagenet_v2_data_dir: Optional[str] = None,
                                  tuning_search_space: Optional[str] = None,
                                  num_tuning_trials: Optional[int] = None,
@@ -595,6 +612,7 @@
           rng,
           profiler,
           max_global_steps,
+          max_pct_of_global_steps,
           tuning_dir_name,
           save_checkpoints=save_checkpoints,)
       all_timings[hi] = timing
@@ -631,6 +649,19 @@
 
 
 def main(_):
+
+  if FLAGS.framework == 'pytorch':
+
+    if FLAGS.halve_CUDA_mem:
+      torch.cuda.set_per_process_memory_fraction(0.5, device=DEVICE)
+
+    if FLAGS.allow_tf32:
+      torch.backends.cuda.matmul.allow_tf32 = True
+      torch.backends.cudnn.allow_tf32 = True
+    else:
+      torch.backends.cuda.matmul.allow_tf32 = False
+      torch.backends.cudnn.allow_tf32 = False
+
   if FLAGS.profile:
     profiler = Profiler()
   else:
@@ -687,6 +718,7 @@
       tuning_ruleset=FLAGS.tuning_ruleset,
       profiler=profiler,
       max_global_steps=FLAGS.max_global_steps,
+      max_pct_of_global_steps=FLAGS.max_pct_of_global_steps,
       imagenet_v2_data_dir=FLAGS.imagenet_v2_data_dir,
       tuning_search_space=FLAGS.tuning_search_space,
      num_tuning_trials=FLAGS.num_tuning_trials,
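For illustration only, the standalone sketch below (not part of the diff) shows what the new PyTorch-side switches do and how the max_pct_of_global_steps stopping rule behaves. The step_hint value and the flag settings are made-up examples; in a real run they come from the workload and the command-line flags.

# Illustrative sketch, not part of the patch.
import torch

allow_tf32 = True              # mirrors --allow_tf32
halve_cuda_mem = False         # mirrors --halve_CUDA_mem
max_pct_of_global_steps = 0.1  # mirrors --max_pct_of_global_steps

# TF32 is a global PyTorch setting for matmuls and cuDNN convolutions.
torch.backends.cuda.matmul.allow_tf32 = allow_tf32
torch.backends.cudnn.allow_tf32 = allow_tf32

# Optionally cap this process at half of the GPU memory (needs a CUDA device).
if halve_cuda_mem and torch.cuda.is_available():
  torch.cuda.set_per_process_memory_fraction(0.5, device=torch.device('cuda:0'))

# Early-stop rule from train_once: training is marked complete once the
# current step reaches the requested fraction of the workload's step hint.
step_hint = 10_000  # made-up value; the real one is workload.step_hint
for global_step in range(1, step_hint + 1):
  if (max_pct_of_global_steps is not None and
      global_step / step_hint >= max_pct_of_global_steps):
    print(f'training_complete at step {global_step}')  # stops at step 1000
    break

The auto_run_array.sh wrappers referenced by the submit files are assumed to forward the condor variables as the corresponding submission_runner.py flags; the wrappers themselves are not part of this diff.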