diff --git a/algorithmic_efficiency/logger_utils.py b/algorithmic_efficiency/logger_utils.py
index b7bde226a..813af1ab6 100644
--- a/algorithmic_efficiency/logger_utils.py
+++ b/algorithmic_efficiency/logger_utils.py
@@ -267,8 +267,8 @@ def get_meta_data(workload: spec.Workload,
   meta_data = {}
   workload_properties = _get_workload_properties(workload)
   meta_data.update(workload_properties)
-  utilization_measurements = _get_utilization()
-  meta_data.update(utilization_measurements)
+  # utilization_measurements = _get_utilization()
+  # meta_data.update(utilization_measurements)
   system_software_info = _get_system_software_info()
   meta_data.update(system_software_info)
   system_hardware_info = _get_system_hardware_info()
diff --git a/exp/slurm/check1.py b/exp/slurm/check1.py
new file mode 100644
index 000000000..35b008177
--- /dev/null
+++ b/exp/slurm/check1.py
@@ -0,0 +1,14 @@
+import psutil
+
+def check_disk_io_counters_support():
+  try:
+    disk_io = psutil.disk_io_counters()
+    if disk_io is None:
+      print("disk_io_counters() is not supported on this machine.")
+    else:
+      print("disk_io_counters() is supported on this machine.")
+      print("Disk I/O stats:", disk_io)
+  except Exception as e:
+    print(f"An error occurred: {e}")
+
+check_disk_io_counters_support()
diff --git a/exp/slurm/mnist.sh b/exp/slurm/mnist.sh
index 7a680808c..2708de055 100755
--- a/exp/slurm/mnist.sh
+++ b/exp/slurm/mnist.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -l
 
 # add conda TODO: make it more portable!
 source ~/miniconda3/etc/profile.d/conda.sh
@@ -12,6 +12,7 @@ echo "------ $CONDA_DEFAULT_ENV ------"
 export CODE_DIR=~/algorithmic-efficiency
 export DATA_DIR=~/data
 export EXP_DIR=/ptmp/najroldi/exp/algoperf
+# export CUDA_VISIBLE_DEVICES=0
 
 # Job specific vars
 workload=mnist
@@ -35,5 +36,4 @@ python3 $CODE_DIR/submission_runner.py \
   --num_tuning_trials=$trials \
   --experiment_dir=$EXP_DIR \
   --experiment_name=$name \
-  --use_wandb \
   --overwrite
\ No newline at end of file
diff --git a/exp/slurm/mnist_sub.sh b/exp/slurm/mnist_sub.sh
index d0866c338..41e7d8df8 100644
--- a/exp/slurm/mnist_sub.sh
+++ b/exp/slurm/mnist_sub.sh
@@ -1,19 +1,34 @@
-#!/bin/bash
+#!/bin/bash -l
 
 #SBATCH --job-name=mnist_01
 #SBATCH --output=/ptmp/najroldi/logs/algoperf/job_%j.out
 #SBATCH --error=/ptmp/najroldi/logs/algoperf/job_%j.err
-
-#SBATCH --time=00:05:00
+#
+#SBATCH --time=00:10:00
 #SBATCH --ntasks 1
 #SBATCH --requeue
+#
+# --- default case: use a single GPU on a shared node ---
+#SBATCH --gres=gpu:a100:1
+#SBATCH --cpus-per-task=18
+#SBATCH --mem=125000
+#
+# --- uncomment to use 2 GPUs on a shared node ---
+# #SBATCH --gres=gpu:a100:2
+# #SBATCH --cpus-per-task=36
+# #SBATCH --mem=250000
+#
+# --- uncomment to use 4 GPUs on a full node ---
+# #SBATCH --gres=gpu:a100:4
+# #SBATCH --cpus-per-task=72
+# #SBATCH --mem=500000
 
-#SBATCH --cpus-per-task 2
-#SBATCH --mem=1000M
-
-# Get node with GPUs
-#SBATCH --partition=gpu
-#SBATCH --gres=gpu:1
-# the last constraint ensures that we are not reserving gpu-bw
+srun ~/algorithmic-efficiency/exp/slurm/mnist.sh
 
-srun ~/algorithmic-efficiency/exp/slurm/mnist.sh
\ No newline at end of file
+## DO THIS BETTER!!!!
+# -l
+# cpus?
+# request a100
+# nvidia mps
+# --- is this even useful????? ---
+# #SBATCH --partition=gpu
diff --git a/exp/slurm/test_1.sh b/exp/slurm/test_1.sh
new file mode 100755
index 000000000..60d7e42dc
--- /dev/null
+++ b/exp/slurm/test_1.sh
@@ -0,0 +1,42 @@
+#!/bin/bash -l
+
+
+# add conda TODO: make it more portable!
+source ~/miniconda3/etc/profile.d/conda.sh
+
+# Activate conda environment TODO: should I use source activate alpe instead?
+conda activate alpe
+
+echo "------ $CONDA_DEFAULT_ENV ------"
+
+# Env vars
+export CODE_DIR=~/algorithmic-efficiency
+export DATA_DIR=~/data
+export EXP_DIR=~/exp/algoperf
+# export EXP_DIR=/ptmp/najroldi/exp/algoperf
+export CUDA_VISIBLE_DEVICES=0
+
+# Job specific vars
+workload=mnist
+dataset=MNIST
+submission='reference_algorithms/development_algorithms/mnist/mnist_pytorch/submission.py'
+search_space='reference_algorithms/development_algorithms/mnist/tuning_search_space.json'
+trials=1
+name="mnist_01"
+
+# Print GPU infos
+# nvidia-smi
+
+# Execute python script
+python3 $CODE_DIR/submission_runner.py \
+  --workload=$workload \
+  --framework=pytorch \
+  --tuning_ruleset=external \
+  --data_dir=$DATA_DIR/$dataset \
+  --submission_path=$submission \
+  --tuning_search_space=$search_space \
+  --num_tuning_trials=$trials \
+  --experiment_dir=$EXP_DIR \
+  --experiment_name=$name \
+  --use_wandb \
+  --overwrite
\ No newline at end of file
diff --git a/submission_runner.py b/submission_runner.py
index 98267741e..305689fa9 100644
--- a/submission_runner.py
+++ b/submission_runner.py
@@ -513,14 +513,8 @@ def score_submission_on_workload(workload: spec.Workload,
     # If the user specifies the eval batch size, use the provided one.
     global_eval_batch_size = submission_module.get_eval_batch_size(
         workload_name)
-    # print(" ####### global_eval_batch_size = {}".format(global_eval_batch_size))
-    # logging.info(' ####### global_eval_batch_size = %d', global_eval_batch_size)
-    # raise ValueError("STOP")
   else:
     global_eval_batch_size = workload.eval_batch_size
-    # print(" ####### global_eval_batch_size = {}".format(global_eval_batch_size))
-    # logging.info(' ####### global_eval_batch_size = %d', global_eval_batch_size)
-    # raise ValueError("STOP")
   if global_eval_batch_size % n_gpus != 0:
     raise ValueError(
         f'The global eval batch size ({global_eval_batch_size}) has to be '