Commit

mnist check slurm
Niccolo-Ajroldi committed Feb 20, 2024
1 parent 4ce6efd commit cd36e76
Showing 6 changed files with 86 additions and 21 deletions.
4 changes: 2 additions & 2 deletions algorithmic_efficiency/logger_utils.py
@@ -267,8 +267,8 @@ def get_meta_data(workload: spec.Workload,
meta_data = {}
workload_properties = _get_workload_properties(workload)
meta_data.update(workload_properties)
utilization_measurements = _get_utilization()
meta_data.update(utilization_measurements)
# utilization_measurements = _get_utilization()
# meta_data.update(utilization_measurements)
system_software_info = _get_system_software_info()
meta_data.update(system_software_info)
system_hardware_info = _get_system_hardware_info()
14 changes: 14 additions & 0 deletions exp/slurm/check1.py
@@ -0,0 +1,14 @@
import psutil

def check_disk_io_counters_support():
try:
disk_io = psutil.disk_io_counters()
if disk_io is None:
print("disk_io_counters() is not supported on this machine.")
else:
print("disk_io_counters() is supported on this machine.")
print("Disk I/O stats:", disk_io)
except Exception as e:
print(f"An error occurred: {e}")

check_disk_io_counters_support()
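
One way to run this check on a cluster node (a sketch, assuming the repository is checked out at ~/algorithmic-efficiency and psutil is available in the active environment) would be:

    srun --time=00:02:00 python3 ~/algorithmic-efficiency/exp/slurm/check1.py

psutil documents that disk_io_counters() returns None when no disks can be found, which is exactly the case the script reports; this appears to be why the utilization measurements were commented out in logger_utils.py above.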
4 changes: 2 additions & 2 deletions exp/slurm/mnist.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash -l

# add conda TODO: make it more portable!
source ~/miniconda3/etc/profile.d/conda.sh
@@ -12,6 +12,7 @@ echo "------ $CONDA_DEFAULT_ENV ------"
export CODE_DIR=~/algorithmic-efficiency
export DATA_DIR=~/data
export EXP_DIR=/ptmp/najroldi/exp/algoperf
# export CUDA_VISIBLE_DEVICES=0

# Job specific vars
workload=mnist
@@ -35,5 +36,4 @@ python3 $CODE_DIR/submission_runner.py \
--num_tuning_trials=$trials \
--experiment_dir=$EXP_DIR \
--experiment_name=$name \
--use_wandb \
--overwrite
37 changes: 26 additions & 11 deletions exp/slurm/mnist_sub.sh
@@ -1,19 +1,34 @@
#!/bin/bash
#!/bin/bash -l

#SBATCH --job-name=mnist_01
#SBATCH --output=/ptmp/najroldi/logs/algoperf/job_%j.out
#SBATCH --error=/ptmp/najroldi/logs/algoperf/job_%j.err

#SBATCH --time=00:05:00
#
#SBATCH --time=00:10:00
#SBATCH --ntasks 1
#SBATCH --requeue
#
# --- default case: use a single GPU on a shared node ---
#SBATCH --gres=gpu:a100:1
#SBATCH --cpus-per-task=18
#SBATCH --mem=125000
#
# --- uncomment to use 2 GPUs on a shared node ---
# #SBATCH --gres=gpu:a100:2
# #SBATCH --cpus-per-task=36
# #SBATCH --mem=250000
#
# --- uncomment to use 4 GPUs on a full node ---
# #SBATCH --gres=gpu:a100:4
# #SBATCH --cpus-per-task=72
# #SBATCH --mem=500000

#SBATCH --cpus-per-task 2
#SBATCH --mem=1000M

# Get node with GPUs
#SBATCH --partition=gpu
#SBATCH --gres=gpu:1
# the last constraint ensures that we are not reserving gpu-bw
srun ~/algorithmic-efficiency/exp/slurm/mnist.sh

srun ~/algorithmic-efficiency/exp/slurm/mnist.sh
## TODO: do this better!
# -l
# cpus?
# request a100
# nvidia mps
# --- is this even useful????? ---
# #SBATCH --partition=gpu
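
A minimal way to submit this batch script and follow the resulting job (a sketch, assuming the log and repository paths used above) would be:

    sbatch ~/algorithmic-efficiency/exp/slurm/mnist_sub.sh
    squeue -u $USER                                        # confirm the job is pending or running
    tail -f /ptmp/najroldi/logs/algoperf/job_<jobid>.out   # follow stdout once it starts

Here <jobid> is a placeholder for the id printed by sbatch.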
42 changes: 42 additions & 0 deletions exp/slurm/test_1.sh
@@ -0,0 +1,42 @@
#!/bin/bash -l


# add conda TODO: make it more portable!
source ~/miniconda3/etc/profile.d/conda.sh

# Activate conda environment TODO: should I use source activate alpe instead?
conda activate alpe

echo "------ $CONDA_DEFAULT_ENV ------"

# Env vars
export CODE_DIR=~/algorithmic-efficiency
export DATA_DIR=~/data
export EXP_DIR=~/exp/algoperf
# export EXP_DIR=/ptmp/najroldi/exp/algoperf
export CUDA_VISIBLE_DEVICES=0

# Job specific vars
workload=mnist
dataset=MNIST
submission='reference_algorithms/development_algorithms/mnist/mnist_pytorch/submission.py'
search_space='reference_algorithms/development_algorithms/mnist/tuning_search_space.json'
trials=1
name="mnist_01"

# Print GPU infos
# nvidia-smi

# Execute python script
python3 $CODE_DIR/submission_runner.py \
--workload=$workload \
--framework=pytorch \
--tuning_ruleset=external \
--data_dir=$DATA_DIR/$dataset \
--submission_path=$submission \
--tuning_search_space=$search_space \
--num_tuning_trials=$trials \
--experiment_dir=$EXP_DIR \
--experiment_name=$name \
--use_wandb \
--overwrite
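
Unlike mnist_sub.sh, this script carries no #SBATCH directives and pins CUDA_VISIBLE_DEVICES itself, so it reads like a script for interactive test runs. One way to try it (a sketch; the resource flags are assumptions, not taken from the commit) would be:

    srun --gres=gpu:a100:1 --cpus-per-task=18 --mem=125000 --time=00:10:00 \
        bash -l ~/algorithmic-efficiency/exp/slurm/test_1.sh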
6 changes: 0 additions & 6 deletions submission_runner.py
@@ -513,14 +513,8 @@ def score_submission_on_workload(workload: spec.Workload,
# If the user specifies the eval batch size, use the provided one.
global_eval_batch_size = submission_module.get_eval_batch_size(
workload_name)
# print(" ####### global_eval_batch_size = {}".format(global_eval_batch_size))
# logging.info(' ####### global_eval_batch_size = %d', global_eval_batch_size)
# raise ValueError("STOP")
else:
global_eval_batch_size = workload.eval_batch_size
# print(" ####### global_eval_batch_size = {}".format(global_eval_batch_size))
# logging.info(' ####### global_eval_batch_size = %d', global_eval_batch_size)
# raise ValueError("STOP")
if global_eval_batch_size % n_gpus != 0:
raise ValueError(
f'The global eval batch size ({global_eval_batch_size}) has to be '