
Commit

improved slurm and download scripts
Niccolo-Ajroldi committed Feb 20, 2024
1 parent b5b70b9 commit d4bf670
Showing 16 changed files with 99 additions and 149 deletions.
12 changes: 6 additions & 6 deletions algorithmic_efficiency/logger_utils.py
@@ -135,10 +135,10 @@ def _get_utilization() -> Dict:
util_data['mem.used'] = memory_util.used
util_data['mem.percent_used'] = memory_util.percent

# Disk
disk_io_counters = psutil.disk_io_counters()
util_data['mem.read_bytes_since_boot'] = disk_io_counters.read_bytes
util_data['mem.write_bytes_since_boot'] = disk_io_counters.write_bytes
# # Disk
# disk_io_counters = psutil.disk_io_counters()
# util_data['mem.read_bytes_since_boot'] = disk_io_counters.read_bytes
# util_data['mem.write_bytes_since_boot'] = disk_io_counters.write_bytes

# Network
net_io_counters = psutil.net_io_counters()
@@ -267,8 +267,8 @@ def get_meta_data(workload: spec.Workload,
meta_data = {}
workload_properties = _get_workload_properties(workload)
meta_data.update(workload_properties)
# utilization_measurements = _get_utilization()
# meta_data.update(utilization_measurements)
utilization_measurements = _get_utilization()
meta_data.update(utilization_measurements)
system_software_info = _get_system_software_info()
meta_data.update(system_software_info)
system_hardware_info = _get_system_hardware_info()
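For reference, a minimal sketch of the kind of utilization snapshot that _get_utilization() assembles with psutil, with the disk I/O counters left out to match the commented-out block above. Only the mem.* keys appear in the diff; the helper name and the other key names are illustrative assumptions, not the exact contents of logger_utils.py.

import psutil
from typing import Dict

def utilization_snapshot() -> Dict:
    """Coarse CPU/memory/network snapshot mirroring the psutil calls in the
    diff above; exact keys in logger_utils.py may differ."""
    util_data = {}

    # CPU (assumed key name)
    util_data['cpu.percent'] = psutil.cpu_percent(interval=None)

    # Memory (keys taken from the diff)
    memory_util = psutil.virtual_memory()
    util_data['mem.used'] = memory_util.used
    util_data['mem.percent_used'] = memory_util.percent

    # Disk I/O counters are skipped, matching the commented-out block above.
    # disk_io_counters = psutil.disk_io_counters()

    # Network (assumed key names)
    net_io_counters = psutil.net_io_counters()
    util_data['net.bytes_sent_since_boot'] = net_io_counters.bytes_sent
    util_data['net.bytes_recv_since_boot'] = net_io_counters.bytes_recv

    return util_data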
8 changes: 1 addition & 7 deletions exp/data_setup/fastmri.sh
@@ -21,10 +21,4 @@ python datasets/dataset_setup.py \
--fastmri_knee_singlecoil_train_url=$train_url \
--fastmri_knee_singlecoil_val_url=$valid_url \
--fastmri_knee_singlecoil_test_url=$test_url \
--interactive_deletion=False

# OR MANUAL DOWNLOAD
# curl -C $train_url --output knee_singlecoil_train.tar.xz
# curl -C $valid_url --output knee_singlecoil_val.tar.xz
# curl -C $test_url --output knee_singlecoil_test.tar.xz
# ...
--interactive_deletion=True
1 change: 1 addition & 0 deletions exp/data_setup/librispeech.sh
@@ -6,5 +6,6 @@ source exp/data_setup/set_env.sh
python3 datasets/dataset_setup.py \
--data_dir $DATA_DIR \
--temp_dir=$TMP_DIR \
--framework=pytorch \
--librispeech \
--interactive_deletion=False
2 changes: 0 additions & 2 deletions exp/data_setup/wmt.sh
@@ -3,8 +3,6 @@
# activate conda env, export DATA_DIR
source exp/data_setup/set_env.sh

export CUDA_VISIBLE_DEVICES=0

python3 datasets/dataset_setup.py \
--data_dir $DATA_DIR \
--temp_dir=$TMP_DIR \
40 changes: 22 additions & 18 deletions exp/slurm/imagenet_resnet.sh
@@ -1,7 +1,10 @@
#!/bin/bash

# add conda TODO: make it more portable!
source ~/miniconda3/etc/profile.d/conda.sh
# temporary
source ~/.bashrc

# # add conda TODO: make it more portable!
# source ~/miniconda3/etc/profile.d/conda.sh

# Activate conda environment TODO: should I use source activate alpe instead?
conda activate alpe
@@ -26,19 +29,20 @@ num_gpu=4
nvidia-smi

# Execute python script
torchrun --redirects 1:0,2:0,3:0 \
--standalone \
--nproc_per_node=$num_gpu \
$CODE_DIR/submission_runner.py \
--workload=$workload \
--framework=pytorch \
--tuning_ruleset=external \
--data_dir=$DATA_DIR/imagenet/pytorch \
--imagenet_v2_data_dir=$DATA_DIR/imagenet/pytorch/imagenet_v2 \
--submission_path=$submission \
--tuning_search_space=$search_space \
--num_tuning_trials=$trials \
--experiment_dir=$EXP_DIR \
--experiment_name=$name \
--use_wandb \
--overwrite
torchrun \
--redirects 1:0,2:0,3:0 \
--standalone \
--nproc_per_node=$num_gpu \
$CODE_DIR/submission_runner.py \
--workload=$workload \
--framework=pytorch \
--tuning_ruleset=external \
--data_dir=$DATA_DIR/imagenet/pytorch \
--imagenet_v2_data_dir=$DATA_DIR/imagenet/pytorch \
--submission_path=$submission \
--tuning_search_space=$search_space \
--num_tuning_trials=$trials \
--experiment_dir=$EXP_DIR \
--experiment_name=$name \
--use_wandb \
--overwrite
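The torchrun launch above relies on the standard elastic-launch contract: one worker process per GPU, with LOCAL_RANK, RANK and WORLD_SIZE exported into each worker's environment. A minimal sketch of the worker-side setup that contract assumes follows; this is the generic PyTorch idiom, not code taken from submission_runner.py.

import os

import torch
import torch.distributed as dist

def setup_worker() -> int:
    """Bind this process to its GPU and join the NCCL process group.
    Sketch only; the actual runner may handle this differently."""
    local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")     # reads RANK/WORLD_SIZE from env
    return local_rank

if __name__ == "__main__":
    rank = setup_worker()
    print(f"worker ready on cuda:{rank} (world size {dist.get_world_size()})")
    dist.destroy_process_group()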
38 changes: 0 additions & 38 deletions exp/slurm/imagenet_resnet_debug.sh

This file was deleted.

12 changes: 4 additions & 8 deletions exp/slurm/imagenet_resnet_sub.sh
@@ -8,13 +8,9 @@
#SBATCH --ntasks 1
#SBATCH --requeue

#SBATCH --cpus-per-task 16
#SBATCH --mem=500000M

# Get node with GPUs
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --constraint="gpu&^gpu-bw"
# the constraint ensures that we are not reserving gpu-bw
# --- 4 GPUs on a full node ---
#SBATCH --gres=gpu:a100:4
#SBATCH --cpus-per-task=72
#SBATCH --mem=500000

srun ~/algorithmic-efficiency/exp/slurm/imagenet_resnet.sh
40 changes: 23 additions & 17 deletions exp/slurm/mnist.sh
@@ -1,39 +1,45 @@
#!/bin/bash -l

# add conda TODO: make it more portable!
source ~/miniconda3/etc/profile.d/conda.sh
# temporary
source ~/.bashrc

# # add conda TODO: make it more portable!
# source ~/miniconda3/etc/profile.d/conda.sh

# Activate conda environment TODO: should I use source activate alpe instead?
conda activate alpe

echo "------ $CONDA_DEFAULT_ENV ------"

# Env vars
export CODE_DIR=~/algorithmic-efficiency
export DATA_DIR=~/data
export EXP_DIR=/ptmp/najroldi/exp/algoperf
# export CUDA_VISIBLE_DEVICES=0

# Job specific vars
workload=mnist
dataset=MNIST
submission='reference_algorithms/development_algorithms/mnist/mnist_pytorch/submission.py'
search_space='reference_algorithms/development_algorithms/mnist/tuning_search_space.json'
trials=1
name="mnist_01"
name="mnist_02"

# Print GPU infos
# nvidia-smi
num_gpu=2

# Execute python script
python3 $CODE_DIR/submission_runner.py \
--workload=$workload \
--framework=pytorch \
--tuning_ruleset=external \
--data_dir=$DATA_DIR/$dataset \
--submission_path=$submission \
--tuning_search_space=$search_space \
--num_tuning_trials=$trials \
--experiment_dir=$EXP_DIR \
--experiment_name=$name \
--overwrite
# python3 \
torchrun --redirects 1:0 \
--standalone \
--nproc_per_node=$num_gpu \
$CODE_DIR/submission_runner.py \
--workload=$workload \
--framework=pytorch \
--tuning_ruleset=external \
--data_dir=$DATA_DIR/$dataset \
--submission_path=$submission \
--tuning_search_space=$search_space \
--num_tuning_trials=$trials \
--experiment_dir=$EXP_DIR \
--experiment_name=$name \
--use_wandb \
--overwrite
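Because this script now launches the MNIST workload through torchrun with num_gpu=2 instead of a plain python3 call, a worker can stay launcher-agnostic by checking the environment variables torchrun injects. A small sketch of that check; the helper name is hypothetical and not part of the repository.

import os

def launched_with_torchrun() -> bool:
    """True when the process was started by torchrun, which sets RANK,
    LOCAL_RANK and WORLD_SIZE; a bare python3 launch leaves them unset."""
    return all(var in os.environ for var in ("RANK", "LOCAL_RANK", "WORLD_SIZE"))

if __name__ == "__main__":
    if launched_with_torchrun():
        print(f"distributed run, world size {os.environ['WORLD_SIZE']}")
    else:
        print("single-process run")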
22 changes: 11 additions & 11 deletions exp/slurm/mnist_sub.sh
@@ -1,23 +1,23 @@
#!/bin/bash -l

#SBATCH --job-name=mnist_01
#SBATCH --job-name=mnist_02
#SBATCH --output=/ptmp/najroldi/logs/algoperf/job_%j.out
#SBATCH --error=/ptmp/najroldi/logs/algoperf/job_%j.err
#

#SBATCH --time=00:10:00
#SBATCH --ntasks 1
#SBATCH --requeue
#

# --- default case: use a single GPU on a shared node ---
#SBATCH --gres=gpu:a100:1
#SBATCH --cpus-per-task=18
#SBATCH --mem=125000
#
# #SBATCH --gres=gpu:a100:1
# #SBATCH --cpus-per-task=18
# #SBATCH --mem=125000

# --- uncomment to use 2 GPUs on a shared node ---
# #SBATCH --gres=gpu:a100:2
# #SBATCH --cpus-per-task=36
# #SBATCH --mem=250000
#
#SBATCH --gres=gpu:a100:2
#SBATCH --cpus-per-task=36
#SBATCH --mem=250000

# --- uncomment to use 4 GPUs on a full node ---
# #SBATCH --gres=gpu:a100:4
# #SBATCH --cpus-per-task=72
42 changes: 0 additions & 42 deletions exp/slurm/test_1.sh

This file was deleted.

File renamed without changes.
File renamed without changes.
9 changes: 9 additions & 0 deletions exp/slurm/tests/prova2.sh
@@ -0,0 +1,9 @@
#!/bin/bash -l

source ~/.bashrc

echo $(which conda)

which $(which ldconfig)

echo "=== $CONDA_DEFAULT_ENV ==="
17 changes: 17 additions & 0 deletions exp/slurm/tests/prova2_sub.sh
@@ -0,0 +1,17 @@
#!/bin/bash

#SBATCH --job-name=prova2
#SBATCH --output=/ptmp/najroldi/logs/algoperf/job_%j.out
#SBATCH --error=/ptmp/najroldi/logs/algoperf/job_%j.err

#SBATCH -D ./
#SBATCH --time=00:03:00
#SBATCH --ntasks 1
#SBATCH --requeue

# --- default case: use a single GPU on a shared node ---
#SBATCH --gres=gpu:a100:1
#SBATCH --cpus-per-task=18
#SBATCH --mem=125000

srun ~/algorithmic-efficiency/exp/slurm/prova2.sh
File renamed without changes.
5 changes: 5 additions & 0 deletions exp/slurm/tests/test.sh
@@ -0,0 +1,5 @@
#!/bin/bash

which ldconfig

which conda
