Skip to content

Commit

Permalink
lawa_queue tested on ogbg
Browse files Browse the repository at this point in the history
  • Loading branch information
Niccolo-Ajroldi committed Nov 17, 2024
1 parent 8bab9b3 commit 6c7e69d
Show file tree
Hide file tree
Showing 8 changed files with 861 additions and 40 deletions.
4 changes: 3 additions & 1 deletion algorithmic_efficiency/logger_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,9 @@ def __init__(self,
self._tb_metric_writer = metric_writers.create_default_writer(events_dir)
if wandb is not None and self.use_wandb:
wandb.init(
dir=events_dir, tags=[flags.FLAGS.workload, flags.FLAGS.framework])
project='algoperf_lawa',
dir=events_dir,
tags=[flags.FLAGS.workload, flags.FLAGS.framework])
wandb.config.update(configs)
wandb.config.update(hyperparameters._asdict())

Expand Down
64 changes: 35 additions & 29 deletions algorithmic_efficiency/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,17 +318,20 @@ def eval_model(self,
imagenet_v2_data_dir: Optional[str],
global_step: int) -> Dict[str, float]:
"""Run a full evaluation of the model."""
logging.info('Evaluating on the training split.')
train_metrics = self._eval_model_on_split(
split='eval_train',
num_examples=self.num_eval_train_examples,
global_batch_size=global_batch_size,
params=params,
model_state=model_state,
rng=rng,
data_dir=data_dir,
global_step=global_step)
eval_metrics = {'train/' + k: v for k, v in train_metrics.items()}
# (nico): skip eval on the training split
eval_metrics = {}
if False:
logging.info('Evaluating on the training split.')
train_metrics = self._eval_model_on_split(
split='eval_train',
num_examples=self.num_eval_train_examples,
global_batch_size=global_batch_size,
params=params,
model_state=model_state,
rng=rng,
data_dir=data_dir,
global_step=global_step)
eval_metrics = {'train/' + k: v for k, v in train_metrics.items()}
# We always require a validation set.
logging.info('Evaluating on the validation split.')
validation_metrics = self._eval_model_on_split(
Expand All @@ -343,24 +346,27 @@ def eval_model(self,
for k, v in validation_metrics.items():
eval_metrics['validation/' + k] = v
eval_metrics['validation/num_examples'] = self.num_validation_examples
# Evaluate on the test set. TODO(znado): always eval on the test set.
try:
if self.num_test_examples is not None:
logging.info('Evaluating on the test split.')
test_metrics = self._eval_model_on_split(
'test',
num_examples=self.num_test_examples,
global_batch_size=global_batch_size,
params=params,
model_state=model_state,
rng=rng,
data_dir=imagenet_v2_data_dir if imagenet_v2_data_dir else data_dir,
global_step=global_step)
for k, v in test_metrics.items():
eval_metrics['test/' + k] = v
eval_metrics['test/num_examples'] = self.num_test_examples
except NotImplementedError:
pass

# (nico): skip eval on test
if False:
# Evaluate on the test set. TODO(znado): always eval on the test set.
try:
if self.num_test_examples is not None:
logging.info('Evaluating on the test split.')
test_metrics = self._eval_model_on_split(
'test',
num_examples=self.num_test_examples,
global_batch_size=global_batch_size,
params=params,
model_state=model_state,
rng=rng,
data_dir=imagenet_v2_data_dir if imagenet_v2_data_dir else data_dir,
global_step=global_step)
for k, v in test_metrics.items():
eval_metrics['test/' + k] = v
eval_metrics['test/num_examples'] = self.num_test_examples
except NotImplementedError:
pass

return eval_metrics

Expand Down
2 changes: 1 addition & 1 deletion script/pytorch/2xa100/auto_run_array.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ study=$6
num_tuning_trials=$7
rng_seed=$8
allow_tf_32=$9
halve_cuda_mem=$10
halve_cuda_mem=${10}

workload_list=(
criteo1tb
Expand Down
4 changes: 2 additions & 2 deletions script/pytorch/2xa100/condor.sub
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ num_jobs=8
workload_or_id=$(Process)
# workload_or_id=criteo1tb
framework=pytorch
submission=prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py
search_space=prize_qualification_baselines/external_tuning/tuning_search_space.json
submission=submissions/lawa_queue/lawa_queue.py
search_space=submissions/lawa_queue/tuning_search_space.json
name=a10040GB_x2_noTF32_01
study=1
num_tuning_trials=1
Expand Down
Loading

0 comments on commit 6c7e69d

Please sign in to comment.