-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtrain_raybohb.py
96 lines (83 loc) · 3.12 KB
/
train_raybohb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from typing import Optional
import os
import shutil
import json
import utils.util as utils
from datetime import datetime
# import wandb
from ray import tune
from ray.tune.suggest.bohb import TuneBOHB
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.schedulers import PopulationBasedTraining, HyperBandForBOHB, ASHAScheduler
from ray.tune import CLIReporter
# from ray.tune.integration.wandb import WandbLoggerCallback
from ray.tune.logger import DEFAULT_LOGGERS
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import BestRun
from ray.tune.stopper import TrialPlateauStopper
from pathlib import Path
def hyperparameter_tune(trainer: Trainer, training_args: TrainingArguments, experiment_name) -> BestRun:
    """Run a BOHB hyperparameter search through ``trainer`` on the Ray backend.

    The search samples ``num_train_epochs``, ``learning_rate`` and
    ``warmup_ratio`` and maximizes the ``"eval_micro f1 score"`` metric over
    10 trials.

    Args:
        trainer: Object whose ``hyperparameter_search`` method is invoked.
        training_args: Only ``num_train_epochs`` is read here; twice its
            integer value is used as the scheduler's ``max_t``.
        experiment_name: Passed as the run ``name`` (together with
            ``local_dir="./raytune_log"``).

    Returns:
        The value returned by ``trainer.hyperparameter_search``.
    """
    # Single source for the metric key used by the reporter, the stopper,
    # the objective and the search itself.
    target_metric = "eval_micro f1 score"
    progress_attr = "epoch"  # alternative considered: "training_iteration"
    resume_previous = False

    def _search_space(_trial):
        # Only these three hyperparameters are searched; weight decay,
        # dropout probabilities and batch size are currently left out.
        return dict(
            num_train_epochs=tune.randint(5, 12),
            learning_rate=tune.loguniform(5e-6, 3e-5),
            warmup_ratio=tune.uniform(0, 0.5),
        )

    def _objective(eval_metrics):
        # The value to optimize is the micro F1 score from the reported metrics.
        return eval_metrics[target_metric]

    # Budget is measured in `progress_attr`; doubled to allow for a grace period.
    epoch_budget = 2 * int(training_args.num_train_epochs)
    bohb_scheduler = HyperBandForBOHB(
        time_attr=progress_attr,
        max_t=epoch_budget,
        reduction_factor=4,
        stop_last_trials=True,
    )

    # No explicit config space is given to the searcher.
    bohb_search = TuneBOHB(max_concurrent=4)

    # Short column headers for the sampled hyperparameters in the CLI table.
    column_aliases = {
        "learning_rate": "lr",
        "warmup_ratio": "wr",
        "num_train_epochs": "num_epochs",
    }
    shown_metrics = ["eval_auprc", "eval_loss", target_metric, "epoch", "training_iteration"]
    cli_reporter = CLIReporter(
        parameter_columns=column_aliases,
        metric_columns=shown_metrics,
    )

    # Extra keyword arguments given to `hyperparameter_search` beyond its
    # own named parameters (presumably forwarded to Ray Tune's run call —
    # see the transformers documentation to confirm).
    extra_tune_options = {
        "metric": target_metric,
        "mode": "max",
        "scheduler": bohb_scheduler,
        "search_alg": bohb_search,
        "checkpoint_score_attr": "epoch",
        "stop": TrialPlateauStopper(target_metric),
        "progress_reporter": cli_reporter,
        "local_dir": "./raytune_log",
        "name": experiment_name,
        "log_to_file": True,
        "fail_fast": True,
        "resume": resume_previous,
    }

    return trainer.hyperparameter_search(
        hp_space=_search_space,
        compute_objective=_objective,
        n_trials=10,
        direction="maximize",
        backend="ray",
        **extra_tune_options,
    )