From d554db7e32f6944903e8bc7d37840c970d94ada7 Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Tue, 22 Oct 2024 10:27:36 +0800
Subject: [PATCH] fix hifigan init bug

---
 cosyvoice/bin/train.py             |  11 ++-
 cosyvoice/utils/executor.py        |  24 +++---
 cosyvoice/utils/train_utils.py     | 115 +++++++++++++++++++----------
 examples/libritts/cosyvoice/run.sh |   1 +
 4 files changed, 96 insertions(+), 55 deletions(-)

diff --git a/cosyvoice/bin/train.py b/cosyvoice/bin/train.py
index 2fd32c8..338375e 100644
--- a/cosyvoice/bin/train.py
+++ b/cosyvoice/bin/train.py
@@ -68,6 +68,10 @@ def get_args():
                         action='store_true',
                         default=False,
                         help='Use pinned memory buffers used for reading')
+    parser.add_argument('--use_amp',
+                        action='store_true',
+                        default=False,
+                        help='Use automatic mixed precision training')
     parser.add_argument('--deepspeed.save_states',
                         dest='save_states',
                         default='model_only',
@@ -133,6 +137,9 @@ def main():
     # Get executor
     executor = Executor(gan=gan)
 
+    # Init scaler, used for pytorch amp mixed precision training
+    scaler = torch.cuda.amp.GradScaler() if args.use_amp else None
+
     # Start training loop
     for epoch in range(info_dict['max_epoch']):
         executor.epoch = epoch
@@ -141,9 +148,9 @@ def main():
         group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
         if gan is True:
             executor.train_one_epoc_gan(model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader,
-                                        writer, info_dict, group_join)
+                                        writer, info_dict, scaler, group_join)
         else:
-            executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
+            executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join)
         dist.destroy_process_group(group_join)
diff --git a/cosyvoice/utils/executor.py b/cosyvoice/utils/executor.py
index 14550fa..8c38bf0 100644
--- a/cosyvoice/utils/executor.py
+++ b/cosyvoice/utils/executor.py
@@ -32,7 +32,7 @@ def __init__(self, gan: bool = False):
         self.rank = int(os.environ.get('RANK', 0))
         self.device = torch.device('cuda:{}'.format(self.rank))
 
-    def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join):
+    def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join):
         ''' Train one epoch
         '''
@@ -65,10 +65,10 @@ def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data
                     context = nullcontext
 
                 with context():
-                    info_dict = batch_forward(model, batch_dict, info_dict)
-                    info_dict = batch_backward(model, info_dict)
+                    info_dict = batch_forward(model, batch_dict, scaler, info_dict)
+                    info_dict = batch_backward(model, scaler, info_dict)
 
-                info_dict = update_parameter_and_lr(model, optimizer, scheduler, info_dict)
+                info_dict = update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict)
                 log_per_step(writer, info_dict)
                 # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save
                 if info_dict['save_per_step'] > 0 and (self.step + 1) % info_dict['save_per_step'] == 0 and \
@@ -82,7 +82,7 @@ def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data
         self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True)
 
     def train_one_epoc_gan(self, model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader,
-                           writer, info_dict, group_join):
+                           writer, info_dict, scaler, group_join):
         ''' Train one epoch
         '''
@@ -116,16 +116,16 @@ def train_one_epoc_gan(self, model, optimizer, scheduler, optimizer_d, scheduler
 
                 with context():
                     batch_dict['turn'] = 'discriminator'
-                    info_dict = batch_forward(model, batch_dict, info_dict)
-                    info_dict = batch_backward(model, info_dict)
-                    info_dict = update_parameter_and_lr(model, optimizer_d, scheduler_d, info_dict)
+                    info_dict = batch_forward(model, batch_dict, scaler, info_dict)
+                    info_dict = batch_backward(model, scaler, info_dict)
+                    info_dict = update_parameter_and_lr(model, optimizer_d, scheduler_d, scaler, info_dict)
                 optimizer.zero_grad()
                 log_per_step(writer, info_dict)
                 with context():
                     batch_dict['turn'] = 'generator'
-                    info_dict = batch_forward(model, batch_dict, info_dict)
-                    info_dict = batch_backward(model, info_dict)
-                    info_dict = update_parameter_and_lr(model, optimizer, scheduler, info_dict)
+                    info_dict = batch_forward(model, batch_dict, scaler, info_dict)
+                    info_dict = batch_backward(model, scaler, info_dict)
+                    info_dict = update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict)
                 optimizer_d.zero_grad()
                 log_per_step(writer, info_dict)
                 # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save
@@ -157,7 +157,7 @@ def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True):
             if self.gan is True:
                 batch_dict['turn'] = 'generator'
-            info_dict = batch_forward(model, batch_dict, info_dict)
+            info_dict = batch_forward(model, batch_dict, None, info_dict)
 
             for k, v in info_dict['loss_dict'].items():
                 if k not in total_loss_dict:
diff --git a/cosyvoice/utils/train_utils.py b/cosyvoice/utils/train_utils.py
index 65e83d6..8678422 100644
--- a/cosyvoice/utils/train_utils.py
+++ b/cosyvoice/utils/train_utils.py
@@ -110,38 +110,60 @@ def wrap_cuda_model(args, model):
 
 def init_optimizer_and_scheduler(args, configs, model, gan):
-    if configs['train_conf']['optim'] == 'adam':
-        optimizer = optim.Adam(model.parameters(), **configs['train_conf']['optim_conf'])
-    elif configs['train_conf']['optim'] == 'adamw':
-        optimizer = optim.AdamW(model.parameters(), **configs['train_conf']['optim_conf'])
-    else:
-        raise ValueError("unknown optimizer: " + configs['train_conf'])
-
-    if configs['train_conf']['scheduler'] == 'warmuplr':
-        scheduler_type = WarmupLR
-        scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf'])
-    elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing':
-        scheduler_type = NoamHoldAnnealing
-        scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf'])
-    elif configs['train_conf']['scheduler'] == 'constantlr':
-        scheduler_type = ConstantLR
-        scheduler = ConstantLR(optimizer)
+    if gan is False:
+        if configs['train_conf']['optim'] == 'adam':
+            optimizer = optim.Adam(model.parameters(), **configs['train_conf']['optim_conf'])
+        elif configs['train_conf']['optim'] == 'adamw':
+            optimizer = optim.AdamW(model.parameters(), **configs['train_conf']['optim_conf'])
+        else:
+            raise ValueError("unknown optimizer: " + configs['train_conf'])
+
+        if configs['train_conf']['scheduler'] == 'warmuplr':
+            scheduler_type = WarmupLR
+            scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf'])
+        elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing':
+            scheduler_type = NoamHoldAnnealing
+            scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf'])
+        elif configs['train_conf']['scheduler'] == 'constantlr':
+            scheduler_type = ConstantLR
+            scheduler = ConstantLR(optimizer)
+        else:
+            raise ValueError("unknown scheduler: " + configs['train_conf'])
+
+        # use deepspeed optimizer for speedup
+        if args.train_engine == "deepspeed":
+            def scheduler(opt):
+                return scheduler_type(opt, **configs['train_conf']['scheduler_conf'])
+            model, optimizer, _, scheduler = deepspeed.initialize(
+                args=args,
+                model=model,
+                optimizer=None,
+                lr_scheduler=scheduler,
+                model_parameters=model.parameters())
+
+        optimizer_d, scheduler_d = None, None
+
     else:
-        raise ValueError("unknown scheduler: " + configs['train_conf'])
-
-    # use deepspeed optimizer for speedup
-    if args.train_engine == "deepspeed":
-        def scheduler(opt):
-            return scheduler_type(opt, **configs['train_conf']['scheduler_conf'])
-        model, optimizer, _, scheduler = deepspeed.initialize(
-            args=args,
-            model=model,
-            optimizer=None,
-            lr_scheduler=scheduler,
-            model_parameters=model.parameters())
-
-    # currently we wrap generator and discriminator in one model, so we cannot use deepspeed
-    if gan is True:
+        # currently we wrap generator and discriminator in one model, so we cannot use deepspeed
+        if configs['train_conf']['optim'] == 'adam':
+            optimizer = optim.Adam(model.module.generator.parameters(), **configs['train_conf']['optim_conf'])
+        elif configs['train_conf']['optim'] == 'adamw':
+            optimizer = optim.AdamW(model.module.generator.parameters(), **configs['train_conf']['optim_conf'])
+        else:
+            raise ValueError("unknown optimizer: " + configs['train_conf'])
+
+        if configs['train_conf']['scheduler'] == 'warmuplr':
+            scheduler_type = WarmupLR
+            scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf'])
+        elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing':
+            scheduler_type = NoamHoldAnnealing
+            scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf'])
+        elif configs['train_conf']['scheduler'] == 'constantlr':
+            scheduler_type = ConstantLR
+            scheduler = ConstantLR(optimizer)
+        else:
+            raise ValueError("unknown scheduler: " + configs['train_conf'])
+
         if configs['train_conf']['optim_d'] == 'adam':
             optimizer_d = optim.Adam(model.module.discriminator.parameters(), **configs['train_conf']['optim_conf'])
         elif configs['train_conf']['optim_d'] == 'adamw':
             optimizer_d = optim.AdamW(model.module.discriminator.parameters(), **configs['train_conf']['optim_conf'])
@@ -160,8 +182,6 @@ def scheduler(opt):
             scheduler_d = ConstantLR(optimizer_d)
         else:
             raise ValueError("unknown scheduler: " + configs['train_conf'])
-    else:
-        optimizer_d, scheduler_d = None, None
     return model, optimizer, scheduler, optimizer_d, scheduler_d
@@ -216,7 +236,7 @@ def cosyvoice_join(group_join, info_dict):
         return False
 
 
-def batch_forward(model, batch, info_dict):
+def batch_forward(model, batch, scaler, info_dict):
     device = int(os.environ.get('LOCAL_RANK', 0))
 
     dtype = info_dict["dtype"]
@@ -228,7 +248,7 @@ def batch_forward(model, batch, info_dict):
         dtype = torch.float32
 
     if info_dict['train_engine'] == 'torch_ddp':
-        autocast = nullcontext()
+        autocast = torch.cuda.amp.autocast(enabled=scaler is not None)
     else:
         autocast = torch.cuda.amp.autocast(enabled=True, dtype=dtype, cache_enabled=False)
@@ -237,27 +257,40 @@ def batch_forward(model, batch, info_dict):
     return info_dict
 
 
-def batch_backward(model, info_dict):
+def batch_backward(model, scaler, info_dict):
     if info_dict["train_engine"] == "deepspeed":
         scaled_loss = model.backward(info_dict['loss_dict']['loss'])
     else:
         scaled_loss = info_dict['loss_dict']['loss'] / info_dict['accum_grad']
-        scaled_loss.backward()
+        if scaler is not None:
+            scaler.scale(scaled_loss).backward()
+        else:
+            scaled_loss.backward()
 
     info_dict['loss_dict']['loss'] = scaled_loss
     return info_dict
 
 
-def update_parameter_and_lr(model, optimizer, scheduler, info_dict):
+def update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict):
     grad_norm = 0.0
     if info_dict['train_engine'] == "deepspeed":
         info_dict["is_gradient_accumulation_boundary"] = model.is_gradient_accumulation_boundary()
         model.step()
         grad_norm = model.get_global_grad_norm()
     elif (info_dict['batch_idx'] + 1) % info_dict["accum_grad"] == 0:
-        grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip'])
-        if torch.isfinite(grad_norm):
-            optimizer.step()
+        # Use mixed precision training
+        if scaler is not None:
+            scaler.unscale_(optimizer)
+            grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip'])
+            # We don't check grad here since that if the gradient
+            # has inf/nan values, scaler.step will skip
+            # optimizer.step().
+            scaler.step(optimizer)
+            scaler.update()
+        else:
+            grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip'])
+            if torch.isfinite(grad_norm):
+                optimizer.step()
     optimizer.zero_grad()
     scheduler.step()
     info_dict["lr"] = optimizer.param_groups[0]['lr']
diff --git a/examples/libritts/cosyvoice/run.sh b/examples/libritts/cosyvoice/run.sh
index 5ed5a61..b26d7f0 100644
--- a/examples/libritts/cosyvoice/run.sh
+++ b/examples/libritts/cosyvoice/run.sh
@@ -99,6 +99,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
       --num_workers ${num_workers} \
       --prefetch ${prefetch} \
      --pin_memory \
+      --use_amp \
      --deepspeed_config ./conf/ds_stage2.json \
      --deepspeed.save_states model+optimizer
  done