-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
speedyspeech code adapt for npu (#3804)
* speedyspeech code adapt for npu * fix npu inference * fix e2e synthesize * add paddle version control for memory optim config * fix code style * fix code style * fix help message * fix code style * fix help message
- Loading branch information
1 parent
d615fc3
commit 0b56813
Showing
10 changed files
with
392 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#!/bin/bash
# Runs inference.py for the speedyspeech_csmsc acoustic model on an NPU device,
# once per enabled stage, each stage using a different vocoder.
#
# Usage:       <this script> <train_output_path>
# Arguments:   $1 - training output directory; the model is read from
#                   <train_output_path>/inference and audio is written to
#                   <train_output_path>/pd_infer_out
# Environment: BIN_DIR - directory used to locate ../inference.py and
#                        ../../assets/sentences.txt

if [ $# -lt 1 ]; then
  echo "usage: $0 <train_output_path>" >&2
  exit 1
fi
# Without BIN_DIR the script path would silently collapse to /../inference.py.
: "${BIN_DIR:?BIN_DIR must be set}"

train_output_path=$1

# Only stages within [stage, stop_stage] are executed.
stage=0
stop_stage=0

#######################################
# Run inference.py with the options shared by every stage.
# Globals:   BIN_DIR (read), train_output_path (read)
# Arguments: $1 - vocoder name passed to --voc
# Returns:   exit status of python3
#######################################
run_inference() {
  local voc=$1
  python3 "${BIN_DIR}/../inference.py" \
    --inference_dir="${train_output_path}/inference" \
    --am=speedyspeech_csmsc \
    --voc="${voc}" \
    --text="${BIN_DIR}/../../assets/sentences.txt" \
    --output_dir="${train_output_path}/pd_infer_out" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --device npu
}

# pwgan
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  run_inference pwgan_csmsc
fi

# for more GAN Vocoders
# multi band melgan
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  run_inference mb_melgan_csmsc
fi

# hifigan
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  run_inference hifigan_csmsc
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#!/bin/bash
# Runs synthesize_e2e.py (text to speech, --lang=zh) for the speedyspeech_csmsc
# acoustic model with --ngpu=0 --nnpu=1, once per enabled stage, each stage
# using a different pretrained vocoder.
#
# Usage:       <this script> <config_path> <train_output_path> <ckpt_name>
# Arguments:   $1 - acoustic model config file (--am_config)
#              $2 - training output directory; the checkpoint is read from
#                   <train_output_path>/checkpoints/<ckpt_name> and audio is
#                   written to <train_output_path>/test_e2e
#              $3 - checkpoint file name
# Environment: BIN_DIR - directory used to locate ../synthesize_e2e.py and
#                        ../../assets/sentences.txt

if [ $# -lt 3 ]; then
  echo "usage: $0 <config_path> <train_output_path> <ckpt_name>" >&2
  exit 1
fi
# Without BIN_DIR the script path would silently collapse to /../synthesize_e2e.py.
: "${BIN_DIR:?BIN_DIR must be set}"

config_path=$1
train_output_path=$2
ckpt_name=$3

# Only stages within [stage, stop_stage] are executed.
stage=0
stop_stage=0

# Option that additionally passes <train_output_path>/inference to the script.
inference_dir_opt="--inference_dir=${train_output_path}/inference"

#######################################
# Run synthesize_e2e.py with the options shared by every stage.
# FLAGS_allocator_strategy=naive_best_fit is set in the environment of the
# python3 process only.
# Globals:   BIN_DIR, config_path, train_output_path, ckpt_name (all read)
# Arguments: $1 - vocoder name (--voc)
#            $2 - vocoder config file (--voc_config)
#            $3 - vocoder checkpoint (--voc_ckpt)
#            $4 - vocoder statistics file (--voc_stat)
#            $5... - extra options appended verbatim to the command line
# Returns:   exit status of python3
#######################################
run_synthesize_e2e() {
  local voc=$1
  local voc_config=$2
  local voc_ckpt=$3
  local voc_stat=$4
  shift 4
  FLAGS_allocator_strategy=naive_best_fit \
  python3 "${BIN_DIR}/../synthesize_e2e.py" \
    --am=speedyspeech_csmsc \
    --am_config="${config_path}" \
    --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
    --am_stat=dump/train/feats_stats.npy \
    --voc="${voc}" \
    --voc_config="${voc_config}" \
    --voc_ckpt="${voc_ckpt}" \
    --voc_stat="${voc_stat}" \
    --lang=zh \
    --text="${BIN_DIR}/../../assets/sentences.txt" \
    --output_dir="${train_output_path}/test_e2e" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --ngpu=0 \
    --nnpu=1 \
    "$@"
}

# pwgan
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  run_synthesize_e2e pwgan_csmsc \
    pwg_baker_ckpt_0.4/pwg_default.yaml \
    pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    pwg_baker_ckpt_0.4/pwg_stats.npy \
    "${inference_dir_opt}"
fi

# for more GAN Vocoders
# multi band melgan
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  run_synthesize_e2e mb_melgan_csmsc \
    mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
    mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
    mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
    "${inference_dir_opt}"
fi

# style melgan
# The pretrained models have not been released yet.
# Style MelGAN's dygraph-to-static-graph conversion is not ready yet, so
# --inference_dir is deliberately not passed in this stage.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  run_synthesize_e2e style_melgan_csmsc \
    style_melgan_csmsc_ckpt_0.1.1/default.yaml \
    style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
    style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy
fi

# hifigan
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  run_synthesize_e2e hifigan_csmsc \
    hifigan_csmsc_ckpt_0.1.1/default.yaml \
    hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
    hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
    "${inference_dir_opt}"
fi

# wavernn
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  echo "in wavernn syn_e2e"
  run_synthesize_e2e wavernn_csmsc \
    wavernn_csmsc_ckpt_0.2.0/default.yaml \
    wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
    wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
    "${inference_dir_opt}"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
#!/bin/bash
# Runs synthesize.py on the test metadata (dump/test/norm/metadata.jsonl) for
# the speedyspeech_csmsc acoustic model with --ngpu=0 --nnpu=1, once per
# enabled stage, each stage using a different pretrained vocoder.
#
# Usage:       <this script> <config_path> <train_output_path> <ckpt_name>
# Arguments:   $1 - acoustic model config file (--am_config)
#              $2 - training output directory; the checkpoint is read from
#                   <train_output_path>/checkpoints/<ckpt_name> and output is
#                   written to <train_output_path>/test
#              $3 - checkpoint file name
# Environment: BIN_DIR - directory used to locate ../synthesize.py

if [ $# -lt 3 ]; then
  echo "usage: $0 <config_path> <train_output_path> <ckpt_name>" >&2
  exit 1
fi
# Without BIN_DIR the script path would silently collapse to /../synthesize.py.
: "${BIN_DIR:?BIN_DIR must be set}"

config_path=$1
train_output_path=$2
ckpt_name=$3

# Only stages within [stage, stop_stage] are executed.
stage=0
stop_stage=0

#######################################
# Run synthesize.py with the options shared by every stage.
# FLAGS_allocator_strategy=naive_best_fit is set in the environment of the
# python3 process only.
# Globals:   BIN_DIR, config_path, train_output_path, ckpt_name (all read)
# Arguments: $1 - vocoder name (--voc)
#            $2 - vocoder config file (--voc_config)
#            $3 - vocoder checkpoint (--voc_ckpt)
#            $4 - vocoder statistics file (--voc_stat)
# Returns:   exit status of python3
#######################################
run_synthesize() {
  local voc=$1
  local voc_config=$2
  local voc_ckpt=$3
  local voc_stat=$4
  FLAGS_allocator_strategy=naive_best_fit \
  python3 "${BIN_DIR}/../synthesize.py" \
    --am=speedyspeech_csmsc \
    --am_config="${config_path}" \
    --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
    --am_stat=dump/train/feats_stats.npy \
    --voc="${voc}" \
    --voc_config="${voc_config}" \
    --voc_ckpt="${voc_ckpt}" \
    --voc_stat="${voc_stat}" \
    --test_metadata=dump/test/norm/metadata.jsonl \
    --output_dir="${train_output_path}/test" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --ngpu=0 \
    --nnpu=1
}

# pwgan
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  run_synthesize pwgan_csmsc \
    pwg_baker_ckpt_0.4/pwg_default.yaml \
    pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    pwg_baker_ckpt_0.4/pwg_stats.npy
fi

# for more GAN Vocoders
# multi band melgan
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  run_synthesize mb_melgan_csmsc \
    mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
    mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
    mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy
fi

# style melgan
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  run_synthesize style_melgan_csmsc \
    style_melgan_csmsc_ckpt_0.1.1/default.yaml \
    style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
    style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy
fi

# hifigan
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  echo "in hifigan syn"
  run_synthesize hifigan_csmsc \
    hifigan_csmsc_ckpt_0.1.1/default.yaml \
    hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
    hifigan_csmsc_ckpt_0.1.1/feats_stats.npy
fi

# wavernn
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  echo "in wavernn syn"
  run_synthesize wavernn_csmsc \
    wavernn_csmsc_ckpt_0.2.0/default.yaml \
    wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
    wavernn_csmsc_ckpt_0.2.0/feats_stats.npy
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash
# Launches train.py with --ngpu=0 --nnpu=1 on the normalized train/dev
# metadata under dump/.
#
# The shebang must be the very first line of the file to take effect; the
# original had a blank line in front of it.
#
# Usage:       <this script> <config_path> <train_output_path>
# Arguments:   $1 - training config file (--config)
#              $2 - output directory (--output-dir)
# Environment: BIN_DIR - directory that contains train.py

if [ $# -lt 2 ]; then
  echo "usage: $0 <config_path> <train_output_path>" >&2
  exit 1
fi
# Without BIN_DIR the script path would silently collapse to /train.py.
: "${BIN_DIR:?BIN_DIR must be set}"

config_path=$1
train_output_path=$2

python "${BIN_DIR}/train.py" \
  --train-metadata=dump/train/norm/metadata.jsonl \
  --dev-metadata=dump/dev/norm/metadata.jsonl \
  --config="${config_path}" \
  --output-dir="${train_output_path}" \
  --ngpu=0 \
  --nnpu=1 \
  --phones-dict=dump/phone_id_map.txt \
  --tones-dict=dump/tone_id_map.txt \
  --use-relative-path=True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#!/bin/bash
# Stage-driven pipeline: preprocess (0), train (1), synthesize (2),
# synthesize_e2e (3) and inference (4). Stages 1-4 call the ./local/*_npu.sh
# scripts with FLAGS_selected_npus set to ${npus}.

set -e
source path.sh

# Value exported as FLAGS_selected_npus to each NPU stage.
npus=0
# Only stages within [stage, stop_stage] are executed.
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_76.pdz

# With the options below you can choose the stage range you want to run,
# e.g. pass `--stage 0 --stop-stage 0` to this script.
# These options cannot be mixed with positional arguments (`$1`, `$2`, ...).
# NOTE(review): MAIN_ROOT is presumably defined by path.sh - confirm.
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  # prepare data
  ./local/preprocess.sh "${conf_path}" || exit 1
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  # train model, all `ckpt` under `train_output_path/checkpoints/` dir
  FLAGS_selected_npus="${npus}" ./local/train_npu.sh "${conf_path}" "${train_output_path}" || exit 1
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  # synthesize, vocoder is pwgan by default
  FLAGS_selected_npus="${npus}" ./local/synthesize_npu.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit 1
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  # synthesize_e2e, vocoder is pwgan by default
  FLAGS_selected_npus="${npus}" ./local/synthesize_e2e_npu.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit 1
fi

if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  # inference with static model
  FLAGS_selected_npus="${npus}" ./local/inference_npu.sh "${train_output_path}" || exit 1
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.