Experiments set I: baselines
Tensorboard: https://huggingface.co/bigscience/dechonk-logs-1/tensorboard
Using bloom-6b3 at 156K steps. Model version: https://huggingface.co/bigscience/bloom-6b3/commit/7e0ac14c5fcba85ad009953ab243eef4624f1503. Re-warming up the learning rate to 8e-5 (the last recorded LR in https://huggingface.co/bigscience/tr11f-6B3-logs/tensorboard).
Training params:
deepspeed --num_gpus 8 ./run_clm.py --do_train --do_eval \
--model_name $INITIAL_MODEL_PATH --tokenizer_name $INITIAL_MODEL_PATH \
--dataset_name $DATASET_NAME_OR_PATH --dataset_config_name $DATASET_CONFIG_NAME --run_name $RUN_NAME \
--block_size 2048 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --gradient_accumulation_steps 16 \
--learning_rate 0.00008 --max_grad_norm 1.0 --lr_scheduler_type cosine --max_steps 200000 --warmup_steps 1000 \
--adam_epsilon 1e-8 --weight_decay 0.1 --adam_beta1 0.9 --adam_beta2 0.95 --fp16=True --seed 42 \
--cache_dir $INPUT_PATH/data/cache --output_dir $SNAPSHOT_PATH --overwrite_output_dir=True \
--logging_dir $LOGS_PATH --report_to tensorboard --logging_first_step --logging_steps 10 \
--evaluation_strategy steps --eval_steps 100 --prediction_loss_only --eval_subset_size 512 \
--save_steps 100 --save_total_limit 1 --dataloader_num_workers 4 --deepspeed ds_config.json
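For reference, the effective batch size implied by these flags (shared by all runs on this page) is 512 sequences of 2,048 tokens, i.e. roughly 1M tokens per optimizer step:
# batch-size arithmetic for the flags above; the 8 GPUs come from --num_gpus 8
per_device_batch = 4   # --per_device_train_batch_size
num_gpus = 8           # --num_gpus
grad_accum = 16        # --gradient_accumulation_steps
block_size = 2048      # --block_size
sequences_per_step = per_device_batch * num_gpus * grad_accum  # 512
tokens_per_step = sequences_per_step * block_size              # 1,048,576 (~1M)
print(sequences_per_step, tokens_per_step)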
Why: to get an early checkpoint with which we can test Iz's proposal.

Model: same as above, but with weights re-initialized using the HF BLOOM initializer.
Training params:
deepspeed --num_gpus 8 ./run_clm.py --do_train --do_eval \
--config_name $MODEL_CONFIG_PATH --tokenizer_name $TOKENIZER_PATH \
--dataset_name $DATASET_NAME_OR_PATH --dataset_config_name $DATASET_CONFIG_NAME --run_name $RUN_NAME \
--block_size 2048 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --gradient_accumulation_steps 16 \
--learning_rate 0.00012 --max_grad_norm 1.0 --lr_scheduler_type cosine --max_steps 350000 --warmup_steps 1000 \
--adam_epsilon 1e-8 --weight_decay 0.1 --adam_beta1 0.9 --adam_beta2 0.95 --fp16=True --seed 42 \
--cache_dir $INPUT_PATH/data/cache --output_dir $SNAPSHOT_PATH --overwrite_output_dir=True \
--logging_dir $LOGS_PATH --report_to tensorboard --logging_first_step --logging_steps 10 \
--evaluation_strategy steps --eval_steps 100 --prediction_loss_only --eval_subset_size 512 \
--save_steps 100 --save_total_limit 1 --dataloader_num_workers 4 --deepspeed ds_config.json
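Passing --config_name instead of --model_name presumably makes run_clm.py build the model from the config alone (as in the upstream HF run_clm.py example), so the weights come from the HF BLOOM initializer rather than from the pretrained checkpoint. Roughly, in transformers terms (the model path is a placeholder):
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("bigscience/bloom-6b3")  # same architecture as the pretrained model
model = AutoModelForCausalLM.from_config(config)             # fresh weights: normal init with std=initializer_range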
Run 2: train student-sized model from scratch [tensorboard/bloom-smaller-from-scratch-0.5hid-lr2e-4]
model_config.json:
{
"apply_residual_connection_post_layernorm": false,
"attention_dropout": 0.0,
"attention_softmax_in_fp32": true,
"bias_dropout_fusion": true,
"bos_token_id": 1,
"dtype": "float16",
"eos_token_id": 2,
"pad_token_id": 3,
"unk_token_id": 0,
"hidden_dropout": 0.0,
"initializer_range": 0.02,
"layer_norm_epsilon": 1e-05,
"masked_softmax_fusion": true,
"model_type": "bloom",
"n_embed": 2048,
"n_inner": null,
"n_layer": 30,
"num_attention_heads": 16,
"offset_alibi": 100,
"pretraining_tp": 4,
"seq_length": 2048,
"skip_bias_add": true,
"skip_bias_add_qkv": false,
"transformers_version": "4.20.0.dev0",
"use_cache": false,
"vocab_size": 250880
}
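As a sanity check on the size of this student config, one can instantiate it and count parameters (values copied from the JSON above; the keyword names follow the BloomConfig constructor rather than the serialized aliases n_embed / num_attention_heads):
from transformers import BloomConfig, BloomForCausalLM

config = BloomConfig(
    vocab_size=250880,
    hidden_size=2048,   # "n_embed" above: half of bloom-6b3's 4096
    n_layer=30,
    n_head=16,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
)
model = BloomForCausalLM(config)
print(f"{sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters")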
Training params:
deepspeed --num_gpus 8 ./run_clm.py --do_train --do_eval \
--config_name $MODEL_CONFIG_PATH --tokenizer_name $TOKENIZER_PATH \
--dataset_name $DATASET_NAME_OR_PATH --dataset_config_name $DATASET_CONFIG_NAME --run_name $RUN_NAME \
--block_size 2048 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --gradient_accumulation_steps 16 \
--learning_rate 0.0002 --max_grad_norm 1.0 --lr_scheduler_type cosine --max_steps 350000 --warmup_steps 1000 \
--adam_epsilon 1e-8 --weight_decay 0.1 --adam_beta1 0.9 --adam_beta2 0.95 --fp16=True --seed 42 \
--cache_dir $INPUT_PATH/data/cache --output_dir $SNAPSHOT_PATH --overwrite_output_dir=True \
--logging_dir $LOGS_PATH --report_to tensorboard --logging_first_step --logging_steps 10 \
--evaluation_strategy steps --eval_steps 100 --prediction_loss_only --eval_subset_size 512 \
--save_steps 100 --save_total_limit 5 --dataloader_num_workers 4 --deepspeed ds_config.json
Downsampled model (hidden size halved with the mean aggregation strategy; layer rate 1.0, i.e. all layers kept):
export CONVERTED_MODEL_PATH=./model_downsampled
python downsample_model.py --model_name $INITIAL_MODEL_PATH --output_model_name $CONVERTED_MODEL_PATH --hidden_downsampling_rate 0.5 --aggregation_strategy mean --layer_downsampling_rate 1 --layer_selection_strategy first
cp $INITIAL_MODEL_PATH/*token*.json $CONVERTED_MODEL_PATH
deepspeed --num_gpus 8 ./run_clm.py --do_train --do_eval \
--model_name $CONVERTED_MODEL_PATH --tokenizer_name $CONVERTED_MODEL_PATH \
--dataset_name $DATASET_NAME_OR_PATH --dataset_config_name $DATASET_CONFIG_NAME --run_name $RUN_NAME \
--block_size 2048 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --gradient_accumulation_steps 16 \
--learning_rate 0.0002 --max_grad_norm 1.0 --lr_scheduler_type cosine --max_steps 3200000 --warmup_steps 16000 \
--adam_epsilon 1e-8 --weight_decay 0.1 --adam_beta1 0.9 --adam_beta2 0.95 --fp16=True --seed 42 \
--cache_dir $INPUT_PATH/data/cache --output_dir $SNAPSHOT_PATH --overwrite_output_dir=True \
--logging_dir $LOGS_PATH --report_to tensorboard --logging_first_step --logging_steps 10 \
--evaluation_strategy steps --eval_steps 100 --prediction_loss_only --eval_subset_size 512 \
--save_steps 500 --save_total_limit 1 --dataloader_num_workers 4 --ignore_data_skip \
--deepspeed ds_config.json
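downsample_model.py is the project's own conversion script and its internals are not shown on this page. As a rough illustration of what hidden downsampling with mean aggregation means, averaging groups of adjacent units along each dimension of a weight matrix could look like the following (a sketch under assumed semantics, not the actual script; the real conversion also has to handle attention heads, biases and embeddings):
import torch

def mean_pool_dim(weight: torch.Tensor, dim: int, rate: float = 0.5) -> torch.Tensor:
    # average groups of consecutive units along `dim`, e.g. 4096 -> 2048 for rate 0.5
    group = round(1 / rate)
    shape = list(weight.shape)
    shape[dim] = shape[dim] // group
    shape.insert(dim + 1, group)
    return weight.reshape(shape).mean(dim=dim + 1)

w = torch.randn(4096, 4096)                      # e.g. one dense weight of bloom-6b3
w_small = mean_pool_dim(mean_pool_dim(w, 0), 1)  # halve both dimensions
print(w_small.shape)                             # torch.Size([2048, 2048])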
Downsampled model (hidden size kept; layer count reduced at rate 0.2334 with the step selection strategy):
export CONVERTED_MODEL_PATH=./model_downsampled
python downsample_model.py --model_name $INITIAL_MODEL_PATH --output_model_name $CONVERTED_MODEL_PATH --hidden_downsampling_rate 1.0 --aggregation_strategy first --layer_downsampling_rate 0.2334 --layer_selection_strategy step
cp $INITIAL_MODEL_PATH/*token*.json $CONVERTED_MODEL_PATH
deepspeed --num_gpus 8 ./run_clm.py --do_train --do_eval \
--model_name $CONVERTED_MODEL_PATH --tokenizer_name $CONVERTED_MODEL_PATH \
--dataset_name $DATASET_NAME_OR_PATH --dataset_config_name $DATASET_CONFIG_NAME --run_name $RUN_NAME \
--block_size 2048 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --gradient_accumulation_steps 16 \
--learning_rate 0.0002 --max_grad_norm 1.0 --lr_scheduler_type cosine --max_steps 3200000 --warmup_steps 16000 \
--adam_epsilon 1e-8 --weight_decay 0.1 --adam_beta1 0.9 --adam_beta2 0.95 --fp16=True --seed 42 \
--cache_dir $INPUT_PATH/data/cache --output_dir $SNAPSHOT_PATH --overwrite_output_dir=True \
--logging_dir $LOGS_PATH --report_to tensorboard --logging_first_step --logging_steps 10 \
--evaluation_strategy steps --eval_steps 100 --prediction_loss_only --eval_subset_size 512 \
--save_steps 500 --save_total_limit 1 --dataloader_num_workers 4 --ignore_data_skip \
--deepspeed ds_config.json
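Here the hidden size is left intact (rate 1.0) and only layers are dropped: a rate of 0.2334 over bloom-6b3's 30 layers keeps roughly 7 of them. A plausible reading of the step selection strategy is an approximately even stride over layer indices; this is an assumption about downsample_model.py, not its actual code:
def select_layers_step(n_layers: int = 30, rate: float = 0.2334) -> list:
    # keep round(rate * n_layers) layers at an approximately even stride
    n_keep = round(rate * n_layers)          # 7 for 0.2334 * 30
    stride = n_layers / n_keep
    return [int(i * stride) for i in range(n_keep)]

print(select_layers_step())                  # [0, 4, 8, 12, 17, 21, 25]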
Downsampled model (hidden size kept; layer count reduced at rate 0.2334 with the mean selection strategy):
export CONVERTED_MODEL_PATH=./model_downsampled
python downsample_model.py --model_name $INITIAL_MODEL_PATH --output_model_name $CONVERTED_MODEL_PATH --hidden_downsampling_rate 1.0 --aggregation_strategy first --layer_downsampling_rate 0.2334 --layer_selection_strategy mean
cp $INITIAL_MODEL_PATH/*token*.json $CONVERTED_MODEL_PATH
deepspeed --num_gpus 8 ./run_clm.py --do_train --do_eval \
--model_name $CONVERTED_MODEL_PATH --tokenizer_name $CONVERTED_MODEL_PATH \
--dataset_name $DATASET_NAME_OR_PATH --dataset_config_name $DATASET_CONFIG_NAME --run_name $RUN_NAME \
--block_size 2048 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --gradient_accumulation_steps 16 \
--learning_rate 0.0002 --max_grad_norm 1.0 --lr_scheduler_type cosine --max_steps 3200000 --warmup_steps 16000 \
--adam_epsilon 1e-8 --weight_decay 0.1 --adam_beta1 0.9 --adam_beta2 0.95 --fp16=True --seed 42 \
--cache_dir $INPUT_PATH/data/cache --output_dir $SNAPSHOT_PATH --overwrite_output_dir=True \
--logging_dir $LOGS_PATH --report_to tensorboard --logging_first_step --logging_steps 10 \
--evaluation_strategy steps --eval_steps 100 --prediction_loss_only --eval_subset_size 512 \
--save_steps 500 --save_total_limit 1 --dataloader_num_workers 4 --ignore_data_skip \
--deepspeed ds_config.json
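This last variant differs from the previous one only in the layer strategy: instead of picking one layer per group, mean presumably averages the weights of each group of consecutive teacher layers into a single student layer. A minimal sketch of that idea (assumed semantics; the real downsample_model.py may differ):
import torch

def merge_layer_group(state_dicts: list) -> dict:
    # average the parameters of a group of consecutive teacher layers into one student layer
    return {
        name: torch.stack([sd[name] for sd in state_dicts]).mean(dim=0)
        for name in state_dicts[0]
    }

# Hypothetical usage: collapse teacher layers 0..3 into student layer 0, e.g.
# student_sd = merge_layer_group([teacher_layers[i].state_dict() for i in range(4)])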