# megatron_llama3_1_8b_nemo.yaml
# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
# Adapted from https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/main/launcher_scripts/conf/training/llama/llama3_1_8b.yaml
run:
  name: llama3_1_8b
  results_dir: ${base_results_dir}/${.name}
  time_limit: "0-01:30:00"
  dependency: "singleton"
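# Note (assumption based on the launcher layout referenced above): this file is composed as the
# `training` config group of NeMo-Framework-Launcher, which is why interpolations below use the
# `${training....}` prefix and why `${base_results_dir}` is expected to be supplied by the launcher.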
trainer:
  num_nodes: 16
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: null
  max_steps: 300000 # consumed_samples = global_step * global_batch_size
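  # Illustrative arithmetic (not enforced anywhere): with model.global_batch_size = 2048,
  # 300000 steps consume ~614.4M samples, i.e. ~5.0T tokens at seq_length 8192.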
  max_time: "05:23:30:00" # days:hours:minutes:seconds
  log_every_n_steps: 10
  val_check_interval: 2000
  limit_val_batches: 32
  limit_test_batches: 50
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir: ${training.run.results_dir}/results
  exp_dir: null
  name: megatron_llama
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: nemo_llama_pretrain
    name: ${training.run.name}
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
    filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
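    # `multiply` is a custom OmegaConf resolver registered by the launcher scripts (assumption based
    # on the upstream NeMo-Framework-Launcher configs); with TP = 1 and PP = 1 below it resolves to 1.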
  log_step_timing: True
  step_timing_kwargs:
    sync_cuda: True
    buffer_size: 5
model:
  mcore_gpt: true
  micro_batch_size: 1
  global_batch_size: 2048
  rampup_batch_size: null
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  context_parallel_size: 1
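  # Sanity check (illustrative, assuming the trainer settings above): 16 nodes x 8 GPUs = 128 GPUs;
  # with TP = PP = CP = 1 the data-parallel size is 128, so each global batch of 2048 decomposes into
  # 128 ranks x micro_batch_size 1 x 16 gradient-accumulation micro-batches.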
  encoder_seq_length: 8192
  max_position_embeddings: 8192
  num_layers: 32
  hidden_size: 4096
  ffn_hidden_size: 14336
  num_attention_heads: 32
  num_query_groups: 8
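  # Grouped-query attention: 32 query heads share 8 KV groups (4 query heads per KV head).
  # With kv_channels left null, the head dimension defaults to hidden_size / num_attention_heads = 128.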
  init_method_std: 0.01
  use_scaled_init_method: true
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  kv_channels: null
  apply_query_key_layer_scaling: true
  normalization: rmsnorm
  layernorm_epsilon: 1.0e-05
  do_layer_norm_weight_decay: false
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  persist_layer_norm: true
  bias: false
  activation: fast-swiglu
  headscale: false
  transformer_block_type: pre_ln
  openai_gelu: false
  normalize_attention_scores: true
  position_embedding_type: rope
  rotary_percentage: 1.0
  apply_rope_fusion: true
  cross_entropy_loss_fusion: true
  attention_type: multihead
  share_embeddings_and_output_weights: false
  scale_positional_embedding: true
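  # Assumption: `scale_positional_embedding: true` enables the Llama 3.1-style RoPE frequency
  # scaling used for long-context pretraining; earlier Llama 3.0 configs leave it false.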
  tokenizer:
    library: 'sentencepiece'
    type: null
    model: <path_to_my_model>/tokenizer.model
    delimiter: null
    vocab_file: null
    merge_file: null
    sentencepiece_legacy: False
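    # `<path_to_my_model>` is a placeholder: point `model` at your SentencePiece tokenizer.model
    # before launching; `vocab_file` and `merge_file` stay null for the sentencepiece library.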
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: true
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: true
  bias_activation_fusion: true
  bias_dropout_add_fusion: true
  masked_softmax_fusion: true
  seed: 1234
  resume_from_checkpoint: null
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  sync_batch_comm: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: false
  deterministic_mode: false
  ## Transformer Engine
  transformer_engine: true
  fp8: False # enables fp8 in TransformerLayer forward
  fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
  fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
  fp8_margin: 0 # scaling margin
  fp8_interval: 1 # scaling update interval
  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
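  # FP8 is fully disabled here, so the run trains in bf16 (trainer.precision) with megatron_amp_O2.
  # A common way to try FP8 on Hopper GPUs is to set `fp8: True` together with `fp8_hybrid: True`
  # (hybrid E4M3/E5M2 recipe); treat that as a suggestion to validate, not a tuned setting.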
  use_emha: False
  ub_tp_comm_overlap: False
  use_flash_attention: true
  optim:
    name: distributed_fused_adam
    lr: 1e-4
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    bucket_cap_mb: 125
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    contiguous_param_buffer: true
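    # distributed_fused_adam is NeMo's Apex-based distributed optimizer: optimizer state is sharded
    # across data-parallel ranks, and the overlap_*/contiguous_* flags overlap gradient and parameter
    # communication with compute (a brief description, not an exhaustive one).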
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 0
      min_lr: 1e-5
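      # Linear warmup over 500 steps to lr = 1e-4, then cosine decay toward min_lr = 1e-5 over the
      # remaining trainer.max_steps (no constant tail, since constant_steps = 0).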
  data:
    data_impl: mock
    splits_string: 99990,8,2
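    # Megatron-style split: train/validation/test in the ratio 99990:8:2 of the dataset
    # (proportions are normalized, so this is ~99.99% train).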
    seq_length: ${training.model.encoder_seq_length}
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: true
    reset_attention_mask: true
    eod_mask_loss: false
    index_mapping_dir: null
    data_prefix: []
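    # `data_impl: mock` with an empty `data_prefix` generates synthetic batches, which is useful for
    # benchmarking the cluster. A sketch of switching to real preprocessed data (the path below is
    # hypothetical; weights and prefixes must match your own Megatron .bin/.idx datasets):
    # data_impl: mmap
    # data_prefix:
    #   - 1.0
    #   - /fsx/datasets/my_corpus_text_document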