# megatron_llama3_1_8b_nemo.yaml
# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
# Adapted from https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/main/launcher_scripts/conf/training/llama/llama3_1_8b.yaml
run:
  name: llama3_1_8b
  results_dir: ${base_results_dir}/${.name}
  time_limit: "0-01:30:00"
  dependency: "singleton"
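# Note (assumption based on the launcher layout referenced above): this file is composed as the
# `training` config group of NeMo-Framework-Launcher, which is why interpolations below use the
# `${training....}` prefix and why `${base_results_dir}` is expected to be supplied by the launcher.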
trainer:
  num_nodes: 16
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: null
  max_steps: 300000 # consumed_samples = global_step * global_batch_size
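  # Illustrative arithmetic (not enforced anywhere): with model.global_batch_size = 2048,
  # 300000 steps consume ~614.4M samples, i.e. ~5.0T tokens at seq_length 8192.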
  max_time: "05:23:30:00" # days:hours:minutes:seconds
  log_every_n_steps: 10
  val_check_interval: 2000
  limit_val_batches: 32
  limit_test_batches: 50
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir: ${training.run.results_dir}/results
  exp_dir: null
  name: megatron_llama
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: nemo_llama_pretrain
    name: ${training.run.name}
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
    filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
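    # `multiply` is a custom OmegaConf resolver registered by the launcher scripts (assumption based
    # on the upstream NeMo-Framework-Launcher configs); with TP = 1 and PP = 1 below it resolves to 1.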
  log_step_timing: True
  step_timing_kwargs:
    sync_cuda: True
    buffer_size: 5
model:
  mcore_gpt: true
  micro_batch_size: 1
  global_batch_size: 2048
  rampup_batch_size: null
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  context_parallel_size: 1
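  # Sanity check (illustrative, assuming the trainer settings above): 16 nodes x 8 GPUs = 128 GPUs;
  # with TP = PP = CP = 1 the data-parallel size is 128, so each global batch of 2048 decomposes into
  # 128 ranks x micro_batch_size 1 x 16 gradient-accumulation micro-batches.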
  encoder_seq_length: 8192
  max_position_embeddings: 8192
  num_layers: 32
  hidden_size: 4096
  ffn_hidden_size: 14336
  num_attention_heads: 32
  num_query_groups: 8
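  # Grouped-query attention: 32 query heads share 8 KV groups (4 query heads per KV head).
  # With kv_channels left null, the head dimension defaults to hidden_size / num_attention_heads = 128.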
  init_method_std: 0.01
  use_scaled_init_method: true
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  kv_channels: null
  apply_query_key_layer_scaling: true
  normalization: rmsnorm
  layernorm_epsilon: 1.0e-05
  do_layer_norm_weight_decay: false
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  persist_layer_norm: true
  bias: false
  activation: fast-swiglu
  headscale: false
  transformer_block_type: pre_ln
  openai_gelu: false
  normalize_attention_scores: true
  position_embedding_type: rope
  rotary_percentage: 1.0
  apply_rope_fusion: true
  cross_entropy_loss_fusion: true
  attention_type: multihead
  share_embeddings_and_output_weights: false
  scale_positional_embedding: true
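  # Assumption: `scale_positional_embedding: true` enables the Llama 3.1-style RoPE frequency
  # scaling used for long-context pretraining; earlier Llama 3.0 configs leave it false.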
  tokenizer:
    library: 'sentencepiece'
    type: null
    model: <path_to_my_model>/tokenizer.model
    delimiter: null
    vocab_file: null
    merge_file: null
    sentencepiece_legacy: False
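    # `<path_to_my_model>` is a placeholder: point `model` at your SentencePiece tokenizer.model
    # before launching; `vocab_file` and `merge_file` stay null for the sentencepiece library.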
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: true
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: true
  bias_activation_fusion: true
  bias_dropout_add_fusion: true
  masked_softmax_fusion: true
  seed: 1234
  resume_from_checkpoint: null
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  sync_batch_comm: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: false
  deterministic_mode: false
  ## Transformer Engine
  transformer_engine: true
  fp8: False # enables fp8 in TransformerLayer forward
  fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
  fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
  fp8_margin: 0 # scaling margin
  fp8_interval: 1 # scaling update interval
  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
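  # FP8 is fully disabled here, so the run trains in bf16 (trainer.precision) with megatron_amp_O2.
  # A common way to try FP8 on Hopper GPUs is to set `fp8: True` together with `fp8_hybrid: True`
  # (hybrid E4M3/E5M2 recipe); treat that as a suggestion to validate, not a tuned setting.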
  use_emha: False
  ub_tp_comm_overlap: False
  use_flash_attention: true
  optim:
    name: distributed_fused_adam
    lr: 1e-4
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    bucket_cap_mb: 125
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    contiguous_param_buffer: true
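    # distributed_fused_adam is NeMo's Apex-based distributed optimizer: optimizer state is sharded
    # across data-parallel ranks, and the overlap_*/contiguous_* flags overlap gradient and parameter
    # communication with compute (a brief description, not an exhaustive one).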
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 0
      min_lr: 1e-5
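      # Linear warmup over 500 steps to lr = 1e-4, then cosine decay toward min_lr = 1e-5 over the
      # remaining trainer.max_steps (no constant tail, since constant_steps = 0).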
  data:
    data_impl: mock
    splits_string: 99990,8,2
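    # Megatron-style split: train/validation/test in the ratio 99990:8:2 of the dataset
    # (proportions are normalized, so this is ~99.99% train).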
    seq_length: ${training.model.encoder_seq_length}
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: true
    reset_attention_mask: true
    eod_mask_loss: false
    index_mapping_dir: null
    data_prefix: []
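    # `data_impl: mock` with an empty `data_prefix` generates synthetic batches, which is useful for
    # benchmarking the cluster. A sketch of switching to real preprocessed data (the path below is
    # hypothetical; weights and prefixes must match your own Megatron .bin/.idx datasets):
    # data_impl: mmap
    # data_prefix:
    #   - 1.0
    #   - /fsx/datasets/my_corpus_text_document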