# pile-tiny.yaml
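# NOTE: the angle-bracketed values in this file (<run_name>, <train_dir>,
# <save_folder>, <Validation paths>) are placeholders and must be replaced with
# concrete values before the config is usable.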
run_name: <run_name>
seed: 6198
dry_run: false

#wandb:
#  name: ${run_name}
#  log_interval: 100

model:
  d_model: 512
  n_heads: 8
  n_layers: 8
  mlp_ratio: 8
  alibi: true
  alibi_bias_max: 8.0
  flash_attention: false
  attention_dropout: 0.0
  attention_layer_norm: false
  multi_query_attention: true
  block_type: sequential
  layer_norm_type: low_precision  # if not compiling, use 'low_precision'
  activation_type: gelu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 1024
  include_bias: true
  vocab_size: 50277
  embedding_size: 50304
  eos_token_id: 50276
  pad_token_id: 50276
  init_device: meta
  init_std: 0.02
#compile:
#  mode: default

optimizer:
  name: adamw
  learning_rate: 1.0e-3
  weight_decay: 0.01
  betas:
    - 0.9
    - 0.95

scheduler:
  name: cosine_with_warmup
  t_warmup: 4000
  alpha_f: 0.1
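# Assuming the usual OLMo scheduler semantics (t_warmup counted in steps, alpha_f the
# final learning rate as a fraction of the peak), the LR warms up over the first 4000
# of the 10000 training steps and then follows a cosine decay from 1.0e-3 to ~1.0e-4.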
data:
  paths: ${path.glob:<train_dir>/*.npy}
  pad_direction: right
  num_workers: 2
  drop_last: true
  pin_memory: true
  prefetch_factor: 4  # bump to 16 if on LUMI
  persistent_workers: true
  timeout: 0

tokenizer:
  identifier: EleutherAI/gpt-neox-20b
  truncate_direction: right
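# ${path.glob:...} is OmegaConf-style resolver syntax: at load time it is expected to
# expand the pattern into the list of pre-tokenized .npy token files under <train_dir>.
# The tokenizer section records which tokenizer those files were produced with.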
save_folder: <save_folder>/${run_name}
save_overwrite: true
# Sharded checkpoints (best for restarts)
save_interval: 10000000000
save_num_checkpoints_to_keep: 2
# Unsharded checkpoints (for final storage)
save_interval_unsharded: 1000
save_num_unsharded_checkpoints_to_keep: 2
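# With save_interval set far beyond the 10000-step run length, sharded checkpoints are
# effectively disabled here; only unsharded checkpoints are written (every 1000 steps,
# keeping the two most recent).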
load_path: null
max_duration: 10000  # steps; ~1.3B tokens at global batch 128 and sequence length 1024
global_train_batch_size: 128
device_train_microbatch_size: 8
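# Batch-size arithmetic: each optimizer step sees 128 sequences globally; with a
# per-device microbatch of 8, a single GPU would run 16 gradient-accumulation
# microbatches per step (proportionally fewer per GPU as more GPUs are used).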
precision: amp_bf16
max_grad_norm: 1.0
speed_monitor:
  window_size: 20
console_log_interval: 100
matformer_factor: 1
eval_interval: 100
eval_subset_num_batches: 10
device_eval_batch_size: ${device_train_microbatch_size}
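# Evaluation runs every 100 steps, capped at 10 batches per evaluator, and the eval
# batch size reuses device_train_microbatch_size (8) via interpolation.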
evaluators:
  ##########################
  # Perplexity evaluations #
  ##########################
  - label: pile-validation
    subset_num_batches: 10
    data:
      paths: [<Validation paths>]
      num_workers: 2
      drop_last: true
      pin_memory: true
      persistent_workers: true
      prefetch_factor: 4
  ##########################
  # Downstream evaluations #
  ##########################
  # - label: piqa
  #   type: downstream
  #
  # - label: hellaswag
  #   type: downstream
  #
  # - label: winogrande
  #   type: downstream
  #
  # - label: openbook_qa
  #   type: downstream
  #
  # # - label: boolq  # requires implementation of the pmi_dc matrix
  # #   type: downstream
  # #
  # - label: sciq
  #   type: downstream
  #
  # - label: arc_easy
  #   type: downstream
  #
  # # - label: arc_challenge  # requires implementation of the pmi_dc matrix
  # #   type: downstream
  # #
  # - label: copa
  #   type: downstream
  #
  # - label: rte
  #   type: downstream
  #
  # - label: commitment_bank
  #   type: downstream
  #
  # - label: mrpc
  #   type: downstream
  #
  # - label: sst2
  #   type: downstream
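# A typical launch sketch for OLMo-style training configs, assuming this fork keeps the
# upstream scripts/train.py entry point (adjust the config path and GPU count to your
# setup):
#
#   torchrun --nproc_per_node=8 scripts/train.py configs/pile-tiny.yaml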