Merge pull request #123 from huggingface/xrsrke/mu_transfer
[Feature] Spectral µTransfer
xrsrke authored Apr 16, 2024
2 parents e445950 + 074c1ad commit 450fb67
Showing 26 changed files with 1,129 additions and 144 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -162,4 +162,4 @@ cython_debug/
.vscode

checkpoints/
-wandb/*
+wandb/
5 changes: 5 additions & 0 deletions README.md
@@ -44,6 +44,7 @@ We support the following:
- ZeRO-1 optimizer
- FP32 gradient accumulation
- Parameter tying/sharding
+- Spectral µTransfer parametrization for scaling up neural networks

# Installation

@@ -111,6 +112,10 @@ Features we would like to add:
- `scripts/log_lighteval_to_wandb.py`: logs the evaluation results of LightEval to wandb, including summary statistics.


+# Environment Variables
+- `NANOTRON_BENCHMARK=1`: set this to log the throughput during training


# Credits

We would like to thank everyone working on LLMs, especially those sharing their work openly from which we took great inspiration: Nvidia for `Megatron-LM/apex`, Microsoft for `DeepSpeed`, HazyResearch for `flash-attn`
91 changes: 46 additions & 45 deletions examples/config_tiny_llama.yaml
@@ -1,34 +1,3 @@
-checkpoints:
-  checkpoint_interval: 10
-  checkpoints_path: /fsx/nouamane/projects/nanotron/checkpoints
-  checkpoints_path_is_shared_file_system: false
-  resume_checkpoint_path: null
-  save_initial_state: false
-data_stages:
-- data:
-    dataset:
-      dataset_overwrite_cache: false
-      dataset_processing_num_proc_per_process: 1
-      hf_dataset_config_name: null
-      hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
-      hf_dataset_splits: train
-      text_column_name: completion
-    num_loading_workers: 1
-    seed: 42
-  name: Stable Training Stage
-  start_training_step: 1
-- data:
-    dataset:
-      dataset_overwrite_cache: false
-      dataset_processing_num_proc_per_process: 1
-      hf_dataset_config_name: null
-      hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
-      hf_dataset_splits: train
-      text_column_name: completion
-    num_loading_workers: 1
-    seed: 42
-  name: Annealing Phase
-  start_training_step: 10
general:
benchmark_csv_path: null
consumed_train_samples: null
@@ -37,28 +6,24 @@ general:
run: tiny_llama_%date_%jobid
seed: 42
step: null
-lighteval: null
-logging:
-  iteration_step_info_interval: 1
-  log_level: info
-  log_level_replica: info
model:
ddp_bucket_cap_mb: 25
dtype: bfloat16
init_method:
std: 0.025
+    # use_mup: true # uncomment this and comment the std line above to use spectral µTransfer
make_vocab_size_divisible_by: 1
model_config:
bos_token_id: 1
eos_token_id: 2
hidden_act: silu
-    hidden_size: 16
+    hidden_size: 32
initializer_range: 0.02
-    intermediate_size: 64
+    intermediate_size: 128
is_llama_config: true
max_position_embeddings: 256
num_attention_heads: 4
-    num_hidden_layers: 2
+    num_hidden_layers: 10
num_key_value_heads: 4
pad_token_id: null
pretraining_tp: 1
@@ -74,15 +39,15 @@ optimizer:
adam_eps: 1.0e-08
clip_grad: 1.0
learning_rate_scheduler:
-    learning_rate: 0.0003
+    learning_rate: 0.001
lr_decay_starting_step: null
-    lr_decay_steps: 8
+    lr_decay_steps: null
lr_decay_style: cosine
-    lr_warmup_steps: 2
+    lr_warmup_steps: 2000 # 20% of the total steps
lr_warmup_style: linear
min_decay_lr: 1.0e-05
torch_adam_is_fused: true
-  weight_decay: 0.01
+  weight_decay: 0.1
zero_stage: 0
parallelism:
dp: 2
@@ -92,7 +57,32 @@ parallelism:
tp: 2
tp_linear_async_communication: true
tp_mode: REDUCE_SCATTER
-profiler: null
+data_stages:
+- name: Stable Training Stage
+  start_training_step: 1
+  data:
+    dataset:
+      dataset_overwrite_cache: false
+      dataset_processing_num_proc_per_process: 1
+      hf_dataset_config_name: null
+      hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
+      hf_dataset_splits: train
+      text_column_name: completion
+    num_loading_workers: 1
+    seed: 42
+- name: Annealing Phase
+  start_training_step: 10
+  data:
+    dataset:
+      dataset_overwrite_cache: false
+      dataset_processing_num_proc_per_process: 1
+      hf_dataset_config_name: null
+      hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
+      hf_dataset_splits: train
+      text_column_name: completion
+    num_loading_workers: 1
+    seed: 42
+lighteval: null
tokenizer:
tokenizer_max_length: null
tokenizer_name_or_path: gpt2
@@ -103,5 +93,16 @@ tokens:
limit_val_batches: 0
micro_batch_size: 2
sequence_length: 32
-  train_steps: 10
+  train_steps: 15
val_check_interval: -1
+checkpoints:
+  checkpoint_interval: 10
+  checkpoints_path: checkpoints
+  checkpoints_path_is_shared_file_system: false
+  resume_checkpoint_path: null
+  save_initial_state: false
+profiler: null
+logging:
+  iteration_step_info_interval: 1
+  log_level: info
+  log_level_replica: info
34 changes: 34 additions & 0 deletions examples/mup/README.md
@@ -0,0 +1,34 @@
OpenAI's scaling laws [[link]](https://arxiv.org/abs/2001.08361) from 2020 showed that scale is one of the core ingredients for the success of LLMs, but naively stacking more layers can lead to unstable training due to exploding or vanishing gradients. In our implementation, the experimental results show that on a 350M LLaMA, spectral µTransfer matches the pretraining performance of the baseline (albeit with a slightly higher training loss, by 0.04). In a separate MLP-only experiment, µTransfer maintains a consistent L1 norm of activations across widths and depths and allows scaling up to 2B parameters, while the standard-parametrization (SP) baseline blows up and becomes untrainable.


# How to use Spectral µTransfer
In your Nanotron configuration, simply set `use_mup` to `true`. Nanotron will automatically determine the right standard deviation and learning rate for each parameter (a sketch of the scaling rules is shown after the diff below).


```diff
model:
...
init_method:
- std: 0.025
+ use_mup: true
```
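
For intuition, here is a minimal sketch of the scaling rules behind this parametrization, assuming the prescription from the spectral µTransfer paper: initialize each weight matrix with standard deviation `min(1, sqrt(fan_out / fan_in)) / sqrt(fan_in)`, and scale the per-layer Adam learning rate by `fan_out / fan_in`. The helper names below are hypothetical and do not mirror Nanotron's internal API.

```python
import math

import torch
from torch import nn

def spectral_mup_init_std(fan_in: int, fan_out: int) -> float:
    # Spectral condition: ||W||_2 should scale as sqrt(fan_out / fan_in).
    # For iid Gaussian entries, ||W||_2 ~ std * (sqrt(fan_in) + sqrt(fan_out)),
    # which gives std = Theta(min(1, sqrt(fan_out / fan_in)) / sqrt(fan_in)).
    return min(1.0, math.sqrt(fan_out / fan_in)) / math.sqrt(fan_in)

def spectral_mup_lr(base_lr: float, fan_in: int, fan_out: int) -> float:
    # For Adam-style optimizers, the per-layer learning rate scales as fan_out / fan_in.
    return base_lr * fan_out / fan_in

# Example: one linear layer with its own init std and learning rate.
fan_in, fan_out = 1024, 4096
layer = nn.Linear(fan_in, fan_out)
nn.init.normal_(layer.weight, mean=0.0, std=spectral_mup_init_std(fan_in, fan_out))
nn.init.zeros_(layer.bias)

optimizer = torch.optim.AdamW(
    [{"params": layer.parameters(), "lr": spectral_mup_lr(1e-3, fan_in, fan_out)}],
    weight_decay=0.1,
)
```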

# MLP Only Experiment

We ran a systematic experiment varying the number of layers from 8 to 32, the width from 128 to 8192, and the batch size from 32 to 2048, all on a logarithmic scale, on the CIFAR dataset, training with an MSE objective for 4 epochs using the Adam optimizer. A toy version of the width-scaling check is sketched after the plots below. [[Experiment Report]](https://wandb.ai/neuralink/exp14_mup_grid_search/reports/-Spectral-Transfer-MLP-s-Experiment-Results--Vmlldzo3NDQ0NTQw?accessToken=xe0mkunx3y8t0xzbzxu9caqcre57or5la58d9o209hinanlmzoaj7es24m4elvdj)


![Scale across widths](./assets/scale-across-width.png)



![Scale across depths](./assets/scale-across-depth.png)
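
As a toy illustration of the width-scaling check (a sketch under our assumptions, not the actual experiment code), one can build MLPs of increasing width with the spectral initialization, feed them the same random batch standing in for CIFAR inputs, and compare the typical magnitude of the hidden activations:

```python
import math

import torch
from torch import nn

def spectral_init_mlp(in_dim: int, width: int, depth: int) -> nn.Sequential:
    # Hidden trunk only; every linear layer gets the spectral init
    # std = min(1, sqrt(fan_out / fan_in)) / sqrt(fan_in).
    dims = [in_dim] + [width] * depth
    layers = []
    for fan_in, fan_out in zip(dims[:-1], dims[1:]):
        linear = nn.Linear(fan_in, fan_out)
        std = min(1.0, math.sqrt(fan_out / fan_in)) / math.sqrt(fan_in)
        nn.init.normal_(linear.weight, mean=0.0, std=std)
        nn.init.zeros_(linear.bias)
        layers += [linear, nn.ReLU()]
    return nn.Sequential(*layers)

x = torch.randn(256, 128)  # random batch standing in for CIFAR features
for width in (128, 512, 2048, 8192):
    torch.manual_seed(0)
    h = spectral_init_mlp(128, width, depth=8)(x)
    print(f"width={width:5d}  mean |activation| = {h.abs().mean().item():.4f}")
```

In this toy, the mean absolute activation at initialization is essentially width-independent; the full experiment shows the same stability holding throughout training, while the SP baseline drifts.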


# On 350m LLaMA

We trained a 350M model with both spectral µTransfer and standard parametrization using Nanotron, with a global batch size of 1M tokens and a learning rate of 0.001. µTransfer matches the performance of standard parametrization, with a slightly higher training loss (by 0.04). [[Experiment Report]](https://api.wandb.ai/links/neuralink/i70nnpu9)

Please check the directory [[./examples/mup/configs]](/examples/mup/configs) for the configurations we used to reproduce the experiments.

![LLaMA](./assets/llama.png)
Binary file added examples/mup/assets/llama.png
Binary file added examples/mup/assets/scale-across-depth.png
Binary file added examples/mup/assets/scale-across-width.png
141 changes: 141 additions & 0 deletions examples/mup/configs/mup_350m_llama_config.yaml
@@ -0,0 +1,141 @@
checkpoints:
checkpoint_interval: 10000
checkpoints_path: checkpoints
checkpoints_path_is_shared_file_system: false
resume_checkpoint_path: null
save_initial_state: false

data_stages:
- name: Stable Training Stage
start_training_step: 1
data:
dataset:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
hf_dataset_or_datasets: roneneldan/TinyStories
hf_dataset_splits: train
text_column_name: text
num_loading_workers: 1
seed: 42
- name: Annealing Phase
start_training_step: 9000
data:
dataset:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
hf_dataset_splits: train
text_column_name: completion
num_loading_workers: 1
seed: 42

general:
benchmark_csv_path: null
consumed_train_samples: null
ignore_sanity_checks: true
project: debug
run: llama_350m_mup
seed: 42
step: null
logging:
iteration_step_info_interval: 1
log_level: debug
log_level_replica: info
model:
ddp_bucket_cap_mb: 120
dtype: bfloat16
init_method:
use_mup: true
make_vocab_size_divisible_by: 1
model_config:
bos_token_id: 1
eos_token_id: 2
hidden_act: silu
initializer_range: 0.02

hidden_size: 1024
intermediate_size: 4096
num_hidden_layers: 14

is_llama_config: true
max_position_embeddings: 1024
num_attention_heads: 8
num_key_value_heads: 4
pad_token_id: null
pretraining_tp: 1
rms_norm_eps: 1.0e-05
rope_scaling: null
tie_word_embeddings: false
use_cache: true
vocab_size: 49152
optimizer:
accumulate_grad_in_fp32: false
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.0e-08
clip_grad: 1.0
learning_rate_scheduler:
learning_rate: 0.001
lr_decay_starting_step: null
lr_decay_steps: null
lr_decay_style: cosine
lr_warmup_steps: 100 # 10% warm up of total training steps
lr_warmup_style: linear
min_decay_lr: 1.0e-05
torch_adam_is_fused: true
weight_decay: 0.1
zero_stage: 0
parallelism:
dp: 4
pp: 1
pp_engine: 1f1b
tp: 2
tp_linear_async_communication: true
tp_mode: REDUCE_SCATTER
profiler: null
tokenizer:
tokenizer_max_length: null
tokenizer_name_or_path: gpt2
tokenizer_revision: null
tokens:
batch_accumulation_per_replica: 8
limit_test_batches: 0
limit_val_batches: 0
micro_batch_size: 32
sequence_length: 1024
train_steps: 440
val_check_interval: -1
lighteval:
batch_size: 16
checkpoints_path: null
generation: null
logging:
hub_repo_details: null
hub_repo_results: null
# hub_repo_tensorboard: HuggingFaceBR4/fmom-mamba2
local_output_path: /fsx/phuc/new_workspace/experiments/mup_for_mamba2/test_mamba350M_tp4_917cfc66/logs
push_details_to_hub: null
push_results_to_hub: null
push_results_to_tensorboard: true
tensorboard_metric_prefix: e
parallelism:
dp: 2
expert_parallel_size: 1
pp: 1
pp_engine: 1f1b
tp: 2
tp_linear_async_communication: false
tp_mode: ALL_REDUCE
# slurm_script_dir: /fsx/phuc/new_workspace/experiments/mup_for_mamba2/test_mamba350M_tp4_917cfc66/lighteval/slurm_scripts
# slurm_template: /fsx/phuc/new_workspace/experiments/mup_for_mamba2/test_mamba350M_tp4_917cfc66/run_eval.slurm.jinja
tasks:
# custom_tasks: brrr.lighteval.custom_tasks
dataset_loading_processes: 8
max_samples: 1000
multichoice_continuations_start_space: null
no_multichoice_continuations_start_space: null
num_fewshot_seeds: null
tasks: early-signal
wandb: null