From 9578b69a6a0161cb2fe51612cc38b1d16eeb296f Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sun, 19 May 2024 09:16:09 +0000 Subject: [PATCH 01/11] Add HF Generation script --- generate_hf_predictions.py | 59 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 generate_hf_predictions.py diff --git a/generate_hf_predictions.py b/generate_hf_predictions.py new file mode 100644 index 00000000..5f7ec395 --- /dev/null +++ b/generate_hf_predictions.py @@ -0,0 +1,59 @@ +import argparse + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +TXT = "The prologue of Romeo and Juliet calls the title characters “star-crossed lovers”—and the stars do seem to conspire against these young lovers. Romeo is a Montague, and Juliet a Capulet. Their families are enmeshed in a feud, but the moment they meet—when Romeo and his friends attend a party at Juliets house in disguise—the two fall in love and quickly decide that they want to be married. A friar secretly marries them, hoping to end the feud. Romeo and his companions almost immediately encounter Juliets cousin Tybalt, who challenges Romeo. When Romeo refuses to fight, Romeos friend Mercutio accepts the challenge and is killed. Romeo then kills Tybalt and is banished. He spends that night with Juliet and then leaves for Mantua. Juliets father forces her into a marriage with Count Paris. To avoid this marriage, Juliet takes a potion, given her by the friar, that makes her appear dead. The friar will send Romeo word to be at her family tomb when she awakes. The plan goes awry, and Romeo learns instead that she is dead. In the tomb, Romeo kills himself. Juliet wakes, sees his body, and commits suicide. Their deaths appear finally to end the feud." +SEQ_LENGTH = 256 # For truncating the TXT if GPU can't fit too many tokens + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="HuggingFace Model") + group.add_argument( + "--pretrained-model-name-or-path", + type=str, + required=True, + help="A path to a directory containing model weights saved using save_pretrained() or the model id of a pretrained model hosted inside a model repo on the Hugging Face Hub", + ) + + args = parser.parse_args() + + return args + + +def main(args): + # TODO Refactor with HF pipeline or .generate()?
+ hf_model = ( + AutoModelForCausalLM.from_pretrained( + args.pretrained_model_name_or_path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + .to("cuda") + .eval() + ) + + tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) + tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to("cuda") + inputs_hf = tokens[:, :-1] + + with torch.no_grad(): + output_hf = hf_model(inputs_hf) + + predicted_tokens = [5, 27, 34] # Index of the predictions to compare across models + + for predicted_token in predicted_tokens: + next_tokens_hf = torch.softmax(output_hf.logits[0, predicted_token, :], -1) + hf_topk_next_tokens = torch.topk(next_tokens_hf, 10) + + print( + *[ + f"[HF Model] Next token: {idx.item()}, probability: {prob}" + for idx, prob in zip(hf_topk_next_tokens.indices, hf_topk_next_tokens.values) + ], + sep="\n", + ) + + +if __name__ == "__main__": + _args = get_args() + main(_args) From a7f918c35753e3e6f202a2babf3b3b0931e0bafa Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sun, 19 May 2024 09:36:30 +0000 Subject: [PATCH 02/11] Add Nanotron Generation Script --- generate_hf_predictions.py | 16 ++-- generate_nanotron_predictions.py | 122 +++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 6 deletions(-) create mode 100644 generate_nanotron_predictions.py diff --git a/generate_hf_predictions.py b/generate_hf_predictions.py index 5f7ec395..5fd5bc3f 100644 --- a/generate_hf_predictions.py +++ b/generate_hf_predictions.py @@ -1,4 +1,5 @@ import argparse +import os import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -24,7 +25,7 @@ def get_args(): def main(args): # TODO Refractor with HF pipeline or .generate()? - hf_model = ( + model = ( AutoModelForCausalLM.from_pretrained( args.pretrained_model_name_or_path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" ) @@ -34,21 +35,24 @@ def main(args): tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to("cuda") - inputs_hf = tokens[:, :-1] + inputs = tokens[:, :-1] with torch.no_grad(): - output_hf = hf_model(inputs_hf) + output = model(inputs) predicted_tokens = [5, 27, 34] # Index of the predictions to compare across models + term_cols = int(os.get_terminal_size().columns / 3) for predicted_token in predicted_tokens: - next_tokens_hf = torch.softmax(output_hf.logits[0, predicted_token, :], -1) - hf_topk_next_tokens = torch.topk(next_tokens_hf, 10) + + print("\n", "=" * term_cols, f"Predictions of token {predicted_token}", "=" * term_cols) + next_tokens = torch.softmax(output.logits[0, predicted_token, :], -1) + topk_next_tokens = torch.topk(next_tokens, 10) print( *[ f"[HF Model] Next token: {idx.item()}, probability: {prob}" - for idx, prob in zip(hf_topk_next_tokens.indices, hf_topk_next_tokens.values) + for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values) ], sep="\n", ) diff --git a/generate_nanotron_predictions.py b/generate_nanotron_predictions.py new file mode 100644 index 00000000..ff613f4d --- /dev/null +++ b/generate_nanotron_predictions.py @@ -0,0 +1,122 @@ +import argparse +import os + +import torch +from nanotron.config import Config, ParallelismArgs, get_config_from_file +from nanotron.models import build_model +from nanotron.models.llama import LlamaForTraining +from nanotron.parallel import ParallelContext +from 
nanotron.parallel.parameters import sanity_check +from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine +from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode +from nanotron.serialize import load_weights +from nanotron.trainer import mark_tied_parameters +from transformers import AutoTokenizer + +# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of parallelism +DP = 1 +PP = 1 + +TXT = "The prologue of Romeo and Juliet calls the title characters “star-crossed lovers”—and the stars do seem to conspire against these young lovers. Romeo is a Montague, and Juliet a Capulet. Their families are enmeshed in a feud, but the moment they meet—when Romeo and his friends attend a party at Juliets house in disguise—the two fall in love and quickly decide that they want to be married. A friar secretly marries them, hoping to end the feud. Romeo and his companions almost immediately encounter Juliets cousin Tybalt, who challenges Romeo. When Romeo refuses to fight, Romeos friend Mercutio accepts the challenge and is killed. Romeo then kills Tybalt and is banished. He spends that night with Juliet and then leaves for Mantua. Juliets father forces her into a marriage with Count Paris. To avoid this marriage, Juliet takes a potion, given her by the friar, that makes her appear dead. The friar will send Romeo word to be at her family tomb when she awakes. The plan goes awry, and Romeo learns instead that she is dead. In the tomb, Romeo kills himself. Juliet wakes, sees his body, and commits suicide. Their deaths appear finally to end the feud." +SEQ_LENGTH = 256 # For truncating the TXT if GPU can't fit too many tokens + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="Nanotron Model") + group.add_argument( + "--nanotron-checkpoint-path", + type=str, + required=True, + help="A path to a directory containing a Nanotron Checkpoint", + ) + + group = parser.add_argument_group(title="Nanotron Parallelism") + group.add_argument("--tp", type=int, required=True, help="Tensor Parallelism Degree of the Nanotron Checkpoint") + + group = parser.add_argument_group(title="Tokenizer") + group.add_argument( + "--tokenizer-name-or-path", + type=str, + required=True, + help="A path to a directory containing vocabulary files required by the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub.", + ) + + args = parser.parse_args() + + return args + + +def main(args): + + parallel_config = ParallelismArgs( + dp=DP, + pp=PP, + tp=args.tp, + pp_engine=AllForwardAllBackwardPipelineEngine(), + tp_mode=TensorParallelLinearMode.ALL_REDUCE, + tp_linear_async_communication=False, + ) + assert ( + parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE + and parallel_config.tp_linear_async_communication is False + ) + + parallel_context = ParallelContext( + data_parallel_size=parallel_config.dp, + pipeline_parallel_size=parallel_config.pp, + tensor_parallel_size=parallel_config.tp, + ) + + nanotron_config = get_config_from_file( + os.path.join(args.nanotron_checkpoint_path, "config.yaml"), config_class=Config, model_config_class=None + ) + + model = build_model( + model_builder=lambda: LlamaForTraining( + config=nanotron_config.model.model_config, + parallel_context=parallel_context, + parallel_config=parallel_config, + random_states=None, + ), + parallel_context=parallel_context, + dtype=torch.bfloat16, + device=torch.device("cuda"), # TODO Check with 
different parallelism + ) + + mark_tied_parameters(model=model, parallel_context=parallel_context) + sanity_check(root_module=model) + + # Load checkpoint directly in memory and then only keep the state dictionary + load_weights(model=model, parallel_context=parallel_context, root_folder=args.nanotron_checkpoint_path) + + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path) + tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to("cuda") + inputs = {"input_ids": tokens[:, :-1], "input_mask": torch.ones((1, SEQ_LENGTH), device="cuda")} + + model.eval() + + with torch.no_grad(): + output = model(inputs) + + predicted_tokens = [5, 27, 34] # Index of the predictions to compare across models + term_cols = int(os.get_terminal_size().columns / 3) + + for predicted_token in predicted_tokens: + + print("\n", "=" * term_cols, f"Predictions of token {predicted_token}", "=" * term_cols) + next_tokens = torch.softmax(output.transpose(0, 1)[0, predicted_token, :], -1) + topk_next_tokens = torch.topk(next_tokens, 10) + + print( + *[ + f"[Nanotron Model] Next token: {idx.item()}, probability: {prob}" + for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values) + ], + sep="\n", + ) + + +if __name__ == "__main__": + _args = get_args() + main(_args) From 7728482ff717470fdf544724b2fd595bc441c846 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sun, 19 May 2024 11:11:09 +0000 Subject: [PATCH 03/11] Add HF to Nanotron conversion script --- convert_hf_to_nanotron.py | 248 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 convert_hf_to_nanotron.py diff --git a/convert_hf_to_nanotron.py b/convert_hf_to_nanotron.py new file mode 100644 index 00000000..3b66cef4 --- /dev/null +++ b/convert_hf_to_nanotron.py @@ -0,0 +1,248 @@ +""" +torchrun --nproc-per-node 1 convert_hf_to_nanotron.py --tp 1 --nanotron-checkpoint-path n_c/second --pretrained-model-name-or-path /mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct +""" +import argparse +import json +from dataclasses import asdict +from pathlib import Path + +import torch +import yaml +from nanotron.config import Config, GeneralArgs, ModelArgs, ParallelismArgs, TokenizerArgs +from nanotron.config.models_config import ExistingCheckpointInit +from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron +from nanotron.models import build_model +from nanotron.models.llama import LlamaForTraining +from nanotron.parallel import ParallelContext +from nanotron.parallel.parameters import sanity_check +from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine +from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode +from nanotron.serialize import TrainingMetadata, save_meta, save_weights +from nanotron.serialize.metadata import DataStageMetadata +from nanotron.trainer import mark_tied_parameters +from transformers import AutoModelForCausalLM + +# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of parallelism +DP = 1 +PP = 1 + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="Nanotron Model") + group.add_argument( + "--nanotron-checkpoint-path", + type=str, + required=True, + help="A path to a directory to store the converted Nanotron Checkpoint", + ) + + group = parser.add_argument_group(title="Nanotron Parallelism") + group.add_argument("--tp", type=int, required=True, help="Tensor Parallelism Degree of the 
Nanotron Checkpoint") + + group = parser.add_argument_group(title="HuggingFace Model") + group.add_argument( + "--pretrained-model-name-or-path", + type=str, + required=True, + help="A path to a directory containing model weights saved using save_pretrained() or the model id of a pretrained model hosted inside a model repo on the Hugging Face Hub", + ) + + args = parser.parse_args() + + return args + + +def main(args): + # Load Llama3-8B HF model + hf_model = AutoModelForCausalLM.from_pretrained( + args.pretrained_model_name_or_path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ).to("cuda") + hf_config = hf_model.config + + # Set Nanotron LlamaConfig + nanotron_llama_config = LlamaConfigNanotron( + bos_token_id=hf_config.bos_token_id, + eos_token_id=hf_config.eos_token_id, + hidden_act=hf_config.hidden_act, + hidden_size=hf_config.hidden_size, + initializer_range=hf_config.initializer_range, + intermediate_size=hf_config.intermediate_size, + is_llama_config=True, + max_position_embeddings=hf_config.max_position_embeddings, + num_attention_heads=hf_config.num_attention_heads, + num_hidden_layers=hf_config.num_hidden_layers, + num_key_value_heads=hf_config.num_key_value_heads, + pad_token_id=None, + pretraining_tp=hf_config.pretraining_tp, + rms_norm_eps=hf_config.rms_norm_eps, + rope_scaling=hf_config.rope_scaling, + rope_theta=hf_config.rope_theta, + rope_interleaved=False, + tie_word_embeddings=hf_config.tie_word_embeddings, + use_cache=hf_config.use_cache, + vocab_size=hf_config.vocab_size, + ) + + # Init Llama3-8B Nanotron model + parallel_config = ParallelismArgs( + dp=DP, + pp=PP, + tp=args.tp, + pp_engine=AllForwardAllBackwardPipelineEngine(), + tp_mode=TensorParallelLinearMode.ALL_REDUCE, + tp_linear_async_communication=False, + ) + assert ( + parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE + and parallel_config.tp_linear_async_communication is False + ) + + parallel_context = ParallelContext( + data_parallel_size=parallel_config.dp, + pipeline_parallel_size=parallel_config.pp, + tensor_parallel_size=parallel_config.tp, + ) + + nanotron_model = build_model( + model_builder=lambda: LlamaForTraining( + config=nanotron_llama_config, + parallel_context=parallel_context, + parallel_config=parallel_config, + random_states=None, + ), + parallel_context=parallel_context, + dtype=torch.bfloat16, + device=torch.device("cuda"), + ) + + mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context) + sanity_check(root_module=nanotron_model) + + # Copy params from HF to Nanotron + # Token embeddings + assert ( + nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.shape + == hf_model.model.embed_tokens.weight.shape + ) + with torch.no_grad(): + nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.copy_( + hf_model.model.embed_tokens.weight + ) + + # Decoder layers + for i in range(nanotron_llama_config.num_hidden_layers): + # Input layer norm + assert ( + hf_model.model.layers[i].input_layernorm.weight.shape + == nanotron_model.model.decoder[i].pp_block.input_layernorm.weight.shape + ) + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.input_layernorm.weight.copy_( + hf_model.model.layers[i].input_layernorm.weight + ) + + # Self attn + ## QKV + tmp_qkv_proj = torch.cat( + [ + hf_model.model.layers[i].self_attn.q_proj.weight, + hf_model.model.layers[i].self_attn.k_proj.weight, + hf_model.model.layers[i].self_attn.v_proj.weight, + ], + dim=0, + ) + assert tmp_qkv_proj.shape == 
nanotron_model.model.decoder[i].pp_block.attn.qkv_proj.weight.shape + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.attn.qkv_proj.weight.copy_(tmp_qkv_proj) + + ## O + assert ( + hf_model.model.layers[i].self_attn.o_proj.weight.shape + == nanotron_model.model.decoder[i].pp_block.attn.o_proj.weight.shape + ) + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.attn.o_proj.weight.copy_( + hf_model.model.layers[i].self_attn.o_proj.weight + ) + + # MLP + ## Gate Up Proj + tmp_gate_up_proj = torch.cat( + [ + hf_model.model.layers[i].mlp.gate_proj.weight, + hf_model.model.layers[i].mlp.up_proj.weight, + ], + dim=0, + ) + + assert tmp_gate_up_proj.shape == nanotron_model.model.decoder[i].pp_block.mlp.gate_up_proj.weight.shape + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.mlp.gate_up_proj.weight.copy_(tmp_gate_up_proj) + + ## Down Proj + assert ( + hf_model.model.layers[i].mlp.down_proj.weight.shape + == nanotron_model.model.decoder[i].pp_block.mlp.down_proj.weight.shape + ) + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.mlp.down_proj.weight.copy_( + hf_model.model.layers[i].mlp.down_proj.weight + ) + + # Post attn layer norm + assert ( + hf_model.model.layers[i].post_attention_layernorm.weight.shape + == nanotron_model.model.decoder[i].pp_block.post_attention_layernorm.weight.shape + ) + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.post_attention_layernorm.weight.copy_( + hf_model.model.layers[i].post_attention_layernorm.weight + ) + + # Last layer norm + assert nanotron_model.model.final_layer_norm.pp_block.weight.shape == hf_model.model.norm.weight.shape + with torch.no_grad(): + nanotron_model.model.final_layer_norm.pp_block.weight.copy_(hf_model.model.norm.weight) + + # LM_Head + assert nanotron_model.model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape + with torch.no_grad(): + nanotron_model.model.lm_head.pp_block.weight.copy_(hf_model.lm_head.weight) + + # Store weights + nanotron_checkpoint_path = Path(args.nanotron_checkpoint_path) + save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=nanotron_checkpoint_path) + + # Store metadata + training_metadata = TrainingMetadata( + last_train_step=0, + consumed_train_samples=0, + data_stages=[DataStageMetadata(name="Empty", consumed_train_samples=0, start_training_step=0)], + ) + save_meta( + root_folder=nanotron_checkpoint_path, parallel_context=parallel_context, training_metadata=training_metadata + ) + + # Store Config and Model Config files + with open(nanotron_checkpoint_path / "config.yaml", "w") as f: + config = Config( + general=GeneralArgs(project="conversion", run="Llama3-8B"), + parallelism=parallel_config, + model=ModelArgs( + init_method=ExistingCheckpointInit(nanotron_checkpoint_path), + model_config=nanotron_llama_config, + ), + tokenizer=TokenizerArgs(args.pretrained_model_name_or_path), + ) + print("Saving config ...") + yaml.dump(config.as_dict(), f) + + with open(nanotron_checkpoint_path / "model_config.json", "w") as f: + print("Saving model config ...") + json.dump(asdict(nanotron_llama_config), f) + + +if __name__ == "__main__": + _args = get_args() + main(_args) From 4107ed4a2ce8cf7cf05aa27ad9201265421314cb Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sun, 19 May 2024 11:12:05 +0000 Subject: [PATCH 04/11] Add Nanotron to HF conversion script --- convert_nanotron_to_hf.py | 213 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644
convert_nanotron_to_hf.py diff --git a/convert_nanotron_to_hf.py b/convert_nanotron_to_hf.py new file mode 100644 index 00000000..363e0c51 --- /dev/null +++ b/convert_nanotron_to_hf.py @@ -0,0 +1,213 @@ +""" +torchrun --nproc-per-node 1 convert_nanotron_to_hf.py --tp 1 --nanotron-checkpoint-path n_c/second --hugging-face-checkpoint-path hf_c/second +""" +import argparse +import os +from dataclasses import asdict +from pathlib import Path + +import torch +from nanotron.config import Config, ParallelismArgs, get_config_from_file +from nanotron.models import build_model +from nanotron.models.llama import LlamaForTraining +from nanotron.parallel import ParallelContext +from nanotron.parallel.parameters import sanity_check +from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine +from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode +from nanotron.serialize import load_weights +from nanotron.trainer import mark_tied_parameters +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.models.llama import LlamaConfig as LlamaConfigHF + +# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of parallelism +DP = 1 +PP = 1 + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="Nanotron Model") + group.add_argument( + "--nanotron-checkpoint-path", + type=str, + required=True, + help="A path to a directory with a Nanotron Checkpoint", + ) + + group = parser.add_argument_group(title="Nanotron Parallelism") + group.add_argument("--tp", type=int, required=True, help="Tensor Parallelism Degree of the Nanotron Checkpoint") + + group = parser.add_argument_group(title="HuggingFace Model") + group.add_argument( + "--hugging-face-checkpoint-path", + type=str, + required=True, + help="A path to a directory to store the converted checkpoint", + ) + # TODO Add push to hub + + args = parser.parse_args() + + return args + + +def main(args): + # Load Nanotron checkpoint config + nanotron_config = get_config_from_file( + os.path.join(args.nanotron_checkpoint_path, "config.yaml"), config_class=Config, model_config_class=None + ) + nanotron_llama_config = nanotron_config.model.model_config + + # Init Llama3-8B Nanotron model + parallel_config = ParallelismArgs( + dp=DP, + pp=PP, + tp=args.tp, + pp_engine=AllForwardAllBackwardPipelineEngine(), + tp_mode=TensorParallelLinearMode.ALL_REDUCE, + tp_linear_async_communication=False, + ) + assert ( + parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE + and parallel_config.tp_linear_async_communication is False + ) + + parallel_context = ParallelContext( + data_parallel_size=parallel_config.dp, + pipeline_parallel_size=parallel_config.pp, + tensor_parallel_size=parallel_config.tp, + ) + + nanotron_model = build_model( + model_builder=lambda: LlamaForTraining( + config=nanotron_config.model.model_config, + parallel_context=parallel_context, + parallel_config=parallel_config, + random_states=None, + ), + parallel_context=parallel_context, + dtype=torch.bfloat16, + device=torch.device("cuda"), + ) + + mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context) + sanity_check(root_module=nanotron_model) + + # Load Nanotron Checkpoint + load_weights( + model=nanotron_model, parallel_context=parallel_context, root_folder=Path(args.nanotron_checkpoint_path) + ) + + # Build empty HF Model + ## TODO This takes pretty long time + hf_model = AutoModelForCausalLM.from_config( + 
config=LlamaConfigHF(**asdict(nanotron_llama_config)), + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + ).to("cuda") + + # Copy params from Nanotron to HF + # Token embeddings + assert ( + nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.shape + == hf_model.model.embed_tokens.weight.shape + ) + with torch.no_grad(): + hf_model.model.embed_tokens.weight.copy_( + nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight + ) + + # Decoder layers + for i in range(nanotron_config.model.model_config.num_hidden_layers): + # Input layer norm + assert ( + hf_model.model.layers[i].input_layernorm.weight.shape + == nanotron_model.model.decoder[i].pp_block.input_layernorm.weight.shape + ) + with torch.no_grad(): + hf_model.model.layers[i].input_layernorm.weight.copy_( + nanotron_model.model.decoder[i].pp_block.input_layernorm.weight + ) + + # Self attn + # Split Nanotrn qkv projection into q, k, v + q, k, v = torch.split( + nanotron_model.model.decoder[i].pp_block.attn.qkv_proj.weight, + [ + nanotron_llama_config.num_attention_heads * nanotron_model.model.decoder[i].pp_block.attn.d_qk, + nanotron_llama_config.num_key_value_heads * nanotron_model.model.decoder[i].pp_block.attn.d_qk, + nanotron_llama_config.num_key_value_heads * nanotron_model.model.decoder[i].pp_block.attn.d_qk, + ], + ) + assert q.shape == hf_model.model.layers[i].self_attn.q_proj.weight.shape + assert k.shape == hf_model.model.layers[i].self_attn.k_proj.weight.shape + assert v.shape == hf_model.model.layers[i].self_attn.v_proj.weight.shape + + with torch.no_grad(): + hf_model.model.layers[i].self_attn.q_proj.weight.copy_(q) + hf_model.model.layers[i].self_attn.k_proj.weight.copy_(k) + hf_model.model.layers[i].self_attn.v_proj.weight.copy_(v) + + ## O + assert ( + hf_model.model.layers[i].self_attn.o_proj.weight.shape + == nanotron_model.model.decoder[i].pp_block.attn.o_proj.weight.shape + ) + with torch.no_grad(): + hf_model.model.layers[i].self_attn.o_proj.weight.copy_( + nanotron_model.model.decoder[i].pp_block.attn.o_proj.weight + ) + + # MLP + ## Gate Up Proj + gate_proj, up_proj = torch.split( + nanotron_model.model.decoder[i].pp_block.mlp.gate_up_proj.weight, + split_size_or_sections=[nanotron_llama_config.intermediate_size, nanotron_llama_config.intermediate_size], + ) + assert gate_proj.shape == hf_model.model.layers[i].mlp.gate_proj.weight.shape + assert up_proj.shape == hf_model.model.layers[i].mlp.up_proj.weight.shape + + with torch.no_grad(): + hf_model.model.layers[i].mlp.gate_proj.weight.copy_(gate_proj) + hf_model.model.layers[i].mlp.up_proj.weight.copy_(up_proj) + + ## Down Proj + assert ( + hf_model.model.layers[i].mlp.down_proj.weight.shape + == nanotron_model.model.decoder[i].pp_block.mlp.down_proj.weight.shape + ) + with torch.no_grad(): + hf_model.model.layers[i].mlp.down_proj.weight.copy_( + nanotron_model.model.decoder[i].pp_block.mlp.down_proj.weight + ) + + # Post attn layer norm + assert ( + hf_model.model.layers[i].post_attention_layernorm.weight.shape + == nanotron_model.model.decoder[i].pp_block.post_attention_layernorm.weight.shape + ) + with torch.no_grad(): + hf_model.model.layers[i].post_attention_layernorm.weight.copy_( + nanotron_model.model.decoder[i].pp_block.post_attention_layernorm.weight + ) + + # Last layer norm + assert nanotron_model.model.final_layer_norm.pp_block.weight.shape == hf_model.model.norm.weight.shape + with torch.no_grad(): + 
hf_model.model.norm.weight.copy_(nanotron_model.model.final_layer_norm.pp_block.weight) + + # LM_Head + assert nanotron_model.model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape + with torch.no_grad(): + hf_model.lm_head.weight.copy_(nanotron_model.model.lm_head.pp_block.weight) + + # Store weights + hf_model.save_pretrained(args.hugging_face_checkpoint_path, from_pt=True) + # Store tokenizer + tokenizer = AutoTokenizer.from_pretrained(nanotron_config.tokenizer.tokenizer_name_or_path) + tokenizer.save_pretrained(args.hugging_face_checkpoint_path) + + +if __name__ == "__main__": + _args = get_args() + main(_args) From 2411d435ca37867fb6db7888b8e86035f753441c Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sun, 19 May 2024 11:15:27 +0000 Subject: [PATCH 05/11] Moved scripts to tools llama3 folder --- .../llama3/convert_hf_to_nanotron.py | 0 .../llama3/convert_nanotron_to_hf.py | 0 .../llama3/generate_hf_predictions.py | 3 +++ .../llama3/generate_nanotron_predictions.py | 8 ++++++-- 4 files changed, 9 insertions(+), 2 deletions(-) rename convert_hf_to_nanotron.py => tools/llama3/convert_hf_to_nanotron.py (100%) rename convert_nanotron_to_hf.py => tools/llama3/convert_nanotron_to_hf.py (100%) rename generate_hf_predictions.py => tools/llama3/generate_hf_predictions.py (94%) rename generate_nanotron_predictions.py => tools/llama3/generate_nanotron_predictions.py (94%) diff --git a/convert_hf_to_nanotron.py b/tools/llama3/convert_hf_to_nanotron.py similarity index 100% rename from convert_hf_to_nanotron.py rename to tools/llama3/convert_hf_to_nanotron.py diff --git a/convert_nanotron_to_hf.py b/tools/llama3/convert_nanotron_to_hf.py similarity index 100% rename from convert_nanotron_to_hf.py rename to tools/llama3/convert_nanotron_to_hf.py diff --git a/generate_hf_predictions.py b/tools/llama3/generate_hf_predictions.py similarity index 94% rename from generate_hf_predictions.py rename to tools/llama3/generate_hf_predictions.py index 5fd5bc3f..d484bff5 100644 --- a/generate_hf_predictions.py +++ b/tools/llama3/generate_hf_predictions.py @@ -1,3 +1,6 @@ +""" +torchrun --nproc-per-node 1 generate_hf_predictions.py --pretrained-model-name-or-path hf_c/second --tokenizer-name-or-path /mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct +""" import argparse import os diff --git a/generate_nanotron_predictions.py b/tools/llama3/generate_nanotron_predictions.py similarity index 94% rename from generate_nanotron_predictions.py rename to tools/llama3/generate_nanotron_predictions.py index ff613f4d..3671007d 100644 --- a/generate_nanotron_predictions.py +++ b/tools/llama3/generate_nanotron_predictions.py @@ -1,5 +1,9 @@ +""" +torchrun --nproc-per-node 1 generate_nanotron_predictions.py --tp 1 --nanotron-checkpoint-path n_c/second --tokenizer-name-or-path /mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct +""" import argparse import os +from pathlib import Path import torch from nanotron.config import Config, ParallelismArgs, get_config_from_file @@ -88,7 +92,7 @@ def main(args): sanity_check(root_module=model) # Load checkpoint directly in memory and then only keep the state dictionary - load_weights(model=model, parallel_context=parallel_context, root_folder=args.nanotron_checkpoint_path) + load_weights(model=model, parallel_context=parallel_context, root_folder=Path(args.nanotron_checkpoint_path)) tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path) tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 
1))["input_ids"].to("cuda") @@ -97,7 +101,7 @@ def main(args): model.eval() with torch.no_grad(): - output = model(inputs) + output = model.model(**inputs) predicted_tokens = [5, 27, 34] # Index of the predictions to compare across models term_cols = int(os.get_terminal_size().columns / 3) From 372fa02eae1e60c8c9c2fce83d939ded8aad90c4 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sun, 19 May 2024 11:18:28 +0000 Subject: [PATCH 06/11] Pushed FA2 mod and rope configs fix --- src/nanotron/config/models_config.py | 4 +++ src/nanotron/models/llama.py | 53 +++++++--------------------- 2 files changed, 17 insertions(+), 40 deletions(-) diff --git a/src/nanotron/config/models_config.py b/src/nanotron/config/models_config.py index ba4559cf..2630e1d6 100644 --- a/src/nanotron/config/models_config.py +++ b/src/nanotron/config/models_config.py @@ -47,6 +47,10 @@ class LlamaConfig: pretraining_tp: int = 1 rms_norm_eps: float = 1e-6 rope_scaling: Optional[dict] = None + rope_theta: float = 10000.0 + rope_interleaved: bool = ( + True # The default value has been True, but for loading Llama3 checkpoints you have to set it to False + ) tie_word_embeddings: bool = False use_cache: bool = True vocab_size: int = 32000 diff --git a/src/nanotron/models/llama.py b/src/nanotron/models/llama.py index 32aab9cd..2072a789 100644 --- a/src/nanotron/models/llama.py +++ b/src/nanotron/models/llama.py @@ -14,7 +14,7 @@ # limitations under the License. """PyTorch LLaMa model.""" -from typing import Dict, Optional, Union, List +from typing import Dict, Optional, Union import torch from torch import nn @@ -188,35 +188,21 @@ def __init__(self, config: LlamaConfig, parallel_config: Optional[ParallelismArg @checkpoint_method(attr_name="checkpoint_attention") def forward( self, - query_states: torch.Tensor, # [batch_size * q_length, n_local_q_heads, inner_dim] - key_states: torch.Tensor, # [batch_size * kv_length, n_local_kv_heads, inner_dim] - value_states: torch.Tensor, # [batch_size * kv_length, n_local_kv_heads, inner_dim] - q_sequence_mask: torch.Tensor, # torch.BoolTensor [batch_size, q_length] (can be broadcasted to that size) - kv_sequence_mask: torch.Tensor, # torch.BoolTensor [batch_size, kv_length] (can be broadcasted to that size) + query_states: torch.Tensor, # [batch_size, q_length, n_local_q_heads, inner_dim] + key_states: torch.Tensor, # [batch_size, kv_length, n_local_kv_heads, inner_dim] + value_states: torch.Tensor, # [batch_size, kv_length, n_local_kv_heads, inner_dim] ): - from flash_attn.flash_attn_interface import flash_attn_varlen_func - - # TODO @thomasw21: Compute once, instead of computing for each layers. - cu_seqlens_q = torch.zeros((q_sequence_mask.shape[0] + 1), dtype=torch.int32, device=query_states.device) - cu_seqlens_k = torch.zeros((kv_sequence_mask.shape[0] + 1), dtype=torch.int32, device=query_states.device) - torch.cumsum(q_sequence_mask.sum(-1, dtype=torch.int32), dim=0, dtype=torch.int32, out=cu_seqlens_q[1:]) - torch.cumsum(kv_sequence_mask.sum(-1, dtype=torch.int32), dim=0, dtype=torch.int32, out=cu_seqlens_k[1:]) - - # TODO(kunhao): flash attn's causal means that the query can only attend to the keys before it. This is not - # what we want if we are using kv cache. This is a hack as we always have q_length == 1 when using kv cache. 
- causal = False if q_sequence_mask.shape[1] == 1 else True + from flash_attn.flash_attn_interface import flash_attn_func # NOTE: this scale is for µTransfer, # in SP, we use sqrt(1/d_h) softmax_scale = 1 / query_states.shape[-1] if self.is_using_mup else None - attn_output = flash_attn_varlen_func( + # For now we are assuming that we use a causal mask. No magic here + causal = True + attn_output = flash_attn_func( q=query_states, k=key_states, v=value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=q_sequence_mask.shape[1], - max_seqlen_k=kv_sequence_mask.shape[1], dropout_p=0.0, softmax_scale=softmax_scale, causal=causal, @@ -323,7 +309,9 @@ def __init__( ) # NOTE: Only supported for training (TODO(fmom): position_ids not supported yet) - self.flash_rotary_embedding = FlashRotaryEmbedding(dim=self.d_qk, interleaved=True) + self.flash_rotary_embedding = FlashRotaryEmbedding( + dim=self.d_qk, interleaved=config.rope_interleaved, base=config.rope_theta + ) self.o_proj = TensorParallelRowLinear( config.num_attention_heads * self.d_qk, @@ -565,29 +553,14 @@ def forward( # [batch_size, seq_length, num_heads, d_qk] key_states, value_states = torch.split(key_value_states, 1, dim=2) - q_sequence_mask = sequence_mask - kv_sequence_mask = sequence_mask - kv_length = key_states.shape[1] - # [batch_size, seq_length, num_heads, d_qk] - # Shaping for use in `flash-attn` version of flash-attn: `flash_attn_unpadded_func` - query_states = query_states.view( - batch_size * q_length, self.n_local_q_heads, self.d_qk - ) # [batch_size * q_length, self.n_heads, d_qk] - - key_states = key_states.view( - batch_size * kv_length, self.n_local_kv_heads, self.d_qk - ) # [batch_size * kv_length, self.n_heads, d_qk] - value_states = value_states.view( - batch_size * kv_length, self.n_local_kv_heads, self.d_v - ) # [batch_size * kv_length, self.n_heads, d_v] + key_states = key_states.view(batch_size, kv_length, self.n_local_kv_heads, self.d_qk) + value_states = value_states.view(batch_size, kv_length, self.n_local_kv_heads, self.d_v) attention_output = self.attention( query_states=query_states, key_states=key_states, value_states=value_states, - q_sequence_mask=q_sequence_mask, - kv_sequence_mask=kv_sequence_mask, ) attention_output = ( From b348460911ca0c24f3f8b469b8222fb8732787c9 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sun, 19 May 2024 12:36:14 +0000 Subject: [PATCH 07/11] Added logging --- tools/llama3/convert_hf_to_nanotron.py | 82 +++++++++++++------ tools/llama3/convert_nanotron_to_hf.py | 44 +++++++--- tools/llama3/generate_hf_predictions.py | 20 +++-- tools/llama3/generate_nanotron_predictions.py | 27 +++--- 4 files changed, 108 insertions(+), 65 deletions(-) diff --git a/tools/llama3/convert_hf_to_nanotron.py b/tools/llama3/convert_hf_to_nanotron.py index 3b66cef4..15f42705 100644 --- a/tools/llama3/convert_hf_to_nanotron.py +++ b/tools/llama3/convert_hf_to_nanotron.py @@ -1,5 +1,5 @@ """ -torchrun --nproc-per-node 1 convert_hf_to_nanotron.py --tp 1 --nanotron-checkpoint-path n_c/second --pretrained-model-name-or-path /mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct +torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --tp 1 --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B --pretrained-model-name-or-path /mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct """ import argparse import json @@ -8,9 +8,11 @@ import torch import yaml +from nanotron import logging from nanotron.config import Config, GeneralArgs, ModelArgs,
ParallelismArgs, TokenizerArgs from nanotron.config.models_config import ExistingCheckpointInit from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron +from nanotron.logging import log_rank from nanotron.models import build_model from nanotron.models.llama import LlamaForTraining from nanotron.parallel import ParallelContext @@ -20,12 +22,17 @@ from nanotron.serialize import TrainingMetadata, save_meta, save_weights from nanotron.serialize.metadata import DataStageMetadata from nanotron.trainer import mark_tied_parameters -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, AutoTokenizer -# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of parallelism +logger = logging.get_logger(__name__) + +# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of model parallelism DP = 1 PP = 1 +DEVICE = torch.device("cuda") +TORCH_DTYPE = torch.bfloat16 + def get_args(): parser = argparse.ArgumentParser() @@ -54,10 +61,36 @@ def get_args(): def main(args): + # Init Nanotron Parallel Utilities + parallel_config = ParallelismArgs( + dp=DP, + pp=PP, + tp=args.tp, + pp_engine=AllForwardAllBackwardPipelineEngine(), + tp_mode=TensorParallelLinearMode.ALL_REDUCE, + tp_linear_async_communication=False, + ) + assert ( + parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE + and parallel_config.tp_linear_async_communication is False + ) + + parallel_context = ParallelContext( + data_parallel_size=parallel_config.dp, + pipeline_parallel_size=parallel_config.pp, + tensor_parallel_size=parallel_config.tp, + ) + # Load Llama3-8B HF model + log_rank( + f"Loading pretrained Llama3 Model: {args.pretrained_model_name_or_path}", + logger=logger, + level=logging.INFO, + rank=0, + ) hf_model = AutoModelForCausalLM.from_pretrained( - args.pretrained_model_name_or_path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" - ).to("cuda") + args.pretrained_model_name_or_path, torch_dtype=TORCH_DTYPE, attn_implementation="flash_attention_2" + ).to(DEVICE) hf_config = hf_model.config # Set Nanotron LlamaConfig @@ -85,25 +118,7 @@ def main(args): ) # Init Llama3-8B Nanotron model - parallel_config = ParallelismArgs( - dp=DP, - pp=PP, - tp=args.tp, - pp_engine=AllForwardAllBackwardPipelineEngine(), - tp_mode=TensorParallelLinearMode.ALL_REDUCE, - tp_linear_async_communication=False, - ) - assert ( - parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE - and parallel_config.tp_linear_async_communication is False - ) - - parallel_context = ParallelContext( - data_parallel_size=parallel_config.dp, - pipeline_parallel_size=parallel_config.pp, - tensor_parallel_size=parallel_config.tp, - ) - + log_rank("Init empty Nanotron Llama3 Model", logger=logger, level=logging.INFO, rank=0) nanotron_model = build_model( model_builder=lambda: LlamaForTraining( config=nanotron_llama_config, @@ -112,14 +127,15 @@ def main(args): random_states=None, ), parallel_context=parallel_context, - dtype=torch.bfloat16, - device=torch.device("cuda"), + dtype=TORCH_DTYPE, + device=DEVICE, ) mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context) sanity_check(root_module=nanotron_model) # Copy params from HF to Nanotron + log_rank("Copyng weights from HF model to Nanotron model...", logger=logger, level=logging.INFO, rank=0) # Token embeddings assert ( nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.shape @@ -210,11 +226,13 @@ def main(args): with torch.no_grad(): 
nanotron_model.model.lm_head.pp_block.weight.copy_(hf_model.lm_head.weight) + log_rank("Copied weights from HF model to Nanotron model!", logger=logger, level=logging.INFO, rank=0) # Store weights nanotron_checkpoint_path = Path(args.nanotron_checkpoint_path) save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=nanotron_checkpoint_path) # Store metadata + log_rank("Storing Nanotron model Configs and Metadata!", logger=logger, level=logging.INFO, rank=0) training_metadata = TrainingMetadata( last_train_step=0, consumed_train_samples=0, @@ -223,6 +241,9 @@ def main(args): save_meta( root_folder=nanotron_checkpoint_path, parallel_context=parallel_context, training_metadata=training_metadata ) + # Store Tokenizer into Nanotron Checkpoint folder + tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) + tokenizer.save_pretrained(nanotron_checkpoint_path) # Store Config and Model Config files with open(nanotron_checkpoint_path / "config.yaml", "w") as f: @@ -233,7 +254,7 @@ def main(args): init_method=ExistingCheckpointInit(nanotron_checkpoint_path), model_config=nanotron_llama_config, ), - tokenizer=TokenizerArgs(args.pretrained_model_name_or_path), + tokenizer=TokenizerArgs(nanotron_checkpoint_path), ) print("Saving config ...") yaml.dump(config.as_dict(), f) @@ -242,6 +263,13 @@ def main(args): print("Saving model config ...") json.dump(asdict(nanotron_llama_config), f) + log_rank( + f"Checkpoint conversion finished, check {args.nanotron_checkpoint_path}", + logger=logger, + level=logging.INFO, + rank=0, + ) + if __name__ == "__main__": _args = get_args() diff --git a/tools/llama3/convert_nanotron_to_hf.py b/tools/llama3/convert_nanotron_to_hf.py index 363e0c51..c0bb1b1b 100644 --- a/tools/llama3/convert_nanotron_to_hf.py +++ b/tools/llama3/convert_nanotron_to_hf.py @@ -1,5 +1,5 @@ """ -torchrun --nproc-per-node 1 convert_nanotron_to_hf.py --tp 1 --nanotron-checkpoint-path n_c/second --hugging-face-checkpoint-path hf_c/second +torchrun --nproc-per-node 1 tools/llama3/convert_nanotron_to_hf.py --tp 1 --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B --hugging-face-checkpoint-path hf_checkpoints/ConvertedNanotronLlama38B """ import argparse import os @@ -7,7 +7,9 @@ from pathlib import Path import torch +from nanotron import logging from nanotron.config import Config, ParallelismArgs, get_config_from_file +from nanotron.logging import log_rank from nanotron.models import build_model from nanotron.models.llama import LlamaForTraining from nanotron.parallel import ParallelContext @@ -19,10 +21,15 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.models.llama import LlamaConfig as LlamaConfigHF -# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of parallelism +logger = logging.get_logger(__name__) + +# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of model parallelism DP = 1 PP = 1 +DEVICE = torch.device("cuda") +TORCH_DTYPE = torch.bfloat16 + def get_args(): parser = argparse.ArgumentParser() @@ -52,13 +59,7 @@ def get_args(): def main(args): - # Load Nanotron checkpoint config - nanotron_config = get_config_from_file( - os.path.join(args.nanotron_checkpoint_path, "config.yaml"), config_class=Config, model_config_class=None - ) - nanotron_llama_config = nanotron_config.model.model_config - - # Init Llama3-8B Nanotron model + # Init Nanotron Parallel Utilities parallel_config = ParallelismArgs( dp=DP, pp=PP, @@ -78,6 +79,20 @@ def main(args): 
tensor_parallel_size=parallel_config.tp, ) + # Load Nanotron checkpoint config + log_rank( + f"Loading Nanotron checkpoint config file: {os.path.join(args.nanotron_checkpoint_path, 'config.yaml')}", + logger=logger, + level=logging.INFO, + rank=0, + ) + nanotron_config = get_config_from_file( + os.path.join(args.nanotron_checkpoint_path, "config.yaml"), config_class=Config, model_config_class=None + ) + nanotron_llama_config = nanotron_config.model.model_config + + # Init Llama3-8B Nanotron model + log_rank("Init empty Nanotron Llama3 Model", logger=logger, level=logging.INFO, rank=0) nanotron_model = build_model( model_builder=lambda: LlamaForTraining( config=nanotron_config.model.model_config, @@ -86,8 +101,8 @@ def main(args): random_states=None, ), parallel_context=parallel_context, - dtype=torch.bfloat16, - device=torch.device("cuda"), + dtype=TORCH_DTYPE, + device=DEVICE, ) mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context) @@ -102,11 +117,12 @@ def main(args): ## TODO This takes pretty long time hf_model = AutoModelForCausalLM.from_config( config=LlamaConfigHF(**asdict(nanotron_llama_config)), - torch_dtype=torch.bfloat16, + torch_dtype=TORCH_DTYPE, attn_implementation="flash_attention_2", - ).to("cuda") + ).to(DEVICE) # Copy params from Nanotron to HF + log_rank("Copyng weights from Nanotron model to HF model...", logger=logger, level=logging.INFO, rank=0) # Token embeddings assert ( nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.shape @@ -201,7 +217,9 @@ def main(args): with torch.no_grad(): hf_model.lm_head.weight.copy_(nanotron_model.model.lm_head.pp_block.weight) + log_rank("Copied weights from Nanotron model to HF model!", logger=logger, level=logging.INFO, rank=0) # Store weights + log_rank("Storing HF model Checkpoint and Tokenizer!", logger=logger, level=logging.INFO, rank=0) hf_model.save_pretrained(args.hugging_face_checkpoint_path, from_pt=True) # Store tokenizer tokenizer = AutoTokenizer.from_pretrained(nanotron_config.tokenizer.tokenizer_name_or_path) diff --git a/tools/llama3/generate_hf_predictions.py b/tools/llama3/generate_hf_predictions.py index d484bff5..12b52f2a 100644 --- a/tools/llama3/generate_hf_predictions.py +++ b/tools/llama3/generate_hf_predictions.py @@ -1,5 +1,5 @@ """ -torchrun --nproc-per-node 1 generate_hf_predictions.py --pretrained-model-name-or-path hf_c/second --tokenizer-name-or-path /mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct +torchrun --nproc-per-node 1 tools/llama3/generate_hf_predictions.py --pretrained-model-name-or-path hf_checkpoints/ConvertedNanotronLlama38B """ import argparse import os @@ -10,6 +10,9 @@ TXT = "The prologue of Romeo and Juliet calls the title characters “star-crossed lovers”—and the stars do seem to conspire against these young lovers. Romeo is a Montague, and Juliet a Capulet. Their families are enmeshed in a feud, but the moment they meet—when Romeo and his friends attend a party at Juliets house in disguise—the two fall in love and quickly decide that they want to be married. A friar secretly marries them, hoping to end the feud. Romeo and his companions almost immediately encounter Juliets cousin Tybalt, who challenges Romeo. When Romeo refuses to fight, Romeos friend Mercutio accepts the challenge and is killed. Romeo then kills Tybalt and is banished. He spends that night with Juliet and then leaves for Mantua. Juliets father forces her into a marriage with Count Paris. 
To avoid this marriage, Juliet takes a potion, given her by the friar, that makes her appear dead. The friar will send Romeo word to be at her family tomb when she awakes. The plan goes awry, and Romeo learns instead that she is dead. In the tomb, Romeo kills himself. Juliet wakes, sees his body, and commits suicide. Their deaths appear finally to end the feud." SEQ_LENGTH = 256 # For truncating the TXT if GPU can't fit too many tokens +DEVICE = torch.device("cuda") +TORCH_DTYPE = torch.bfloat16 + def get_args(): parser = argparse.ArgumentParser() @@ -28,16 +31,15 @@ def get_args(): def main(args): # TODO Refractor with HF pipeline or .generate()? - model = ( - AutoModelForCausalLM.from_pretrained( - args.pretrained_model_name_or_path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" - ) - .to("cuda") - .eval() - ) + model = AutoModelForCausalLM.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=TORCH_DTYPE, + attn_implementation="flash_attention_2", + device=DEVICE, + ).eval() tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) - tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to("cuda") + tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to(DEVICE) inputs = tokens[:, :-1] with torch.no_grad(): diff --git a/tools/llama3/generate_nanotron_predictions.py b/tools/llama3/generate_nanotron_predictions.py index 3671007d..dc77acc9 100644 --- a/tools/llama3/generate_nanotron_predictions.py +++ b/tools/llama3/generate_nanotron_predictions.py @@ -1,5 +1,5 @@ """ -torchrun --nproc-per-node 1 generate_nanotron_predictions.py --tp 1 --nanotron-checkpoint-path n_c/second --tokenizer-name-or-path /mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct +torchrun --nproc-per-node 1 tools/llama3/generate_nanotron_predictions.py --tp 1 --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B """ import argparse import os @@ -17,13 +17,16 @@ from nanotron.trainer import mark_tied_parameters from transformers import AutoTokenizer -# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of parallelism +# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of model parallelism DP = 1 PP = 1 TXT = "The prologue of Romeo and Juliet calls the title characters “star-crossed lovers”—and the stars do seem to conspire against these young lovers. Romeo is a Montague, and Juliet a Capulet. Their families are enmeshed in a feud, but the moment they meet—when Romeo and his friends attend a party at Juliets house in disguise—the two fall in love and quickly decide that they want to be married. A friar secretly marries them, hoping to end the feud. Romeo and his companions almost immediately encounter Juliets cousin Tybalt, who challenges Romeo. When Romeo refuses to fight, Romeos friend Mercutio accepts the challenge and is killed. Romeo then kills Tybalt and is banished. He spends that night with Juliet and then leaves for Mantua. Juliets father forces her into a marriage with Count Paris. To avoid this marriage, Juliet takes a potion, given her by the friar, that makes her appear dead. The friar will send Romeo word to be at her family tomb when she awakes. The plan goes awry, and Romeo learns instead that she is dead. In the tomb, Romeo kills himself. Juliet wakes, sees his body, and commits suicide. Their deaths appear finally to end the feud." 
SEQ_LENGTH = 256 # For truncating the TXT if GPU can't fit too many tokens +DEVICE = torch.device("cuda") +TORCH_DTYPE = torch.bfloat16 + def get_args(): parser = argparse.ArgumentParser() @@ -38,21 +41,13 @@ def get_args(): group = parser.add_argument_group(title="Nanotron Parallelism") group.add_argument("--tp", type=int, required=True, help="Tensor Parallelism Degree of the Nanotron Checkpoint") - group = parser.add_argument_group(title="Tokenizer") - group.add_argument( - "--tokenizer-name-or-path", - type=str, - required=True, - help="A path to a directory containing vocabulary files required by the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub.", - ) - args = parser.parse_args() return args def main(args): - + # Init Nanotron Parallel Utilities parallel_config = ParallelismArgs( dp=DP, pp=PP, @@ -84,8 +79,8 @@ def main(args): random_states=None, ), parallel_context=parallel_context, - dtype=torch.bfloat16, - device=torch.device("cuda"), # TODO Check with different parallelism + dtype=TORCH_DTYPE, + device=DEVICE, # TODO Check with different parallelism if cpu is available ) mark_tied_parameters(model=model, parallel_context=parallel_context) @@ -94,9 +89,9 @@ def main(args): # Load checkpoint directly in memory and then only keep the state dictionary load_weights(model=model, parallel_context=parallel_context, root_folder=Path(args.nanotron_checkpoint_path)) - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path) - tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to("cuda") - inputs = {"input_ids": tokens[:, :-1], "input_mask": torch.ones((1, SEQ_LENGTH), device="cuda")} + tokenizer = AutoTokenizer.from_pretrained(nanotron_config.tokenizer.tokenizer_name_or_path) + tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to(DEVICE) + inputs = {"input_ids": tokens[:, :-1], "input_mask": torch.ones((1, SEQ_LENGTH), device=DEVICE)} model.eval() From de81b53a27e89f3cf5361bfa3eda6fc5078bf9db Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sun, 19 May 2024 23:44:28 +0000 Subject: [PATCH 08/11] Cleaned scripts --- tools/llama3/convert_hf_to_nanotron.py | 53 ++++++---------- tools/llama3/convert_nanotron_to_hf.py | 53 ++++++++-------- tools/llama3/generate_hf_predictions.py | 23 +++++-- tools/llama3/generate_nanotron_predictions.py | 61 +++++++++++-------- 4 files changed, 98 insertions(+), 92 deletions(-) diff --git a/tools/llama3/convert_hf_to_nanotron.py b/tools/llama3/convert_hf_to_nanotron.py index 15f42705..4c185f01 100644 --- a/tools/llama3/convert_hf_to_nanotron.py +++ b/tools/llama3/convert_hf_to_nanotron.py @@ -1,5 +1,5 @@ """ -torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --tp 1 --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B --pretrained-model-name-or-path /mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct +torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct """ import argparse import json @@ -8,11 +8,9 @@ import torch import yaml -from nanotron import logging from nanotron.config import Config, GeneralArgs, ModelArgs, ParallelismArgs, TokenizerArgs from nanotron.config.models_config import ExistingCheckpointInit from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron -from nanotron.logging 
import log_rank from nanotron.models import build_model from nanotron.models.llama import LlamaForTraining from nanotron.parallel import ParallelContext @@ -22,15 +20,10 @@ from nanotron.serialize import TrainingMetadata, save_meta, save_weights from nanotron.serialize.metadata import DataStageMetadata from nanotron.trainer import mark_tied_parameters +from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer -logger = logging.get_logger(__name__) - -# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of model parallelism -DP = 1 -PP = 1 - -DEVICE = torch.device("cuda") +DEVICE = torch.device("cpu") TORCH_DTYPE = torch.bfloat16 @@ -44,9 +37,6 @@ def get_args(): help="A path to a directory to store the converted Nanotron Checkpoint", ) - group = parser.add_argument_group(title="Nanotron Parallelism") - group.add_argument("--tp", type=int, required=True, help="Tensor Parallelism Degree of the Nanotron Checkpoint") - group = parser.add_argument_group(title="HuggingFace Model") group.add_argument( "--pretrained-model-name-or-path", @@ -63,9 +53,9 @@ def get_args(): def main(args): # Init Nanotron Parallel Utilities parallel_config = ParallelismArgs( - dp=DP, - pp=PP, - tp=args.tp, + dp=1, + pp=1, + tp=1, pp_engine=AllForwardAllBackwardPipelineEngine(), tp_mode=TensorParallelLinearMode.ALL_REDUCE, tp_linear_async_communication=False, @@ -82,12 +72,7 @@ def main(args): ) # Load Llama3-8B HF model - log_rank( - f"Loading pretrained Llama3 Model: {args.pretrained_model_name_or_path}", - logger=logger, - level=logging.INFO, - rank=0, - ) + print(f"Loading pretrained Llama3 Model: {args.pretrained_model_name_or_path}") hf_model = AutoModelForCausalLM.from_pretrained( args.pretrained_model_name_or_path, torch_dtype=TORCH_DTYPE, attn_implementation="flash_attention_2" ).to(DEVICE) @@ -118,7 +103,7 @@ def main(args): ) # Init Llama3-8B Nanotron model - log_rank("Init empty Nanotron Llama3 Model", logger=logger, level=logging.INFO, rank=0) + print("Init empty Nanotron Llama3 Model") nanotron_model = build_model( model_builder=lambda: LlamaForTraining( config=nanotron_llama_config, @@ -135,8 +120,9 @@ def main(args): sanity_check(root_module=nanotron_model) # Copy params from HF to Nanotron - log_rank("Copyng weights from HF model to Nanotron model...", logger=logger, level=logging.INFO, rank=0) + print("Copyng weights from HF model to Nanotron model...") # Token embeddings + print("Copyng Token Embeddings...") assert ( nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.shape == hf_model.model.embed_tokens.weight.shape @@ -147,7 +133,11 @@ def main(args): ) # Decoder layers - for i in range(nanotron_llama_config.num_hidden_layers): + for i in tqdm( + range(nanotron_llama_config.num_hidden_layers), + desc="Copyng Hidden Layers", + total=nanotron_llama_config.num_hidden_layers, + ): # Input layer norm assert ( hf_model.model.layers[i].input_layernorm.weight.shape @@ -217,22 +207,24 @@ def main(args): ) # Last layer norm + print("Copyng Final Layer Norm...") assert nanotron_model.model.final_layer_norm.pp_block.weight.shape == hf_model.model.norm.weight.shape with torch.no_grad(): nanotron_model.model.final_layer_norm.pp_block.weight.copy_(hf_model.model.norm.weight) # LM_Head + print("Copyng LM Head...") assert nanotron_model.model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape with torch.no_grad(): nanotron_model.model.lm_head.pp_block.weight.copy_(hf_model.lm_head.weight) - log_rank("Copied weights from HF model to 
Nanotron model!", logger=logger, level=logging.INFO, rank=0) + print("Copied weights from HF model to Nanotron model!") # Store weights nanotron_checkpoint_path = Path(args.nanotron_checkpoint_path) save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=nanotron_checkpoint_path) # Store metadata - log_rank("Storing Nanotron model Configs and Metadata!", logger=logger, level=logging.INFO, rank=0) + print("Storing Nanotron model Configs and Metadata!") training_metadata = TrainingMetadata( last_train_step=0, consumed_train_samples=0, @@ -263,12 +255,7 @@ def main(args): print("Saving model config ...") json.dump(asdict(nanotron_llama_config), f) - log_rank( - f"Checkpoint conversion finished, check {args.nanotron_checkpoint_path}", - logger=logger, - level=logging.INFO, - rank=0, - ) + print(f"Checkpoint conversion finished, check {args.nanotron_checkpoint_path}") if __name__ == "__main__": diff --git a/tools/llama3/convert_nanotron_to_hf.py b/tools/llama3/convert_nanotron_to_hf.py index c0bb1b1b..cbcb73e6 100644 --- a/tools/llama3/convert_nanotron_to_hf.py +++ b/tools/llama3/convert_nanotron_to_hf.py @@ -1,5 +1,5 @@ """ -torchrun --nproc-per-node 1 tools/llama3/convert_nanotron_to_hf.py --tp 1 --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B --hugging-face-checkpoint-path hf_checkpoints/ConvertedNanotronLlama38B +torchrun --nproc-per-node 1 tools/llama3/convert_nanotron_to_hf.py --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B --hugging-face-checkpoint-path hf_checkpoints/ConvertedNanotronLlama38B """ import argparse import os @@ -7,9 +7,7 @@ from pathlib import Path import torch -from nanotron import logging from nanotron.config import Config, ParallelismArgs, get_config_from_file -from nanotron.logging import log_rank from nanotron.models import build_model from nanotron.models.llama import LlamaForTraining from nanotron.parallel import ParallelContext @@ -18,16 +16,11 @@ from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode from nanotron.serialize import load_weights from nanotron.trainer import mark_tied_parameters +from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.models.llama import LlamaConfig as LlamaConfigHF -logger = logging.get_logger(__name__) - -# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of model parallelism -DP = 1 -PP = 1 - -DEVICE = torch.device("cuda") +DEVICE = torch.device("cpu") TORCH_DTYPE = torch.bfloat16 @@ -41,9 +34,6 @@ def get_args(): help="A path to a directory with a Nanotron Checkpoint", ) - group = parser.add_argument_group(title="Nanotron Parallelism") - group.add_argument("--tp", type=int, required=True, help="Tensor Parallelism Degree of the Nanotron Checkpoint") - group = parser.add_argument_group(title="HuggingFace Model") group.add_argument( "--hugging-face-checkpoint-path", @@ -61,9 +51,9 @@ def get_args(): def main(args): # Init Nanotron Parallel Utilities parallel_config = ParallelismArgs( - dp=DP, - pp=PP, - tp=args.tp, + dp=1, + pp=1, + tp=1, pp_engine=AllForwardAllBackwardPipelineEngine(), tp_mode=TensorParallelLinearMode.ALL_REDUCE, tp_linear_async_communication=False, @@ -80,19 +70,14 @@ def main(args): ) # Load Nanotron checkpoint config - log_rank( - f"Loading Nanotron checkpoint config file: {os.path.join(args.nanotron_checkpoint_path, 'config.yaml')}", - logger=logger, - level=logging.INFO, - rank=0, - ) + print(f"Loading Nanotron checkpoint config file: 
{os.path.join(args.nanotron_checkpoint_path, 'config.yaml')}") nanotron_config = get_config_from_file( os.path.join(args.nanotron_checkpoint_path, "config.yaml"), config_class=Config, model_config_class=None ) nanotron_llama_config = nanotron_config.model.model_config # Init Llama3-8B Nanotron model - log_rank("Init empty Nanotron Llama3 Model", logger=logger, level=logging.INFO, rank=0) + print("Init empty Nanotron Llama3 Model") nanotron_model = build_model( model_builder=lambda: LlamaForTraining( config=nanotron_config.model.model_config, @@ -109,21 +94,23 @@ def main(args): sanity_check(root_module=nanotron_model) # Load Nanotron Checkpoint + print("Loading Nanotron Llama3 Model...") load_weights( model=nanotron_model, parallel_context=parallel_context, root_folder=Path(args.nanotron_checkpoint_path) ) # Build empty HF Model - ## TODO This takes pretty long time - hf_model = AutoModelForCausalLM.from_config( + print("Init empty HF Llama3 Model") + hf_model = AutoModelForCausalLM.from_config( # WARN This takes a long time config=LlamaConfigHF(**asdict(nanotron_llama_config)), torch_dtype=TORCH_DTYPE, attn_implementation="flash_attention_2", ).to(DEVICE) # Copy params from Nanotron to HF - log_rank("Copyng weights from Nanotron model to HF model...", logger=logger, level=logging.INFO, rank=0) + print("Copyng weights from Nanotron model to HF model...") # Token embeddings + print("Copyng Token Embeddings...") assert ( nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.shape == hf_model.model.embed_tokens.weight.shape @@ -134,7 +121,11 @@ def main(args): ) # Decoder layers - for i in range(nanotron_config.model.model_config.num_hidden_layers): + for i in tqdm( + range(nanotron_llama_config.num_hidden_layers), + desc="Copyng Hidden Layers", + total=nanotron_llama_config.num_hidden_layers, + ): # Input layer norm assert ( hf_model.model.layers[i].input_layernorm.weight.shape @@ -208,23 +199,27 @@ def main(args): ) # Last layer norm + print("Copyng Final Layer Norm...") assert nanotron_model.model.final_layer_norm.pp_block.weight.shape == hf_model.model.norm.weight.shape with torch.no_grad(): hf_model.model.norm.weight.copy_(nanotron_model.model.final_layer_norm.pp_block.weight) # LM_Head + print("Copyng LM Head...") assert nanotron_model.model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape with torch.no_grad(): hf_model.lm_head.weight.copy_(nanotron_model.model.lm_head.pp_block.weight) - log_rank("Copied weights from Nanotron model to HF model!", logger=logger, level=logging.INFO, rank=0) + print("Copied weights from Nanotron model to HF model!") # Store weights - log_rank("Storing HF model Checkpoint and Tokenizer!", logger=logger, level=logging.INFO, rank=0) + print("Storing HF model Checkpoint and Tokenizer!") hf_model.save_pretrained(args.hugging_face_checkpoint_path, from_pt=True) # Store tokenizer tokenizer = AutoTokenizer.from_pretrained(nanotron_config.tokenizer.tokenizer_name_or_path) tokenizer.save_pretrained(args.hugging_face_checkpoint_path) + print(f"Checkpoint conversion finished, check {args.hugging_face_checkpoint_path}") + if __name__ == "__main__": _args = get_args() diff --git a/tools/llama3/generate_hf_predictions.py b/tools/llama3/generate_hf_predictions.py index 12b52f2a..b16774a4 100644 --- a/tools/llama3/generate_hf_predictions.py +++ b/tools/llama3/generate_hf_predictions.py @@ -1,14 +1,16 @@ """ -torchrun --nproc-per-node 1 tools/llama3/generate_hf_predictions.py --pretrained-model-name-or-path 
hf_checkpoints/ConvertedNanotronLlama38B +torchrun --nproc-per-node 1 tools/llama3/generate_hf_predictions.py --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct """ import argparse import os +import numpy as np import torch +from sklearn.metrics import accuracy_score from transformers import AutoModelForCausalLM, AutoTokenizer -TXT = "The prologue of Romeo and Juliet calls the title characters “star-crossed lovers”—and the stars do seem to conspire against these young lovers. Romeo is a Montague, and Juliet a Capulet. Their families are enmeshed in a feud, but the moment they meet—when Romeo and his friends attend a party at Juliets house in disguise—the two fall in love and quickly decide that they want to be married. A friar secretly marries them, hoping to end the feud. Romeo and his companions almost immediately encounter Juliets cousin Tybalt, who challenges Romeo. When Romeo refuses to fight, Romeos friend Mercutio accepts the challenge and is killed. Romeo then kills Tybalt and is banished. He spends that night with Juliet and then leaves for Mantua. Juliets father forces her into a marriage with Count Paris. To avoid this marriage, Juliet takes a potion, given her by the friar, that makes her appear dead. The friar will send Romeo word to be at her family tomb when she awakes. The plan goes awry, and Romeo learns instead that she is dead. In the tomb, Romeo kills himself. Juliet wakes, sees his body, and commits suicide. Their deaths appear finally to end the feud." -SEQ_LENGTH = 256 # For truncating the TXT if GPU can't fit too many tokens +TXT = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello! Which is the capital of France? What can I visit over there if I go for a week vacation?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBonjour! The capital of France is Paris, also known as the City of Light. Paris is a stunning city with a rich history, art, fashion, and cuisine. If you're planning a week-long vacation in Paris, you'll have plenty of time to explore its iconic landmarks, museums, and neighborhoods. Here's a suggested itinerary to get you started: Day 1-2: Iconic Landmarks The Eiffel Tower (Tour Eiffel): The iron lady offers breathtaking views of the city. You can take the stairs or elevator to the top. The Louvre Museum (Musée du Louvre): Home to the Mona Lisa, Venus de Milo, and many other famous artworks. Arc de Triomphe: A monumental arch honoring the soldiers who fought and died for France. Champs-Élysées: A famous avenue lined with cafes, shops, and theaters. Day 3: Montmartre and Sacré-Cœur Explore the charming neighborhood of Montmartre, known for its bohemian vibe, street artists, and stunning views. Visit the Basilique du Sacré-Cœur, a beautiful white church perched on a hill." +SEQ_LENGTH = 512 # For truncating the TXT if GPU can't fit too many tokens DEVICE = torch.device("cuda") TORCH_DTYPE = torch.bfloat16 @@ -30,12 +32,12 @@ def get_args(): def main(args): - # TODO Refractor with HF pipeline or .generate()? 
+ model = AutoModelForCausalLM.from_pretrained( args.pretrained_model_name_or_path, torch_dtype=TORCH_DTYPE, attn_implementation="flash_attention_2", - device=DEVICE, + device_map="auto", ).eval() tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) @@ -62,6 +64,17 @@ def main(args): sep="\n", ) + # Compute accuracy + predictions = np.argmax(output.logits.cpu(), axis=2).flatten().tolist() + labels = tokens.cpu().flatten()[1:].tolist() + print(f"\nAccuracy: {accuracy_score(labels, predictions)}") + # Results + ## [TP=1] HF 8B: 0.8308823529411765 + ## [TP=2]HF 70B: 0.8860294117647058 + ## [TP=1] HF -> Nanotron -> HF 8B: 0.8308823529411765 + ## [TP=2] HF -> Nanotron -> HF 70B: 0.8860294117647058 + ## [TP=1 --> TP=2] HF -> Nanotron -> Dummy Finetune to change TP=2 -> HF 8B: 0.8308823529411765 + if __name__ == "__main__": _args = get_args() diff --git a/tools/llama3/generate_nanotron_predictions.py b/tools/llama3/generate_nanotron_predictions.py index dc77acc9..fbede799 100644 --- a/tools/llama3/generate_nanotron_predictions.py +++ b/tools/llama3/generate_nanotron_predictions.py @@ -1,10 +1,12 @@ """ -torchrun --nproc-per-node 1 tools/llama3/generate_nanotron_predictions.py --tp 1 --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B +torchrun --nproc-per-node 2 tools/llama3/generate_nanotron_predictions.py --tp 2 --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B """ import argparse import os from pathlib import Path +import nanotron.distributed as dist +import numpy as np import torch from nanotron.config import Config, ParallelismArgs, get_config_from_file from nanotron.models import build_model @@ -15,14 +17,11 @@ from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode from nanotron.serialize import load_weights from nanotron.trainer import mark_tied_parameters +from sklearn.metrics import accuracy_score from transformers import AutoTokenizer -# TODO Currentyly just sopporting Llama8B that doesn't needs any kind of model parallelism -DP = 1 -PP = 1 - -TXT = "The prologue of Romeo and Juliet calls the title characters “star-crossed lovers”—and the stars do seem to conspire against these young lovers. Romeo is a Montague, and Juliet a Capulet. Their families are enmeshed in a feud, but the moment they meet—when Romeo and his friends attend a party at Juliets house in disguise—the two fall in love and quickly decide that they want to be married. A friar secretly marries them, hoping to end the feud. Romeo and his companions almost immediately encounter Juliets cousin Tybalt, who challenges Romeo. When Romeo refuses to fight, Romeos friend Mercutio accepts the challenge and is killed. Romeo then kills Tybalt and is banished. He spends that night with Juliet and then leaves for Mantua. Juliets father forces her into a marriage with Count Paris. To avoid this marriage, Juliet takes a potion, given her by the friar, that makes her appear dead. The friar will send Romeo word to be at her family tomb when she awakes. The plan goes awry, and Romeo learns instead that she is dead. In the tomb, Romeo kills himself. Juliet wakes, sees his body, and commits suicide. Their deaths appear finally to end the feud." -SEQ_LENGTH = 256 # For truncating the TXT if GPU can't fit too many tokens +TXT = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello! Which is the capital of France? What can I visit over there if I go for a week vacation?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBonjour! 
The capital of France is Paris, also known as the City of Light. Paris is a stunning city with a rich history, art, fashion, and cuisine. If you're planning a week-long vacation in Paris, you'll have plenty of time to explore its iconic landmarks, museums, and neighborhoods. Here's a suggested itinerary to get you started: Day 1-2: Iconic Landmarks The Eiffel Tower (Tour Eiffel): The iron lady offers breathtaking views of the city. You can take the stairs or elevator to the top. The Louvre Museum (Musée du Louvre): Home to the Mona Lisa, Venus de Milo, and many other famous artworks. Arc de Triomphe: A monumental arch honoring the soldiers who fought and died for France. Champs-Élysées: A famous avenue lined with cafes, shops, and theaters. Day 3: Montmartre and Sacré-Cœur Explore the charming neighborhood of Montmartre, known for its bohemian vibe, street artists, and stunning views. Visit the Basilique du Sacré-Cœur, a beautiful white church perched on a hill." +SEQ_LENGTH = 512 # For truncating the TXT if GPU can't fit too many tokens DEVICE = torch.device("cuda") TORCH_DTYPE = torch.bfloat16 @@ -49,8 +48,8 @@ def get_args(): def main(args): # Init Nanotron Parallel Utilities parallel_config = ParallelismArgs( - dp=DP, - pp=PP, + dp=1, + pp=1, tp=args.tp, pp_engine=AllForwardAllBackwardPipelineEngine(), tp_mode=TensorParallelLinearMode.ALL_REDUCE, @@ -67,6 +66,8 @@ def main(args): tensor_parallel_size=parallel_config.tp, ) + RANK = dist.get_rank(parallel_context.world_pg) + nanotron_config = get_config_from_file( os.path.join(args.nanotron_checkpoint_path, "config.yaml"), config_class=Config, model_config_class=None ) @@ -98,22 +99,32 @@ def main(args): with torch.no_grad(): output = model.model(**inputs) - predicted_tokens = [5, 27, 34] # Index of the predictions to compare across models - term_cols = int(os.get_terminal_size().columns / 3) - - for predicted_token in predicted_tokens: - - print("\n", "=" * term_cols, f"Predictions of token {predicted_token}", "=" * term_cols) - next_tokens = torch.softmax(output.transpose(0, 1)[0, predicted_token, :], -1) - topk_next_tokens = torch.topk(next_tokens, 10) - - print( - *[ - f"[Nanotron Model] Next token: {idx.item()}, probability: {prob}" - for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values) - ], - sep="\n", - ) + if not RANK: + predicted_tokens = [5, 27, 34] # Index of the predictions to compare across models + term_cols = int(os.get_terminal_size().columns / 3) + + for predicted_token in predicted_tokens: + + print("\n", "=" * term_cols, f"Predictions of token {predicted_token}", "=" * term_cols) + next_tokens = torch.softmax(output.transpose(0, 1)[0, predicted_token, :], -1) + topk_next_tokens = torch.topk(next_tokens, 10) + + print( + *[ + f"[Nanotron Model] Next token: {idx.item()}, probability: {prob}" + for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values) + ], + sep="\n", + ) + + # Compute accuracy + predictions = np.argmax(output.transpose(0, 1).cpu(), axis=2).flatten().tolist() + labels = tokens.cpu().flatten()[1:].tolist() + print(f"\nAccuracy: {accuracy_score(labels, predictions)}") + # Results + ## Nanotron 8B, TP 1: 0.8272058823529411 + ## Nanotron 8B, TP 2: 0.7720588235294118 + ## Nanotron 70B, TP 2: 0.8272058823529411 if __name__ == "__main__": From a28c53289950adcaa0f6fe2914c762921929d66e Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 22 May 2024 13:01:01 +0000 Subject: [PATCH 09/11] Added Nanotron logging --- tools/llama3/convert_hf_to_nanotron.py | 57 
+++++++++++++------------ tools/llama3/convert_nanotron_to_hf.py | 59 ++++++++++++++------------ 2 files changed, 61 insertions(+), 55 deletions(-) diff --git a/tools/llama3/convert_hf_to_nanotron.py b/tools/llama3/convert_hf_to_nanotron.py index 4c185f01..0032bf9a 100644 --- a/tools/llama3/convert_hf_to_nanotron.py +++ b/tools/llama3/convert_hf_to_nanotron.py @@ -8,21 +8,23 @@ import torch import yaml -from nanotron.config import Config, GeneralArgs, ModelArgs, ParallelismArgs, TokenizerArgs +from nanotron import logging +from nanotron.config import Config, GeneralArgs, LoggingArgs, ModelArgs, ParallelismArgs, TokenizerArgs from nanotron.config.models_config import ExistingCheckpointInit from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron +from nanotron.logging import log_rank, set_ranks_logging_level from nanotron.models import build_model from nanotron.models.llama import LlamaForTraining from nanotron.parallel import ParallelContext from nanotron.parallel.parameters import sanity_check -from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine -from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode from nanotron.serialize import TrainingMetadata, save_meta, save_weights from nanotron.serialize.metadata import DataStageMetadata from nanotron.trainer import mark_tied_parameters from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer +logger = logging.get_logger(__name__) + DEVICE = torch.device("cpu") TORCH_DTYPE = torch.bfloat16 @@ -52,18 +54,7 @@ def get_args(): def main(args): # Init Nanotron Parallel Utilities - parallel_config = ParallelismArgs( - dp=1, - pp=1, - tp=1, - pp_engine=AllForwardAllBackwardPipelineEngine(), - tp_mode=TensorParallelLinearMode.ALL_REDUCE, - tp_linear_async_communication=False, - ) - assert ( - parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE - and parallel_config.tp_linear_async_communication is False - ) + parallel_config = ParallelismArgs(dp=1, pp=1, tp=1) parallel_context = ParallelContext( data_parallel_size=parallel_config.dp, @@ -71,8 +62,15 @@ def main(args): tensor_parallel_size=parallel_config.tp, ) + set_ranks_logging_level(parallel_context=parallel_context, logging_config=LoggingArgs()) + # Load Llama3-8B HF model - print(f"Loading pretrained Llama3 Model: {args.pretrained_model_name_or_path}") + log_rank( + f"Loading pretrained Llama3 Model: {args.pretrained_model_name_or_path}", + logger=logger, + level=logging.INFO, + rank=0, + ) hf_model = AutoModelForCausalLM.from_pretrained( args.pretrained_model_name_or_path, torch_dtype=TORCH_DTYPE, attn_implementation="flash_attention_2" ).to(DEVICE) @@ -103,7 +101,7 @@ def main(args): ) # Init Llama3-8B Nanotron model - print("Init empty Nanotron Llama3 Model") + log_rank("Init empty Nanotron Llama3 Model", logger=logger, level=logging.INFO, rank=0) nanotron_model = build_model( model_builder=lambda: LlamaForTraining( config=nanotron_llama_config, @@ -120,9 +118,9 @@ def main(args): sanity_check(root_module=nanotron_model) # Copy params from HF to Nanotron - print("Copyng weights from HF model to Nanotron model...") + log_rank("Copying weights from HF model to Nanotron model...", logger=logger, level=logging.INFO, rank=0) # Token embeddings - print("Copyng Token Embeddings...") + log_rank("Copying Token Embeddings...", logger=logger, level=logging.INFO, rank=0) assert ( nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.shape == 
hf_model.model.embed_tokens.weight.shape @@ -135,7 +133,7 @@ def main(args): # Decoder layers for i in tqdm( range(nanotron_llama_config.num_hidden_layers), - desc="Copyng Hidden Layers", + desc="Copying Hidden Layers", total=nanotron_llama_config.num_hidden_layers, ): # Input layer norm @@ -207,24 +205,24 @@ def main(args): ) # Last layer norm - print("Copyng Final Layer Norm...") + log_rank("Copying Final Layer Norm...", logger=logger, level=logging.INFO, rank=0) assert nanotron_model.model.final_layer_norm.pp_block.weight.shape == hf_model.model.norm.weight.shape with torch.no_grad(): nanotron_model.model.final_layer_norm.pp_block.weight.copy_(hf_model.model.norm.weight) # LM_Head - print("Copyng LM Head...") + log_rank("Copying LM Head...", logger=logger, level=logging.INFO, rank=0) assert nanotron_model.model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape with torch.no_grad(): nanotron_model.model.lm_head.pp_block.weight.copy_(hf_model.lm_head.weight) - print("Copied weights from HF model to Nanotron model!") + log_rank("Copied weights from HF model to Nanotron model!", logger=logger, level=logging.INFO, rank=0) # Store weights nanotron_checkpoint_path = Path(args.nanotron_checkpoint_path) save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=nanotron_checkpoint_path) # Store metadata - print("Storing Nanotron model Configs and Metadata!") + log_rank("Storing Nanotron model Configs and Metadata!", logger=logger, level=logging.INFO, rank=0) training_metadata = TrainingMetadata( last_train_step=0, consumed_train_samples=0, @@ -248,14 +246,19 @@ def main(args): ), tokenizer=TokenizerArgs(nanotron_checkpoint_path), ) - print("Saving config ...") + log_rank("Saving config ...", logger=logger, level=logging.INFO, rank=0) yaml.dump(config.as_dict(), f) with open(nanotron_checkpoint_path / "model_config.json", "w") as f: - print("Saving model config ...") + log_rank("Saving model config ...", logger=logger, level=logging.INFO, rank=0) json.dump(asdict(nanotron_llama_config), f) - print(f"Checkpoint conversion finished, check {args.nanotron_checkpoint_path}") + log_rank( + f"Checkpoint conversion finished, check {args.nanotron_checkpoint_path}", + logger=logger, + level=logging.INFO, + rank=0, + ) if __name__ == "__main__": diff --git a/tools/llama3/convert_nanotron_to_hf.py b/tools/llama3/convert_nanotron_to_hf.py index cbcb73e6..0254ed4a 100644 --- a/tools/llama3/convert_nanotron_to_hf.py +++ b/tools/llama3/convert_nanotron_to_hf.py @@ -7,19 +7,21 @@ from pathlib import Path import torch -from nanotron.config import Config, ParallelismArgs, get_config_from_file +from nanotron import logging +from nanotron.config import Config, LoggingArgs, ParallelismArgs, get_config_from_file +from nanotron.logging import log_rank, set_ranks_logging_level from nanotron.models import build_model from nanotron.models.llama import LlamaForTraining from nanotron.parallel import ParallelContext from nanotron.parallel.parameters import sanity_check -from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine -from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode from nanotron.serialize import load_weights from nanotron.trainer import mark_tied_parameters from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.models.llama import LlamaConfig as LlamaConfigHF +logger = logging.get_logger(__name__) + DEVICE = torch.device("cpu") TORCH_DTYPE = torch.bfloat16 @@ -41,7 +43,6 @@ def 
get_args(): required=True, help="A path to a directory to store the converted checkpoint", ) - # TODO Add push to hub args = parser.parse_args() @@ -50,18 +51,7 @@ def get_args(): def main(args): # Init Nanotron Parallel Utilities - parallel_config = ParallelismArgs( - dp=1, - pp=1, - tp=1, - pp_engine=AllForwardAllBackwardPipelineEngine(), - tp_mode=TensorParallelLinearMode.ALL_REDUCE, - tp_linear_async_communication=False, - ) - assert ( - parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE - and parallel_config.tp_linear_async_communication is False - ) + parallel_config = ParallelismArgs(dp=1, pp=1, tp=1) parallel_context = ParallelContext( data_parallel_size=parallel_config.dp, @@ -69,15 +59,23 @@ def main(args): tensor_parallel_size=parallel_config.tp, ) + set_ranks_logging_level(parallel_context=parallel_context, logging_config=LoggingArgs()) + # Load Nanotron checkpoint config - print(f"Loading Nanotron checkpoint config file: {os.path.join(args.nanotron_checkpoint_path, 'config.yaml')}") + log_rank( + f"Loading Nanotron checkpoint config file: {os.path.join(args.nanotron_checkpoint_path, 'config.yaml')}", + logger=logger, + level=logging.INFO, + rank=0, + ) nanotron_config = get_config_from_file( os.path.join(args.nanotron_checkpoint_path, "config.yaml"), config_class=Config, model_config_class=None ) nanotron_llama_config = nanotron_config.model.model_config # Init Llama3-8B Nanotron model - print("Init empty Nanotron Llama3 Model") + log_rank("Init empty Nanotron Llama3 Model", logger=logger, level=logging.INFO, rank=0) + nanotron_model = build_model( model_builder=lambda: LlamaForTraining( config=nanotron_config.model.model_config, @@ -94,13 +92,13 @@ def main(args): sanity_check(root_module=nanotron_model) # Load Nanotron Checkpoint - print("Loading Nanotron Llama3 Model...") + log_rank("Loading Nanotron Llama3 Model...", logger=logger, level=logging.INFO, rank=0) load_weights( model=nanotron_model, parallel_context=parallel_context, root_folder=Path(args.nanotron_checkpoint_path) ) # Build empty HF Model - print("Init empty HF Llama3 Model") + log_rank("Init empty HF Llama3 Model", logger=logger, level=logging.INFO, rank=0) hf_model = AutoModelForCausalLM.from_config( # WARN This takes a long time config=LlamaConfigHF(**asdict(nanotron_llama_config)), torch_dtype=TORCH_DTYPE, @@ -108,9 +106,9 @@ def main(args): ).to(DEVICE) # Copy params from Nanotron to HF - print("Copyng weights from Nanotron model to HF model...") + log_rank("Copying weights from Nanotron model to HF model...", logger=logger, level=logging.INFO, rank=0) # Token embeddings - print("Copyng Token Embeddings...") + log_rank("Copying Token Embeddings...", logger=logger, level=logging.INFO, rank=0) assert ( nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.shape == hf_model.model.embed_tokens.weight.shape @@ -123,7 +121,7 @@ def main(args): # Decoder layers for i in tqdm( range(nanotron_llama_config.num_hidden_layers), - desc="Copyng Hidden Layers", + desc="Copying Hidden Layers", total=nanotron_llama_config.num_hidden_layers, ): # Input layer norm @@ -199,26 +197,31 @@ def main(args): ) # Last layer norm - print("Copyng Final Layer Norm...") + log_rank("Copying Final Layer Norm...", logger=logger, level=logging.INFO, rank=0) assert nanotron_model.model.final_layer_norm.pp_block.weight.shape == hf_model.model.norm.weight.shape with torch.no_grad(): hf_model.model.norm.weight.copy_(nanotron_model.model.final_layer_norm.pp_block.weight) # LM_Head - print("Copyng LM Head...") 
+    log_rank("Copying LM Head...", logger=logger, level=logging.INFO, rank=0)
     assert nanotron_model.model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape
     with torch.no_grad():
         hf_model.lm_head.weight.copy_(nanotron_model.model.lm_head.pp_block.weight)

-    print("Copied weights from Nanotron model to HF model!")
+    log_rank("Copied weights from Nanotron model to HF model!", logger=logger, level=logging.INFO, rank=0)
     # Store weights
-    print("Storing HF model Checkpoint and Tokenizer!")
+    log_rank("Storing HF model Checkpoint and Tokenizer!", logger=logger, level=logging.INFO, rank=0)
     hf_model.save_pretrained(args.hugging_face_checkpoint_path, from_pt=True)
     # Store tokenizer
     tokenizer = AutoTokenizer.from_pretrained(nanotron_config.tokenizer.tokenizer_name_or_path)
     tokenizer.save_pretrained(args.hugging_face_checkpoint_path)

-    print(f"Checkpoint conversion finished, check {args.hugging_face_checkpoint_path}")
+    log_rank(
+        f"Checkpoint conversion finished, check {args.hugging_face_checkpoint_path}",
+        logger=logger,
+        level=logging.INFO,
+        rank=0,
+    )


 if __name__ == "__main__":

From 3e169c5afc80b3494c1065bd4ef3079dc2b657de Mon Sep 17 00:00:00 2001
From: tj-solergibert
Date: Wed, 22 May 2024 13:44:01 +0000
Subject: [PATCH 10/11] Added README

---
 tools/llama3/README.md                 | 19 +++++++++++++++++++
 tools/llama3/convert_hf_to_nanotron.py |  4 ++--
 tools/llama3/convert_nanotron_to_hf.py |  2 +-
 3 files changed, 22 insertions(+), 3 deletions(-)
 create mode 100644 tools/llama3/README.md

diff --git a/tools/llama3/README.md b/tools/llama3/README.md
new file mode 100644
index 00000000..57a31b5e
--- /dev/null
+++ b/tools/llama3/README.md
@@ -0,0 +1,19 @@
+# Llama3 Weight conversion tool
+This directory contains the scripts to convert the Llama3 checkpoints from HuggingFace to Nanotron and vice versa.
+
+- Convert from HuggingFace to Nanotron
+
+`torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron_checkpoints/Nanotron-Llama-3-8B --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct`
+- Convert from Nanotron to HuggingFace
+
+`torchrun --nproc-per-node 1 tools/llama3/convert_nanotron_to_hf.py --nanotron-checkpoint-path nanotron_checkpoints/Nanotron-Llama-3-8B --hugging-face-checkpoint-path hf_checkpoints/Converted-Nanotron-Llama-3-8B`
+
+In summary, we will do the following:
+- Initialize the HuggingFace model with the pretrained weights. The model definition is [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py).
+- Initialize a Nanotron model with empty weights. The model definition is [here](https://github.com/huggingface/nanotron/blob/main/src/nanotron/models/llama.py).
+- Copy the parameters layer by layer from one model to the other.
+- Store the Nanotron model along with the tokenizer.
+
+When comparing the HuggingFace implementation with the Nanotron implementation, the main difference lies in the Q, K & V matrices and in the MLP projections.
In the HuggingFace implementation, these matrices are separated [[1]](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L415), [[2]](https://github.com/huggingface/transformers/blob/1518508467d96b3866fc4ebcb7a5b3a2e0df2aa4/src/transformers/models/llama/modeling_llama.py#L194), while in the Nanotron implementation, they are concatenated [[1b]](https://github.com/huggingface/nanotron/blob/b69690703a1c41b60cd706f92a80a3d23ebaf2d0/src/nanotron/models/llama.py#L310), [[2b]](https://github.com/huggingface/nanotron/blob/b69690703a1c41b60cd706f92a80a3d23ebaf2d0/src/nanotron/models/llama.py#L149). It is crucial to pay attention to these details to convert the models correctly.
+
+To perform the conversion, we will need at least **1 GPU**, although the operations will be carried out on the **CPU**. We will convert the models with a parallel configuration of DP = PP = TP = 1, but it should be noted that the checkpoints generated by Nanotron are topology agnostic.
diff --git a/tools/llama3/convert_hf_to_nanotron.py b/tools/llama3/convert_hf_to_nanotron.py
index 0032bf9a..e30610a3 100644
--- a/tools/llama3/convert_hf_to_nanotron.py
+++ b/tools/llama3/convert_hf_to_nanotron.py
@@ -1,5 +1,5 @@
 """
-torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct
+torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron_checkpoints/Nanotron-Llama-3-8B --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct
 """
 import argparse
 import json
@@ -238,7 +238,7 @@ def main(args):
     # Store Config and Model Config files
     with open(nanotron_checkpoint_path / "config.yaml", "w") as f:
         config = Config(
-            general=GeneralArgs(project="conversion", run="Llama3-8B"),
+            general=GeneralArgs(project="Nanotron", run="Llama3"),
             parallelism=parallel_config,
             model=ModelArgs(
                 init_method=ExistingCheckpointInit(nanotron_checkpoint_path),
diff --git a/tools/llama3/convert_nanotron_to_hf.py b/tools/llama3/convert_nanotron_to_hf.py
index 0254ed4a..c5fb1940 100644
--- a/tools/llama3/convert_nanotron_to_hf.py
+++ b/tools/llama3/convert_nanotron_to_hf.py
@@ -1,5 +1,5 @@
 """
-torchrun --nproc-per-node 1 tools/llama3/convert_nanotron_to_hf.py --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B --hugging-face-checkpoint-path hf_checkpoints/ConvertedNanotronLlama38B
+torchrun --nproc-per-node 1 tools/llama3/convert_nanotron_to_hf.py --nanotron-checkpoint-path nanotron_checkpoints/Nanotron-Llama-3-8B --hugging-face-checkpoint-path hf_checkpoints/Converted-Nanotron-Llama-3-8B
 """
 import argparse
 import os

From 0afd7b743654442dac47d5ef5c1e6e823af5def3 Mon Sep 17 00:00:00 2001
From: tj-solergibert
Date: Thu, 25 Jul 2024 11:08:14 +0000
Subject: [PATCH 11/11] Deleted generation tests & added download info to README

---
 tools/llama3/README.md                        |  13 ++
 tools/llama3/generate_hf_predictions.py       |  81 -----------
 tools/llama3/generate_nanotron_predictions.py | 132 ------------------
 3 files changed, 13 insertions(+), 213 deletions(-)
 delete mode 100644 tools/llama3/generate_hf_predictions.py
 delete mode 100644 tools/llama3/generate_nanotron_predictions.py

diff --git a/tools/llama3/README.md b/tools/llama3/README.md
index 57a31b5e..048b16c9 100644
--- a/tools/llama3/README.md
+++ b/tools/llama3/README.md
@@ -1,6 +1,19 @@
 # Llama3 Weight conversion tool
 This directory contains the scripts to convert the
Llama3 checkpoints from HuggingFace to Nanotron and vice versa.
+## Downloading Llama3 weights
+We will use the Llama3 checkpoints stored in the HuggingFace Hub for the conversion. Although it is possible to download the checkpoints directly by setting `--pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct`, this is not recommended, since it will download the pretrained weights to the [HuggingFace Cache](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhubcache). We encourage you to download the checkpoints explicitly to a local folder with the following script:
+```python
+from huggingface_hub import snapshot_download
+
+snapshot_download(repo_id="meta-llama/Meta-Llama-3-8B",
+                  local_dir="models/Meta-Llama-3-8B",
+                  local_dir_use_symlinks=False,
+                  ignore_patterns=["original/*"]) # Llama3 models in the Hub contain the original checkpoints. We just want the HF checkpoint stored in the safetensors format
+```
+
+## Conversion
+
 - Convert from HuggingFace to Nanotron
 
 `torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron_checkpoints/Nanotron-Llama-3-8B --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct`
diff --git a/tools/llama3/generate_hf_predictions.py b/tools/llama3/generate_hf_predictions.py
deleted file mode 100644
index b16774a4..00000000
--- a/tools/llama3/generate_hf_predictions.py
+++ /dev/null
@@ -1,81 +0,0 @@
-"""
-torchrun --nproc-per-node 1 tools/llama3/generate_hf_predictions.py --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct
-"""
-import argparse
-import os
-
-import numpy as np
-import torch
-from sklearn.metrics import accuracy_score
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-TXT = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello! Which is the capital of France? What can I visit over there if I go for a week vacation?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBonjour! The capital of France is Paris, also known as the City of Light. Paris is a stunning city with a rich history, art, fashion, and cuisine. If you're planning a week-long vacation in Paris, you'll have plenty of time to explore its iconic landmarks, museums, and neighborhoods. Here's a suggested itinerary to get you started: Day 1-2: Iconic Landmarks The Eiffel Tower (Tour Eiffel): The iron lady offers breathtaking views of the city. You can take the stairs or elevator to the top. The Louvre Museum (Musée du Louvre): Home to the Mona Lisa, Venus de Milo, and many other famous artworks. Arc de Triomphe: A monumental arch honoring the soldiers who fought and died for France. Champs-Élysées: A famous avenue lined with cafes, shops, and theaters. Day 3: Montmartre and Sacré-Cœur Explore the charming neighborhood of Montmartre, known for its bohemian vibe, street artists, and stunning views. Visit the Basilique du Sacré-Cœur, a beautiful white church perched on a hill."
-SEQ_LENGTH = 512 # For truncating the TXT if GPU can't fit too many tokens - -DEVICE = torch.device("cuda") -TORCH_DTYPE = torch.bfloat16 - - -def get_args(): - parser = argparse.ArgumentParser() - group = parser.add_argument_group(title="HuggingFace Model") - group.add_argument( - "--pretrained-model-name-or-path", - type=str, - required=True, - help="A path to a directory containing model weights saved using save_pretrained() or the model id of a pretrained model hosted inside a model repo on the Hugging Face Hub", - ) - - args = parser.parse_args() - - return args - - -def main(args): - - model = AutoModelForCausalLM.from_pretrained( - args.pretrained_model_name_or_path, - torch_dtype=TORCH_DTYPE, - attn_implementation="flash_attention_2", - device_map="auto", - ).eval() - - tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) - tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to(DEVICE) - inputs = tokens[:, :-1] - - with torch.no_grad(): - output = model(inputs) - - predicted_tokens = [5, 27, 34] # Index of the predictions to compare across models - term_cols = int(os.get_terminal_size().columns / 3) - - for predicted_token in predicted_tokens: - - print("\n", "=" * term_cols, f"Predictions of token {predicted_token}", "=" * term_cols) - next_tokens = torch.softmax(output.logits[0, predicted_token, :], -1) - topk_next_tokens = torch.topk(next_tokens, 10) - - print( - *[ - f"[HF Model] Next token: {idx.item()}, probability: {prob}" - for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values) - ], - sep="\n", - ) - - # Compute accuracy - predictions = np.argmax(output.logits.cpu(), axis=2).flatten().tolist() - labels = tokens.cpu().flatten()[1:].tolist() - print(f"\nAccuracy: {accuracy_score(labels, predictions)}") - # Results - ## [TP=1] HF 8B: 0.8308823529411765 - ## [TP=2]HF 70B: 0.8860294117647058 - ## [TP=1] HF -> Nanotron -> HF 8B: 0.8308823529411765 - ## [TP=2] HF -> Nanotron -> HF 70B: 0.8860294117647058 - ## [TP=1 --> TP=2] HF -> Nanotron -> Dummy Finetune to change TP=2 -> HF 8B: 0.8308823529411765 - - -if __name__ == "__main__": - _args = get_args() - main(_args) diff --git a/tools/llama3/generate_nanotron_predictions.py b/tools/llama3/generate_nanotron_predictions.py deleted file mode 100644 index fbede799..00000000 --- a/tools/llama3/generate_nanotron_predictions.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -torchrun --nproc-per-node 2 tools/llama3/generate_nanotron_predictions.py --tp 2 --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B -""" -import argparse -import os -from pathlib import Path - -import nanotron.distributed as dist -import numpy as np -import torch -from nanotron.config import Config, ParallelismArgs, get_config_from_file -from nanotron.models import build_model -from nanotron.models.llama import LlamaForTraining -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import sanity_check -from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine -from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode -from nanotron.serialize import load_weights -from nanotron.trainer import mark_tied_parameters -from sklearn.metrics import accuracy_score -from transformers import AutoTokenizer - -TXT = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello! Which is the capital of France? 
What can I visit over there if I go for a week vacation?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBonjour! The capital of France is Paris, also known as the City of Light. Paris is a stunning city with a rich history, art, fashion, and cuisine. If you're planning a week-long vacation in Paris, you'll have plenty of time to explore its iconic landmarks, museums, and neighborhoods. Here's a suggested itinerary to get you started: Day 1-2: Iconic Landmarks The Eiffel Tower (Tour Eiffel): The iron lady offers breathtaking views of the city. You can take the stairs or elevator to the top. The Louvre Museum (Musée du Louvre): Home to the Mona Lisa, Venus de Milo, and many other famous artworks. Arc de Triomphe: A monumental arch honoring the soldiers who fought and died for France. Champs-Élysées: A famous avenue lined with cafes, shops, and theaters. Day 3: Montmartre and Sacré-Cœur Explore the charming neighborhood of Montmartre, known for its bohemian vibe, street artists, and stunning views. Visit the Basilique du Sacré-Cœur, a beautiful white church perched on a hill." -SEQ_LENGTH = 512 # For truncating the TXT if GPU can't fit too many tokens - -DEVICE = torch.device("cuda") -TORCH_DTYPE = torch.bfloat16 - - -def get_args(): - parser = argparse.ArgumentParser() - group = parser.add_argument_group(title="Nanotron Model") - group.add_argument( - "--nanotron-checkpoint-path", - type=str, - required=True, - help="A path to a directory containing a Nanotron Checkpoint", - ) - - group = parser.add_argument_group(title="Nanotron Parallelism") - group.add_argument("--tp", type=int, required=True, help="Tensor Parallelism Degree of the Nanotron Checkpoint") - - args = parser.parse_args() - - return args - - -def main(args): - # Init Nanotron Parallel Utilities - parallel_config = ParallelismArgs( - dp=1, - pp=1, - tp=args.tp, - pp_engine=AllForwardAllBackwardPipelineEngine(), - tp_mode=TensorParallelLinearMode.ALL_REDUCE, - tp_linear_async_communication=False, - ) - assert ( - parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE - and parallel_config.tp_linear_async_communication is False - ) - - parallel_context = ParallelContext( - data_parallel_size=parallel_config.dp, - pipeline_parallel_size=parallel_config.pp, - tensor_parallel_size=parallel_config.tp, - ) - - RANK = dist.get_rank(parallel_context.world_pg) - - nanotron_config = get_config_from_file( - os.path.join(args.nanotron_checkpoint_path, "config.yaml"), config_class=Config, model_config_class=None - ) - - model = build_model( - model_builder=lambda: LlamaForTraining( - config=nanotron_config.model.model_config, - parallel_context=parallel_context, - parallel_config=parallel_config, - random_states=None, - ), - parallel_context=parallel_context, - dtype=TORCH_DTYPE, - device=DEVICE, # TODO Check with different parallelism if cpu is available - ) - - mark_tied_parameters(model=model, parallel_context=parallel_context) - sanity_check(root_module=model) - - # Load checkpoint directly in memory and then only keep the state dictionary - load_weights(model=model, parallel_context=parallel_context, root_folder=Path(args.nanotron_checkpoint_path)) - - tokenizer = AutoTokenizer.from_pretrained(nanotron_config.tokenizer.tokenizer_name_or_path) - tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to(DEVICE) - inputs = {"input_ids": tokens[:, :-1], "input_mask": torch.ones((1, SEQ_LENGTH), device=DEVICE)} - - model.eval() - - with torch.no_grad(): - output = 
model.model(**inputs) - - if not RANK: - predicted_tokens = [5, 27, 34] # Index of the predictions to compare across models - term_cols = int(os.get_terminal_size().columns / 3) - - for predicted_token in predicted_tokens: - - print("\n", "=" * term_cols, f"Predictions of token {predicted_token}", "=" * term_cols) - next_tokens = torch.softmax(output.transpose(0, 1)[0, predicted_token, :], -1) - topk_next_tokens = torch.topk(next_tokens, 10) - - print( - *[ - f"[Nanotron Model] Next token: {idx.item()}, probability: {prob}" - for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values) - ], - sep="\n", - ) - - # Compute accuracy - predictions = np.argmax(output.transpose(0, 1).cpu(), axis=2).flatten().tolist() - labels = tokens.cpu().flatten()[1:].tolist() - print(f"\nAccuracy: {accuracy_score(labels, predictions)}") - # Results - ## Nanotron 8B, TP 1: 0.8272058823529411 - ## Nanotron 8B, TP 2: 0.7720588235294118 - ## Nanotron 70B, TP 2: 0.8272058823529411 - - -if __name__ == "__main__": - _args = get_args() - main(_args)
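
A note on the fused projections mentioned in the README above: HuggingFace keeps the Q, K and V attention projections (and the MLP gate/up projections) as separate matrices, while Nanotron stores each group as a single fused parameter, so the conversion amounts to concatenating in one direction and splitting in the other. The following is a minimal sketch of that idea with plain tensors; the attribute names it alludes to (`qkv_proj`, `gate_up_proj`), the plain concatenation along the output dimension, and the toy Llama-3-8B shapes are assumptions for illustration, not the verbatim layout used by the conversion scripts.

```python
import torch

# Toy dimensions standing in for Llama-3-8B's per-layer attention shapes
# (hidden=4096, 32 query heads, 8 KV heads, head_dim=128). Illustrative only.
hidden_size, n_q_heads, n_kv_heads, head_dim = 4096, 32, 8, 128

# HuggingFace-style separate projection weights (out_features x in_features)
q_proj = torch.randn(n_q_heads * head_dim, hidden_size)
k_proj = torch.randn(n_kv_heads * head_dim, hidden_size)
v_proj = torch.randn(n_kv_heads * head_dim, hidden_size)

# A fused implementation stores one matrix; HF -> fused is a concatenation
# along the output dimension (assumed ordering: q, then k, then v).
qkv_proj = torch.cat([q_proj, k_proj, v_proj], dim=0)
assert qkv_proj.shape == ((n_q_heads + 2 * n_kv_heads) * head_dim, hidden_size)

# Fused -> HF is the inverse split at the same offsets.
q_back, k_back, v_back = torch.split(
    qkv_proj, [n_q_heads * head_dim, n_kv_heads * head_dim, n_kv_heads * head_dim], dim=0
)
assert torch.equal(q_back, q_proj) and torch.equal(k_back, k_proj) and torch.equal(v_back, v_proj)

# The MLP is analogous: gate_proj and up_proj fuse into a single gate_up_proj.
intermediate_size = 14336
gate_proj = torch.randn(intermediate_size, hidden_size)
up_proj = torch.randn(intermediate_size, hidden_size)
gate_up_proj = torch.cat([gate_proj, up_proj], dim=0)
assert gate_up_proj.shape == (2 * intermediate_size, hidden_size)
```

If the split sizes or the ordering do not match the target module's layout, the shape assertions that the conversion scripts perform before every copy are what surface the mismatch.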