From 0afd7b743654442dac47d5ef5c1e6e823af5def3 Mon Sep 17 00:00:00 2001
From: tj-solergibert
Date: Thu, 25 Jul 2024 11:08:14 +0000
Subject: [PATCH] Deleted generation tests & added download info to README

---
 tools/llama3/README.md                        |  13 ++
 tools/llama3/generate_hf_predictions.py       |  81 -----------
 tools/llama3/generate_nanotron_predictions.py | 132 ------------------
 3 files changed, 13 insertions(+), 213 deletions(-)
 delete mode 100644 tools/llama3/generate_hf_predictions.py
 delete mode 100644 tools/llama3/generate_nanotron_predictions.py

diff --git a/tools/llama3/README.md b/tools/llama3/README.md
index 57a31b5e..048b16c9 100644
--- a/tools/llama3/README.md
+++ b/tools/llama3/README.md
@@ -1,6 +1,19 @@
 # Llama3 Weight conversion tool
 This directory contains the scripts to convert the Llama3 checkpoints from HuggingFace to Nanotron and vice versa.
+## Downloading Llama3 weights
+We will use the Llama3 checkpoints stored in the HuggingFace Hub for the conversion. Although you can download the checkpoints directly by setting `--pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct`, this is not recommended, since it will download the pretrained weights to the [HuggingFace Cache](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhubcache). We encourage you to download the checkpoints explicitly to a local folder with the following script:
+```python
+from huggingface_hub import snapshot_download
+
+snapshot_download(repo_id="meta-llama/Meta-Llama-3-8B",
+                  local_dir="models/Meta-Llama-3-8B",
+                  local_dir_use_symlinks=False,
+                  ignore_patterns=["original/*"])  # Llama3 models in the Hub also contain the original checkpoints; we just want the HF checkpoint stored in the safetensors format
+```
+
+## Conversion
+
 
 - Convert from HuggingFace to Nanotron
 
 `torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron_checkpoints/Nanotron-Llama-3-8B --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct`
diff --git a/tools/llama3/generate_hf_predictions.py b/tools/llama3/generate_hf_predictions.py
deleted file mode 100644
index b16774a4..00000000
--- a/tools/llama3/generate_hf_predictions.py
+++ /dev/null
@@ -1,81 +0,0 @@
-"""
-torchrun --nproc-per-node 1 tools/llama3/generate_hf_predictions.py --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct
-"""
-import argparse
-import os
-
-import numpy as np
-import torch
-from sklearn.metrics import accuracy_score
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-TXT = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello! Which is the capital of France? What can I visit over there if I go for a week vacation?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBonjour! The capital of France is Paris, also known as the City of Light. Paris is a stunning city with a rich history, art, fashion, and cuisine. If you're planning a week-long vacation in Paris, you'll have plenty of time to explore its iconic landmarks, museums, and neighborhoods. Here's a suggested itinerary to get you started: Day 1-2: Iconic Landmarks The Eiffel Tower (Tour Eiffel): The iron lady offers breathtaking views of the city. You can take the stairs or elevator to the top. The Louvre Museum (Musée du Louvre): Home to the Mona Lisa, Venus de Milo, and many other famous artworks. Arc de Triomphe: A monumental arch honoring the soldiers who fought and died for France. Champs-Élysées: A famous avenue lined with cafes, shops, and theaters. Day 3: Montmartre and Sacré-Cœur Explore the charming neighborhood of Montmartre, known for its bohemian vibe, street artists, and stunning views. Visit the Basilique du Sacré-Cœur, a beautiful white church perched on a hill."
-SEQ_LENGTH = 512  # For truncating the TXT if GPU can't fit too many tokens
-
-DEVICE = torch.device("cuda")
-TORCH_DTYPE = torch.bfloat16
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group(title="HuggingFace Model")
-    group.add_argument(
-        "--pretrained-model-name-or-path",
-        type=str,
-        required=True,
-        help="A path to a directory containing model weights saved using save_pretrained() or the model id of a pretrained model hosted inside a model repo on the Hugging Face Hub",
-    )
-
-    args = parser.parse_args()
-
-    return args
-
-
-def main(args):
-
-    model = AutoModelForCausalLM.from_pretrained(
-        args.pretrained_model_name_or_path,
-        torch_dtype=TORCH_DTYPE,
-        attn_implementation="flash_attention_2",
-        device_map="auto",
-    ).eval()
-
-    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path)
-    tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to(DEVICE)
-    inputs = tokens[:, :-1]
-
-    with torch.no_grad():
-        output = model(inputs)
-
-    predicted_tokens = [5, 27, 34]  # Index of the predictions to compare across models
-    term_cols = int(os.get_terminal_size().columns / 3)
-
-    for predicted_token in predicted_tokens:
-
-        print("\n", "=" * term_cols, f"Predictions of token {predicted_token}", "=" * term_cols)
-        next_tokens = torch.softmax(output.logits[0, predicted_token, :], -1)
-        topk_next_tokens = torch.topk(next_tokens, 10)
-
-        print(
-            *[
-                f"[HF Model] Next token: {idx.item()}, probability: {prob}"
-                for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values)
-            ],
-            sep="\n",
-        )
-
-    # Compute accuracy
-    predictions = np.argmax(output.logits.cpu(), axis=2).flatten().tolist()
-    labels = tokens.cpu().flatten()[1:].tolist()
-    print(f"\nAccuracy: {accuracy_score(labels, predictions)}")
-    # Results
-    ## [TP=1] HF 8B: 0.8308823529411765
-    ## [TP=2] HF 70B: 0.8860294117647058
-    ## [TP=1] HF -> Nanotron -> HF 8B: 0.8308823529411765
-    ## [TP=2] HF -> Nanotron -> HF 70B: 0.8860294117647058
-    ## [TP=1 --> TP=2] HF -> Nanotron -> Dummy Finetune to change TP=2 -> HF 8B: 0.8308823529411765
-
-
-if __name__ == "__main__":
-    _args = get_args()
-    main(_args)
diff --git a/tools/llama3/generate_nanotron_predictions.py b/tools/llama3/generate_nanotron_predictions.py
deleted file mode 100644
index fbede799..00000000
--- a/tools/llama3/generate_nanotron_predictions.py
+++ /dev/null
@@ -1,132 +0,0 @@
-"""
-torchrun --nproc-per-node 2 tools/llama3/generate_nanotron_predictions.py --tp 2 --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B
-"""
-import argparse
-import os
-from pathlib import Path
-
-import nanotron.distributed as dist
-import numpy as np
-import torch
-from nanotron.config import Config, ParallelismArgs, get_config_from_file
-from nanotron.models import build_model
-from nanotron.models.llama import LlamaForTraining
-from nanotron.parallel import ParallelContext
-from nanotron.parallel.parameters import sanity_check
-from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine
-from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode
-from nanotron.serialize import load_weights
-from nanotron.trainer import mark_tied_parameters
-from sklearn.metrics import accuracy_score
-from transformers import AutoTokenizer
-
-TXT = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello! Which is the capital of France? What can I visit over there if I go for a week vacation?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBonjour! The capital of France is Paris, also known as the City of Light. Paris is a stunning city with a rich history, art, fashion, and cuisine. If you're planning a week-long vacation in Paris, you'll have plenty of time to explore its iconic landmarks, museums, and neighborhoods. Here's a suggested itinerary to get you started: Day 1-2: Iconic Landmarks The Eiffel Tower (Tour Eiffel): The iron lady offers breathtaking views of the city. You can take the stairs or elevator to the top. The Louvre Museum (Musée du Louvre): Home to the Mona Lisa, Venus de Milo, and many other famous artworks. Arc de Triomphe: A monumental arch honoring the soldiers who fought and died for France. Champs-Élysées: A famous avenue lined with cafes, shops, and theaters. Day 3: Montmartre and Sacré-Cœur Explore the charming neighborhood of Montmartre, known for its bohemian vibe, street artists, and stunning views. Visit the Basilique du Sacré-Cœur, a beautiful white church perched on a hill."
-SEQ_LENGTH = 512  # For truncating the TXT if GPU can't fit too many tokens
-
-DEVICE = torch.device("cuda")
-TORCH_DTYPE = torch.bfloat16
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group(title="Nanotron Model")
-    group.add_argument(
-        "--nanotron-checkpoint-path",
-        type=str,
-        required=True,
-        help="A path to a directory containing a Nanotron Checkpoint",
-    )
-
-    group = parser.add_argument_group(title="Nanotron Parallelism")
-    group.add_argument("--tp", type=int, required=True, help="Tensor Parallelism Degree of the Nanotron Checkpoint")
-
-    args = parser.parse_args()
-
-    return args
-
-
-def main(args):
-    # Init Nanotron Parallel Utilities
-    parallel_config = ParallelismArgs(
-        dp=1,
-        pp=1,
-        tp=args.tp,
-        pp_engine=AllForwardAllBackwardPipelineEngine(),
-        tp_mode=TensorParallelLinearMode.ALL_REDUCE,
-        tp_linear_async_communication=False,
-    )
-    assert (
-        parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE
-        and parallel_config.tp_linear_async_communication is False
-    )
-
-    parallel_context = ParallelContext(
-        data_parallel_size=parallel_config.dp,
-        pipeline_parallel_size=parallel_config.pp,
-        tensor_parallel_size=parallel_config.tp,
-    )
-
-    RANK = dist.get_rank(parallel_context.world_pg)
-
-    nanotron_config = get_config_from_file(
-        os.path.join(args.nanotron_checkpoint_path, "config.yaml"), config_class=Config, model_config_class=None
-    )
-
-    model = build_model(
-        model_builder=lambda: LlamaForTraining(
-            config=nanotron_config.model.model_config,
-            parallel_context=parallel_context,
-            parallel_config=parallel_config,
-            random_states=None,
-        ),
-        parallel_context=parallel_context,
-        dtype=TORCH_DTYPE,
-        device=DEVICE,  # TODO Check with different parallelism if cpu is available
-    )
-
-    mark_tied_parameters(model=model, parallel_context=parallel_context)
-    sanity_check(root_module=model)
-
-    # Load checkpoint directly in memory and then only keep the state dictionary
-    load_weights(model=model, parallel_context=parallel_context, root_folder=Path(args.nanotron_checkpoint_path))
-
-    tokenizer = AutoTokenizer.from_pretrained(nanotron_config.tokenizer.tokenizer_name_or_path)
-    tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to(DEVICE)
-    inputs = {"input_ids": tokens[:, :-1], "input_mask": torch.ones((1, SEQ_LENGTH), device=DEVICE)}
-
-    model.eval()
-
-    with torch.no_grad():
-        output = model.model(**inputs)
-
-    if not RANK:
-        predicted_tokens = [5, 27, 34]  # Index of the predictions to compare across models
-        term_cols = int(os.get_terminal_size().columns / 3)
-
-        for predicted_token in predicted_tokens:
-
-            print("\n", "=" * term_cols, f"Predictions of token {predicted_token}", "=" * term_cols)
-            next_tokens = torch.softmax(output.transpose(0, 1)[0, predicted_token, :], -1)
-            topk_next_tokens = torch.topk(next_tokens, 10)
-
-            print(
-                *[
-                    f"[Nanotron Model] Next token: {idx.item()}, probability: {prob}"
-                    for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values)
-                ],
-                sep="\n",
-            )
-
-        # Compute accuracy
-        predictions = np.argmax(output.transpose(0, 1).cpu(), axis=2).flatten().tolist()
-        labels = tokens.cpu().flatten()[1:].tolist()
-        print(f"\nAccuracy: {accuracy_score(labels, predictions)}")
-        # Results
-        ## Nanotron 8B, TP 1: 0.8272058823529411
-        ## Nanotron 8B, TP 2: 0.7720588235294118
-        ## Nanotron 70B, TP 2: 0.8272058823529411
-
-
-if __name__ == "__main__":
-    _args = get_args()
-    main(_args)
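
As a companion to the README addition above, here is a minimal end-to-end sketch that first downloads the HF checkpoint to a local folder and then points the conversion script at that folder rather than at the Hub model id. The local path `models/Meta-Llama-3-8B`, the single-GPU `torchrun` invocation, and running from the repository root are assumptions for illustration, not part of the patch.

```python
# Minimal sketch: download the HF checkpoint locally, then convert it to Nanotron.
# The paths below are assumptions; adjust them to your setup.
import subprocess

from huggingface_hub import snapshot_download

LOCAL_DIR = "models/Meta-Llama-3-8B"  # hypothetical download folder

# Download only the HF (safetensors) checkpoint, skipping the original Meta weights
snapshot_download(
    repo_id="meta-llama/Meta-Llama-3-8B",
    local_dir=LOCAL_DIR,
    ignore_patterns=["original/*"],
)

# Convert the locally downloaded checkpoint instead of the Hub model id
subprocess.run(
    [
        "torchrun", "--nproc-per-node", "1",
        "tools/llama3/convert_hf_to_nanotron.py",
        "--nanotron-checkpoint-path", "nanotron_checkpoints/Nanotron-Llama-3-8B",
        "--pretrained-model-name-or-path", LOCAL_DIR,
    ],
    check=True,
)
```

Pointing `--pretrained-model-name-or-path` at the downloaded folder avoids a second download into the HuggingFace cache, which is the behaviour the README addition recommends against.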