From 0afd7b743654442dac47d5ef5c1e6e823af5def3 Mon Sep 17 00:00:00 2001
From: tj-solergibert
Date: Thu, 25 Jul 2024 11:08:14 +0000
Subject: [PATCH] Deleted generation tests & added download info to README

---
 tools/llama3/README.md                        |  13 ++
 tools/llama3/generate_hf_predictions.py       |  81 -----------
 tools/llama3/generate_nanotron_predictions.py | 132 ------------------
 3 files changed, 13 insertions(+), 213 deletions(-)
 delete mode 100644 tools/llama3/generate_hf_predictions.py
 delete mode 100644 tools/llama3/generate_nanotron_predictions.py

diff --git a/tools/llama3/README.md b/tools/llama3/README.md
index 57a31b5e..048b16c9 100644
--- a/tools/llama3/README.md
+++ b/tools/llama3/README.md
@@ -1,6 +1,19 @@
 # Llama3 Weight conversion tool
 This directory contains the scripts to convert the Llama3 checkpoints from HuggingFace to Nanotron and vice versa.
+## Downloading Llama3 weights
+We will use the Llama3 checkpoints stored in the HuggingFace Hub for the conversion. Although you can download the checkpoints directly by setting `--pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct`, this is not recommended, since it will download the pretrained weights to the [HuggingFace Cache](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhubcache). We encourage you to download the checkpoints explicitly to a local folder with the following script:
+```python
+from huggingface_hub import snapshot_download
+
+snapshot_download(repo_id="meta-llama/Meta-Llama-3-8B",
+                  local_dir="models/Meta-Llama-3-8B",
+                  local_dir_use_symlinks=False,
+                  ignore_patterns=["original/*"])  # Llama3 models in the Hub also contain the original checkpoints; we just want the HF checkpoint stored in the safetensors format
+```
+
+## Conversion
+
 
 - Convert from HuggingFace to Nanotron
 
 `torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron_checkpoints/Nanotron-Llama-3-8B --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct`
diff --git a/tools/llama3/generate_hf_predictions.py b/tools/llama3/generate_hf_predictions.py
deleted file mode 100644
index b16774a4..00000000
--- a/tools/llama3/generate_hf_predictions.py
+++ /dev/null
@@ -1,81 +0,0 @@
-"""
-torchrun --nproc-per-node 1 tools/llama3/generate_hf_predictions.py --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct
-"""
-import argparse
-import os
-
-import numpy as np
-import torch
-from sklearn.metrics import accuracy_score
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-TXT = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello! Which is the capital of France? What can I visit over there if I go for a week vacation?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBonjour! The capital of France is Paris, also known as the City of Light. Paris is a stunning city with a rich history, art, fashion, and cuisine. If you're planning a week-long vacation in Paris, you'll have plenty of time to explore its iconic landmarks, museums, and neighborhoods. Here's a suggested itinerary to get you started: Day 1-2: Iconic Landmarks The Eiffel Tower (Tour Eiffel): The iron lady offers breathtaking views of the city. You can take the stairs or elevator to the top. The Louvre Museum (Musée du Louvre): Home to the Mona Lisa, Venus de Milo, and many other famous artworks. Arc de Triomphe: A monumental arch honoring the soldiers who fought and died for France. Champs-Élysées: A famous avenue lined with cafes, shops, and theaters. Day 3: Montmartre and Sacré-Cœur Explore the charming neighborhood of Montmartre, known for its bohemian vibe, street artists, and stunning views. Visit the Basilique du Sacré-Cœur, a beautiful white church perched on a hill."
-SEQ_LENGTH = 512  # For truncating the TXT if GPU can't fit too many tokens
-
-DEVICE = torch.device("cuda")
-TORCH_DTYPE = torch.bfloat16
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group(title="HuggingFace Model")
-    group.add_argument(
-        "--pretrained-model-name-or-path",
-        type=str,
-        required=True,
-        help="A path to a directory containing model weights saved using save_pretrained() or the model id of a pretrained model hosted inside a model repo on the Hugging Face Hub",
-    )
-
-    args = parser.parse_args()
-
-    return args
-
-
-def main(args):
-
-    model = AutoModelForCausalLM.from_pretrained(
-        args.pretrained_model_name_or_path,
-        torch_dtype=TORCH_DTYPE,
-        attn_implementation="flash_attention_2",
-        device_map="auto",
-    ).eval()
-
-    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path)
-    tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to(DEVICE)
-    inputs = tokens[:, :-1]
-
-    with torch.no_grad():
-        output = model(inputs)
-
-    predicted_tokens = [5, 27, 34]  # Index of the predictions to compare across models
-    term_cols = int(os.get_terminal_size().columns / 3)
-
-    for predicted_token in predicted_tokens:
-
-        print("\n", "=" * term_cols, f"Predictions of token {predicted_token}", "=" * term_cols)
-        next_tokens = torch.softmax(output.logits[0, predicted_token, :], -1)
-        topk_next_tokens = torch.topk(next_tokens, 10)
-
-        print(
-            *[
-                f"[HF Model] Next token: {idx.item()}, probability: {prob}"
-                for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values)
-            ],
-            sep="\n",
-        )
-
-    # Compute accuracy
-    predictions = np.argmax(output.logits.cpu(), axis=2).flatten().tolist()
-    labels = tokens.cpu().flatten()[1:].tolist()
-    print(f"\nAccuracy: {accuracy_score(labels, predictions)}")
-    # Results
-    ## [TP=1] HF 8B: 0.8308823529411765
-    ## [TP=2] HF 70B: 0.8860294117647058
-    ## [TP=1] HF -> Nanotron -> HF 8B: 0.8308823529411765
-    ## [TP=2] HF -> Nanotron -> HF 70B: 0.8860294117647058
-    ## [TP=1 --> TP=2] HF -> Nanotron -> Dummy Finetune to change TP=2 -> HF 8B: 0.8308823529411765
-
-
-if __name__ == "__main__":
-    _args = get_args()
-    main(_args)
diff --git a/tools/llama3/generate_nanotron_predictions.py b/tools/llama3/generate_nanotron_predictions.py
deleted file mode 100644
index fbede799..00000000
--- a/tools/llama3/generate_nanotron_predictions.py
+++ /dev/null
@@ -1,132 +0,0 @@
-"""
-torchrun --nproc-per-node 2 tools/llama3/generate_nanotron_predictions.py --tp 2 --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B
-"""
-import argparse
-import os
-from pathlib import Path
-
-import nanotron.distributed as dist
-import numpy as np
-import torch
-from nanotron.config import Config, ParallelismArgs, get_config_from_file
-from nanotron.models import build_model
-from nanotron.models.llama import LlamaForTraining
-from nanotron.parallel import ParallelContext
-from nanotron.parallel.parameters import sanity_check
-from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine
-from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode
-from nanotron.serialize import load_weights
-from nanotron.trainer import mark_tied_parameters
-from sklearn.metrics import accuracy_score
-from transformers import AutoTokenizer
-
-TXT = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello! Which is the capital of France? What can I visit over there if I go for a week vacation?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBonjour! The capital of France is Paris, also known as the City of Light. Paris is a stunning city with a rich history, art, fashion, and cuisine. If you're planning a week-long vacation in Paris, you'll have plenty of time to explore its iconic landmarks, museums, and neighborhoods. Here's a suggested itinerary to get you started: Day 1-2: Iconic Landmarks The Eiffel Tower (Tour Eiffel): The iron lady offers breathtaking views of the city. You can take the stairs or elevator to the top. The Louvre Museum (Musée du Louvre): Home to the Mona Lisa, Venus de Milo, and many other famous artworks. Arc de Triomphe: A monumental arch honoring the soldiers who fought and died for France. Champs-Élysées: A famous avenue lined with cafes, shops, and theaters. Day 3: Montmartre and Sacré-Cœur Explore the charming neighborhood of Montmartre, known for its bohemian vibe, street artists, and stunning views. Visit the Basilique du Sacré-Cœur, a beautiful white church perched on a hill."
-SEQ_LENGTH = 512  # For truncating the TXT if GPU can't fit too many tokens
-
-DEVICE = torch.device("cuda")
-TORCH_DTYPE = torch.bfloat16
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group(title="Nanotron Model")
-    group.add_argument(
-        "--nanotron-checkpoint-path",
-        type=str,
-        required=True,
-        help="A path to a directory containing a Nanotron Checkpoint",
-    )
-
-    group = parser.add_argument_group(title="Nanotron Parallelism")
-    group.add_argument("--tp", type=int, required=True, help="Tensor Parallelism Degree of the Nanotron Checkpoint")
-
-    args = parser.parse_args()
-
-    return args
-
-
-def main(args):
-    # Init Nanotron Parallel Utilities
-    parallel_config = ParallelismArgs(
-        dp=1,
-        pp=1,
-        tp=args.tp,
-        pp_engine=AllForwardAllBackwardPipelineEngine(),
-        tp_mode=TensorParallelLinearMode.ALL_REDUCE,
-        tp_linear_async_communication=False,
-    )
-    assert (
-        parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE
-        and parallel_config.tp_linear_async_communication is False
-    )
-
-    parallel_context = ParallelContext(
-        data_parallel_size=parallel_config.dp,
-        pipeline_parallel_size=parallel_config.pp,
-        tensor_parallel_size=parallel_config.tp,
-    )
-
-    RANK = dist.get_rank(parallel_context.world_pg)
-
-    nanotron_config = get_config_from_file(
-        os.path.join(args.nanotron_checkpoint_path, "config.yaml"), config_class=Config, model_config_class=None
-    )
-
-    model = build_model(
-        model_builder=lambda: LlamaForTraining(
-            config=nanotron_config.model.model_config,
-            parallel_context=parallel_context,
-            parallel_config=parallel_config,
-            random_states=None,
-        ),
-        parallel_context=parallel_context,
-        dtype=TORCH_DTYPE,
-        device=DEVICE,  # TODO Check with different parallelism if cpu is available
-    )
-
-    mark_tied_parameters(model=model, parallel_context=parallel_context)
-    sanity_check(root_module=model)
-
-    # Load checkpoint directly in memory and then only keep the state dictionary
-    load_weights(model=model, parallel_context=parallel_context, root_folder=Path(args.nanotron_checkpoint_path))
-
-    tokenizer = AutoTokenizer.from_pretrained(nanotron_config.tokenizer.tokenizer_name_or_path)
-    tokens = tokenizer(TXT, return_tensors="pt", truncation=True, max_length=(SEQ_LENGTH + 1))["input_ids"].to(DEVICE)
-    inputs = {"input_ids": tokens[:, :-1], "input_mask": torch.ones((1, SEQ_LENGTH), device=DEVICE)}
-
-    model.eval()
-
-    with torch.no_grad():
-        output = model.model(**inputs)
-
-    if not RANK:
-        predicted_tokens = [5, 27, 34]  # Index of the predictions to compare across models
-        term_cols = int(os.get_terminal_size().columns / 3)
-
-        for predicted_token in predicted_tokens:
-
-            print("\n", "=" * term_cols, f"Predictions of token {predicted_token}", "=" * term_cols)
-            next_tokens = torch.softmax(output.transpose(0, 1)[0, predicted_token, :], -1)
-            topk_next_tokens = torch.topk(next_tokens, 10)
-
-            print(
-                *[
-                    f"[Nanotron Model] Next token: {idx.item()}, probability: {prob}"
-                    for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values)
-                ],
-                sep="\n",
-            )
-
-        # Compute accuracy
-        predictions = np.argmax(output.transpose(0, 1).cpu(), axis=2).flatten().tolist()
-        labels = tokens.cpu().flatten()[1:].tolist()
-        print(f"\nAccuracy: {accuracy_score(labels, predictions)}")
-        # Results
-        ## Nanotron 8B, TP 1: 0.8272058823529411
-        ## Nanotron 8B, TP 2: 0.7720588235294118
-        ## Nanotron 70B, TP 2: 0.8272058823529411
-
-
-if __name__ == "__main__":
-    _args = get_args()
-    main(_args)
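
As a companion to the README addition above, here is a minimal end-to-end sketch that first downloads the HF checkpoint to a local folder and then points the conversion script at that folder rather than at the Hub model id. The local path `models/Meta-Llama-3-8B`, the single-GPU `torchrun` invocation, and running from the repository root are assumptions for illustration, not part of the patch.

```python
# Minimal sketch: download the HF checkpoint locally, then convert it to Nanotron.
# The paths below are assumptions; adjust them to your setup.
import subprocess

from huggingface_hub import snapshot_download

LOCAL_DIR = "models/Meta-Llama-3-8B"  # hypothetical download folder

# Download only the HF (safetensors) checkpoint, skipping the original Meta weights
snapshot_download(
    repo_id="meta-llama/Meta-Llama-3-8B",
    local_dir=LOCAL_DIR,
    ignore_patterns=["original/*"],
)

# Convert the locally downloaded checkpoint instead of the Hub model id
subprocess.run(
    [
        "torchrun", "--nproc-per-node", "1",
        "tools/llama3/convert_hf_to_nanotron.py",
        "--nanotron-checkpoint-path", "nanotron_checkpoints/Nanotron-Llama-3-8B",
        "--pretrained-model-name-or-path", LOCAL_DIR,
    ],
    check=True,
)
```

Pointing `--pretrained-model-name-or-path` at the downloaded folder avoids a second download into the HuggingFace cache, which is the behaviour the README addition recommends against.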