From 3e169c5afc80b3494c1065bd4ef3079dc2b657de Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 22 May 2024 13:44:01 +0000 Subject: [PATCH] Added README --- tools/llama3/README.md | 19 +++++++++++++++++++ tools/llama3/convert_hf_to_nanotron.py | 4 ++-- tools/llama3/convert_nanotron_to_hf.py | 2 +- 3 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 tools/llama3/README.md diff --git a/tools/llama3/README.md b/tools/llama3/README.md new file mode 100644 index 00000000..57a31b5e --- /dev/null +++ b/tools/llama3/README.md @@ -0,0 +1,19 @@ +# Llama3 Weight conversion tool +This directory contains the scripts to convert the Llama3 checkpoints from HuggingFace to Nanotron and vice versa. + +- Convert from HuggingFace to Nanotron + +`torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron_checkpoints/Nanotron-Llama-3-8B --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct` +- Convert from Nanotron to HuggingFace + +`torchrun --nproc-per-node 1 tools/llama3/convert_nanotron_to_hf.py --nanotron-checkpoint-path nanotron_checkpoints/Nanotron-Llama-3-8B --hugging-face-checkpoint-path hf_checkpoints/Converted-Nanotron-Llama-3-8B` + +In summary, we will do the following: +- Initialize the HuggingFace model with the pretrained weights. The model definition is [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py). +- Initialize a Nanotron model with empty weights. The model definition is [here](https://github.com/huggingface/nanotron/blob/main/src/nanotron/models/llama.py). +- Copy the parameters layer by layer from one model to the other. +- Store the Nanotron model along with the tokenizer. + +When comparing the HuggingFace implementation with the Nanotron implementation, the main difference lies in the Q, K & V matrices and in the MLP projections. 
In the HuggingFace implementation, these matrices are separated [[1]](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L415), [[2]](https://github.com/huggingface/transformers/blob/1518508467d96b3866fc4ebcb7a5b3a2e0df2aa4/src/transformers/models/llama/modeling_llama.py#L194), while in the Nanotron implementation, they are concatenated [[1b]](https://github.com/huggingface/nanotron/blob/b69690703a1c41b60cd706f92a80a3d23ebaf2d0/src/nanotron/models/llama.py#L310), [[2b]](https://github.com/huggingface/nanotron/blob/b69690703a1c41b60cd706f92a80a3d23ebaf2d0/src/nanotron/models/llama.py#L149). It is crucial to pay attention to these details to convert the models correctly. + +To perform the conversion, we will need at least **1 GPU**, although the operations will be carried out on the **CPU**. We will convert the models with a parallel configuration of DP = PP = TP = 1, but it should be noted that the checkpoints generated by Nanotron are topology-agnostic. 
diff --git a/tools/llama3/convert_hf_to_nanotron.py b/tools/llama3/convert_hf_to_nanotron.py index 0032bf9a..e30610a3 100644 --- a/tools/llama3/convert_hf_to_nanotron.py +++ b/tools/llama3/convert_hf_to_nanotron.py @@ -1,5 +1,5 @@ """ -torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct +torchrun --nproc-per-node 1 tools/llama3/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron_checkpoints/Nanotron-Llama-3-8B --pretrained-model-name-or-path meta-llama/Meta-Llama-3-8B-Instruct """ import argparse import json @@ -238,7 +238,7 @@ def main(args): # Store Config and Model Config files with open(nanotron_checkpoint_path / "config.yaml", "w") as f: config = Config( - general=GeneralArgs(project="conversion", run="Llama3-8B"), + general=GeneralArgs(project="Nanotron", run="Llama3"), parallelism=parallel_config, model=ModelArgs( init_method=ExistingCheckpointInit(nanotron_checkpoint_path), diff --git a/tools/llama3/convert_nanotron_to_hf.py b/tools/llama3/convert_nanotron_to_hf.py index 0254ed4a..c5fb1940 100644 --- a/tools/llama3/convert_nanotron_to_hf.py +++ b/tools/llama3/convert_nanotron_to_hf.py @@ -1,5 +1,5 @@ """ -torchrun --nproc-per-node 1 tools/llama3/convert_nanotron_to_hf.py --nanotron-checkpoint-path nanotron_checkpoints/NanotronLlama38B --hugging-face-checkpoint-path hf_checkpoints/ConvertedNanotronLlama38B +torchrun --nproc-per-node 1 tools/llama3/convert_nanotron_to_hf.py --nanotron-checkpoint-path nanotron_checkpoints/Nanotron-Llama-3-8B --hugging-face-checkpoint-path hf_checkpoints/Converted-Nanotron-Llama-3-8B """ import argparse import os