Add converter for Hugging Face Llama 2 models
clebert committed Oct 18, 2023
1 parent 0112f2e commit 39c45bb
Showing 9 changed files with 222 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .vscode/extensions.json
@@ -2,6 +2,8 @@
"recommendations": [
"editorconfig.editorconfig",
"esbenp.prettier-vscode",
"ms-python.black-formatter",
"ms-python.python",
"ziglang.vscode-zig"
]
}
4 changes: 4 additions & 0 deletions .vscode/settings.json
@@ -14,6 +14,9 @@
"[json]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
},
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
"[typescript]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
},
@@ -50,5 +53,6 @@
"url": "https://raw.githubusercontent.com/clebert/onecfg-rust/main/schema.json"
}
],
"python.formatting.provider": "none",
"zig.zls.enableInlayHints": false
}
20 changes: 20 additions & 0 deletions README.md
@@ -18,6 +18,26 @@ through the following linked [tests](./test.sh).
zig build -Doptimize=ReleaseFast run-generator -- models/tinystories_15m --temperature 0 --verbose
```

## Run Llama 2 from Hugging Face

Download the official weights from Hugging Face (the repository is gated, so you may first need to accept Meta's license terms on the model page):

```sh
# Make sure you have git-lfs installed (https://git-lfs.com)
git lfs install
git clone https://huggingface.co/meta-llama/Llama-2-7b-hf
```

Install the Python dependencies of the converter:

```sh
pip3 install -r requirements.txt
```

Convert the Hugging Face model into the checkpoint and tokenizer format used by this project:

```sh
python3 convert_hf_model.py /path/to/Llama-2-7b-hf models/llama2_7b_hf
```

Run the generator on the converted model:

```sh
zig build -Doptimize=ReleaseFast run-generator -- models/llama2_7b_hf --prompt "Once Upon a Time"
```

## Help

### llama2-generator
184 changes: 184 additions & 0 deletions convert_hf_model.py
@@ -0,0 +1,184 @@
import argparse
import os
import struct
import torch
from transformers import AutoModelForCausalLM
from sentencepiece import SentencePieceProcessor


def serialize_f32(file, tensor):
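# Flatten to a contiguous f32 vector on the CPU and write the raw floats
# (struct.pack uses native byte order here: little-endian on typical hosts).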
tensor_f32 = tensor.detach().cpu().view(-1).to(torch.float32).numpy()

file.write(struct.pack(f"{len(tensor_f32)}f", *tensor_f32))


def write_checkpoint_file():
hf_model = AutoModelForCausalLM.from_pretrained(args.input_model_path)

if hf_model.config.model_type != "llama":
parser.error("Expected llama model")

if hf_model.config.rope_theta != 10000:
parser.error("Expected a RoPE frequency base of 10000")

hf_state_dict = hf_model.state_dict()
token_embedding_vectors = hf_state_dict["model.embed_tokens.weight"]
output_matrix = hf_state_dict["lm_head.weight"]

embedding_size = hf_model.config.hidden_size
ffn_hidden_size = hf_model.config.intermediate_size
n_layers = hf_model.config.num_hidden_layers
n_attention_heads = hf_model.config.num_attention_heads
n_attention_query_groups = hf_model.config.num_key_value_heads
vocab_size = hf_model.config.vocab_size
max_sequence_length = hf_model.config.max_position_embeddings
shared_output_matrix = torch.equal(token_embedding_vectors, output_matrix)

os.makedirs(args.output_model_path, exist_ok=True)

output_file = open(os.path.join(args.output_model_path, "checkpoint_v1.bin"), "wb")

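# Header: magic "ak42" (0x616B3432) and format version 1, following the
# llama2.c v1 checkpoint layout.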
output_file.write(struct.pack("I", 0x616B3432))
output_file.write(struct.pack("i", 1))

output_file.write(
struct.pack(
"iiiiiii",
embedding_size,
ffn_hidden_size,
n_layers,
n_attention_heads,
n_attention_query_groups,
vocab_size,
max_sequence_length,
)
)

output_file.write(struct.pack("B", int(shared_output_matrix)))
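# Pad the header to a fixed size of 256 bytes so the tensor data starts
# at a known offset.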
output_file.write(b"\0" * (256 - output_file.tell()))

# attention_norm_vectors
for layer in range(n_layers):
serialize_f32(
output_file, hf_state_dict[f"model.layers.{layer}.input_layernorm.weight"]
)

# ffn_norm_vectors
for layer in range(n_layers):
serialize_f32(
output_file,
hf_state_dict[f"model.layers.{layer}.post_attention_layernorm.weight"],
)

# output_norm_vector
serialize_f32(output_file, hf_state_dict["model.norm.weight"])

serialize_f32(output_file, token_embedding_vectors)

# https://github.com/huggingface/transformers/blob/5c081e29930466ecf9a478727039d980131076d9/src/transformers/models/llama/convert_llama_weights_to_hf.py#L122C28-L122C35
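# Hugging Face stores the query/key projection weights permuted for its
# rotary embedding implementation; this reverses that permutation to
# recover the original interleaved layout.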
def unpermute(tensor):
return (
tensor.view(
n_attention_heads,
2,
embedding_size // n_attention_heads // 2,
embedding_size,
)
.transpose(1, 2)
.reshape(embedding_size, embedding_size)
)

# attention_query_matrices
for layer in range(n_layers):
serialize_f32(
output_file,
unpermute(hf_state_dict[f"model.layers.{layer}.self_attn.q_proj.weight"]),
)

# attention_key_matrices
for layer in range(n_layers):
serialize_f32(
output_file,
unpermute(hf_state_dict[f"model.layers.{layer}.self_attn.k_proj.weight"]),
)

# attention_value_matrices
for layer in range(n_layers):
serialize_f32(
output_file, hf_state_dict[f"model.layers.{layer}.self_attn.v_proj.weight"]
)

# attention_output_matrices
for layer in range(n_layers):
serialize_f32(
output_file, hf_state_dict[f"model.layers.{layer}.self_attn.o_proj.weight"]
)

# ffn_gate_matrices
for layer in range(n_layers):
serialize_f32(
output_file, hf_state_dict[f"model.layers.{layer}.mlp.gate_proj.weight"]
)

# ffn_down_matrices
for layer in range(n_layers):
serialize_f32(
output_file, hf_state_dict[f"model.layers.{layer}.mlp.down_proj.weight"]
)

# ffn_up_matrices
for layer in range(n_layers):
serialize_f32(
output_file, hf_state_dict[f"model.layers.{layer}.mlp.up_proj.weight"]
)

if not shared_output_matrix:
serialize_f32(output_file, output_matrix)

output_file.close()


def write_tokenizer_file():
sp_model = SentencePieceProcessor(
model_file=os.path.join(args.input_model_path, "tokenizer.model")
)

words, scores = [], []

for token in range(sp_model.vocab_size()):
word = sp_model.id_to_piece(token)
score = sp_model.get_score(token)

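# Replace the BOS/EOS pieces with sentinel strings, following the same
# convention as llama2.c's tokenizer export.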
if token == sp_model.bos_id():
word = "\n<s>\n"
elif token == sp_model.eos_id():
word = "\n</s>\n"

words.append(word.replace("▁", " ").encode("utf-8"))
scores.append(score)

max_word_length = max(len(word) for word in words)

os.makedirs(args.output_model_path, exist_ok=True)

output_file = open(os.path.join(args.output_model_path, "tokenizer.bin"), "wb")

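# tokenizer.bin layout: max word length (u32), then one record per token:
# score (f32), word byte length (u32), and the raw word bytes.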
output_file.write(struct.pack("I", max_word_length))

for word, score in zip(words, scores):
output_file.write(struct.pack("fI", score, len(word)))
output_file.write(word)

output_file.close()


if __name__ == "__main__":
parser = argparse.ArgumentParser()

parser.add_argument("input_model_path", type=str, help="the input model")
parser.add_argument("output_model_path", type=str, help="the output model")

args = parser.parse_args()

write_checkpoint_file()
write_tokenizer_file()
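
As a quick sanity check, the checkpoint header can be read back with a few lines of Python. This is a minimal sketch assuming the 256-byte header layout written by `write_checkpoint_file` above; the model path is a placeholder:

```python
import struct

# Placeholder path from the conversion step above.
with open("models/llama2_7b_hf/checkpoint_v1.bin", "rb") as f:
    # Magic "ak42" (u32) and format version (i32).
    magic, version = struct.unpack("Ii", f.read(8))
    assert magic == 0x616B3432 and version == 1

    # Seven i32 config fields, in the order written by the converter.
    config = struct.unpack("iiiiiii", f.read(28))
    (shared_output_matrix,) = struct.unpack("B", f.read(1))

    print("config:", config, "shared output matrix:", bool(shared_output_matrix))
```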
Binary file modified models/tinystories_15m/checkpoint_v1.bin
Binary file not shown.
Binary file modified models/tinystories_260k/checkpoint_v1.bin
Binary file not shown.
6 changes: 5 additions & 1 deletion onecfg.json
@@ -48,13 +48,17 @@
],
".vscode/extensions.json": [
{
"value": {"recommendations": ["ziglang.vscode-zig"]}
"value": {
"recommendations": ["ms-python.black-formatter", "ms-python.python", "ziglang.vscode-zig"]
}
}
],
".vscode/settings.json": [
{
"value": {
"files.exclude": {".github": true, ".github/workflows/ci.yml": true},
"[python]": {"editor.defaultFormatter": "ms-python.black-formatter"},
"python.formatting.provider": "none",
"[zig]": {"editor.defaultFormatter": "ziglang.vscode-zig"},
"zig.zls.enableInlayHints": false
}
4 changes: 4 additions & 0 deletions requirements.txt
@@ -0,0 +1,4 @@
numpy==1.26.1
sentencepiece==0.1.99
torch==2.1.0
transformers==4.34.0
6 changes: 3 additions & 3 deletions src/checkpoint.zig
@@ -72,7 +72,7 @@ pub fn writeV1(self: *const Self, allocator: std.mem.Allocator, model_path: []co

defer file.close();

- try file.writer().writeAll("ak42");
+ try file.writer().writeIntLittle(u32, 0x616b3432);
try file.writer().writeIntLittle(i32, 1);
try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.embedding_size)));
try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.ffn_hidden_size)));
@@ -102,9 +102,9 @@ pub fn writeV1(self: *const Self, allocator: std.mem.Allocator, model_path: []co

// https://github.com/karpathy/llama2.c/blob/d9862069e7ef665fe6309e3c17398ded2f121bf5/export.py#L132
fn readV1(allocator: std.mem.Allocator, file: std.fs.File) !Self {
- const magic: [*]const u8 = @ptrCast(&try file.reader().readIntLittle(u32));
+ const magic = try file.reader().readIntLittle(u32);

- if (!std.mem.eql(u8, magic[0..4], "ak42")) {
+ if (magic != 0x616b3432) {
return error.InvalidMagic;
}
