Skip to content

Commit

Permalink
Introduce models directory
Browse files Browse the repository at this point in the history
  • Loading branch information
clebert committed Oct 17, 2023
1 parent 5c3137a commit 81c7816
Show file tree
Hide file tree
Showing 19 changed files with 51 additions and 22 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.bin filter=lfs diff=lfs merge=lfs -text
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
/models/llama2_*
/zig-cache
/zig-out
/*.bin
6 changes: 0 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,6 @@ However, my goal is to continue porting the improvements and new features of And
this codebase. At present, my Zig port produces the same output as the C version. I ensure this
through the following linked [tests](./test.sh).

## Usage

```sh
zig build -Doptimize=ReleaseFast run -- stories260K.bin -z tok512.bin -i "Once upon a time" --verbose
```

## Papers

- Standard transformer architecture: [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
Expand Down
3 changes: 3 additions & 0 deletions models/tinystories_110m/tinystories_110m_legacy.bin
Git LFS file not shown
3 changes: 3 additions & 0 deletions models/tinystories_110m/tokenizer.bin
Git LFS file not shown
3 changes: 3 additions & 0 deletions models/tinystories_15m/tinystories_15m_legacy.bin
Git LFS file not shown
3 changes: 3 additions & 0 deletions models/tinystories_15m/tokenizer.bin
Git LFS file not shown
3 changes: 3 additions & 0 deletions models/tinystories_260k/tinystories_260k_legacy.bin
Git LFS file not shown
3 changes: 3 additions & 0 deletions models/tinystories_260k/tokenizer.bin
Git LFS file not shown
3 changes: 3 additions & 0 deletions models/tinystories_42m/tinystories_42m_legacy.bin
Git LFS file not shown
3 changes: 3 additions & 0 deletions models/tinystories_42m/tokenizer.bin
Git LFS file not shown
2 changes: 1 addition & 1 deletion onecfg.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
],
".gitignore": [
{
"value": ["/zig-cache", "/zig-out", "/*.bin"]
"value": ["/models/llama2_*", "/zig-cache", "/zig-out"]
}
],
".prettierignore": [
Expand Down
4 changes: 4 additions & 0 deletions src/checkpoint.zig
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ pub fn init(allocator: std.mem.Allocator, cli: *const CLI) !Self {

defer file.close();

return try readLegacy(allocator, file);
}

pub fn readLegacy(allocator: std.mem.Allocator, file: std.fs.File) !Self {
const embedding_size: usize = @intCast(try file.reader().readIntLittle(i32));
const ffn_hidden_size: usize = @intCast(try file.reader().readIntLittle(i32));
const n_layers: usize = @intCast(try file.reader().readIntLittle(i32));
Expand Down
4 changes: 2 additions & 2 deletions src/generator.zig
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,13 @@ test "generate tiny story" {
defer arg_iterator.deinit();

const cli = CLI{
.checkpoint_path = "stories260K.bin",
.checkpoint_path = "models/tinystories_260k/tinystories_260k_legacy.bin",
.temperature = 1,
.top_p = 0.9,
.random_seed = 42,
.n_steps = 10,
.prompt = "There was",
.tokenizer_path = "tok512.bin",
.tokenizer_path = "models/tinystories_260k/tokenizer.bin",
.chat = false,
.system_prompt = "",
.verbose = false,
Expand Down
19 changes: 11 additions & 8 deletions src/tokenizer.zig
Original file line number Diff line number Diff line change
Expand Up @@ -211,10 +211,13 @@ fn lessThan(context: void, lhs: VocabEntry, rhs: VocabEntry) bool {
return std.mem.lessThan(u8, lhs.word, rhs.word);
}

const tokenizer_32k_path = "models/tinystories_15m/tokenizer.bin";
const tokenizer_512_path = "models/tinystories_260k/tokenizer.bin";

// https://github.com/karpathy/llama2.c/pull/226
// https://github.com/karpathy/llama2.c/pull/297
test "encode utf-8" {
const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);

defer tokenizer.deinit();

Expand All @@ -227,7 +230,7 @@ test "encode utf-8" {
}

test "encode empty string" {
const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);

defer tokenizer.deinit();

Expand All @@ -240,7 +243,7 @@ test "encode empty string" {
}

test "encode unknown codepoint" {
const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);

defer tokenizer.deinit();

Expand All @@ -253,7 +256,7 @@ test "encode unknown codepoint" {
}

test "encode single chars" {
const tokenizer = try Self.init(std.testing.allocator, "tok512.bin", 512);
const tokenizer = try Self.init(std.testing.allocator, tokenizer_512_path, 512);

defer tokenizer.deinit();

Expand All @@ -267,7 +270,7 @@ test "encode single chars" {

// https://github.com/facebookresearch/llama/blob/ea9f33d6d3ea8ed7d560d270986407fd6c2e52b7/example_text_completion.py
test "meta encoding example 1" {
const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);

defer tokenizer.deinit();

Expand All @@ -280,7 +283,7 @@ test "meta encoding example 1" {
}

test "meta encoding example 2" {
const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);

defer tokenizer.deinit();

Expand All @@ -297,7 +300,7 @@ test "meta encoding example 2" {
}

test "meta encoding example 3" {
const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);

defer tokenizer.deinit();

Expand All @@ -314,7 +317,7 @@ test "meta encoding example 3" {
}

test "meta encoding example 4" {
const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);

defer tokenizer.deinit();

Expand Down
Binary file removed stories260K.bin
Binary file not shown.
11 changes: 7 additions & 4 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ set -o noclobber # Prevents from overwriting existing files

zig build

actual_output=$(./zig-out/bin/llama2 stories260K.bin -z tok512.bin -t 0 -n 200)
checkpoint_path="models/tinystories_260k/tinystories_260k_legacy.bin"
tokenizer_path="models/tinystories_260k/tokenizer.bin"

actual_output=$(./zig-out/bin/llama2 $checkpoint_path -z $tokenizer_path -t 0 -n 200)

# Generated with llama2.c (https://github.com/karpathy/llama2.c/tree/7ac65cb2c2b169050747be92011b7bebdd1b4544)
expected_output="Once upon a time, there was a little girl named Lily. She loved to play outside in the park. One day, she saw a big, red ball. She wanted to play with it, but it was too high.
Expand All @@ -20,7 +23,7 @@ if [ "$actual_output" != "$expected_output" ]; then
exit 1
fi

actual_output=$(./zig-out/bin/llama2 stories260K.bin -z tok512.bin -t 1 -p 1 -s 42 -n 200)
actual_output=$(./zig-out/bin/llama2 $checkpoint_path -z $tokenizer_path -t 1 -p 1 -s 42 -n 200)

# Generated with llama2.c (https://github.com/karpathy/llama2.c/tree/7ac65cb2c2b169050747be92011b7bebdd1b4544)
expected_output="Once upon a time, there was a big roof. The fox was ready to look for people inside. He saw a big rock near a big tree. The roof was very small and fun! He ate the roof too. He got a shiny stool, so he sicked the roof with his friend, the girl named Mia.
Expand All @@ -33,7 +36,7 @@ if [ "$actual_output" != "$expected_output" ]; then
exit 1
fi

actual_output=$(./zig-out/bin/llama2 stories260K.bin -z tok512.bin -t 1 -p 0.95 -s 42 -n 200)
actual_output=$(./zig-out/bin/llama2 $checkpoint_path -z $tokenizer_path -t 1 -p 0.95 -s 42 -n 200)

# Generated with llama2.c (https://github.com/karpathy/llama2.c/tree/7ac65cb2c2b169050747be92011b7bebdd1b4544)
expected_output="Once upon a time, there was a little boy named Timmy. Timmy loved going to the park with his mom. One day, Lily went outside to play outside in her pocket. He was scared and didn't know where to buy some colorful animals.
Expand All @@ -44,7 +47,7 @@ if [ "$actual_output" != "$expected_output" ]; then
exit 1
fi

actual_output=$(./zig-out/bin/llama2 stories260K.bin -z tok512.bin -t 1 -p 0.95 -s 42 -n 200 -i "There was a big")
actual_output=$(./zig-out/bin/llama2 $checkpoint_path -z $tokenizer_path -t 1 -p 0.95 -s 42 -n 200 -i "There was a big")

# Generated with llama2.c (https://github.com/karpathy/llama2.c/tree/7ac65cb2c2b169050747be92011b7bebdd1b4544)
expected_output="There was a big pretty grass. It was a long elephant. The cars wanted to tell him that as they spin before the amazing doll, just like it she was always okay.
Expand Down
Binary file removed tok512.bin
Binary file not shown.
Binary file removed tokenizer.bin
Binary file not shown.

0 comments on commit 81c7816

Please sign in to comment.