Introduce models directory

clebert · Oct 17, 2023 · 81c7816 · 81c7816
1 parent 5c3137a
commit 81c7816
Show file tree

Hide file tree

Showing 19 changed files with 51 additions and 22 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+*.bin filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,3 @@
+/models/llama2_*
 /zig-cache
 /zig-out
-/*.bin
diff --git a/README.md b/README.md
@@ -12,12 +12,6 @@ However, my goal is to continue porting the improvements and new features of And
 this codebase. At present, my Zig port produces the same output as the C version. I ensure this
 through the following linked [tests](./test.sh).
 
-## Usage
-
-```sh
-zig build -Doptimize=ReleaseFast run -- stories260K.bin -z tok512.bin -i "Once upon a time" --verbose
-```
-
 ## Papers
 
 - Standard transformer architecture: [Attention Is All You Need](https://arxiv.org/abs/1706.03762)

diff --git a/models/tinystories_110m/tinystories_110m_legacy.bin b/models/tinystories_110m/tinystories_110m_legacy.bin
diff --git a/models/tinystories_110m/tokenizer.bin b/models/tinystories_110m/tokenizer.bin
diff --git a/models/tinystories_15m/tinystories_15m_legacy.bin b/models/tinystories_15m/tinystories_15m_legacy.bin
diff --git a/models/tinystories_15m/tokenizer.bin b/models/tinystories_15m/tokenizer.bin
diff --git a/models/tinystories_260k/tinystories_260k_legacy.bin b/models/tinystories_260k/tinystories_260k_legacy.bin
diff --git a/models/tinystories_260k/tokenizer.bin b/models/tinystories_260k/tokenizer.bin
diff --git a/models/tinystories_42m/tinystories_42m_legacy.bin b/models/tinystories_42m/tinystories_42m_legacy.bin
diff --git a/models/tinystories_42m/tokenizer.bin b/models/tinystories_42m/tokenizer.bin
diff --git a/onecfg.json b/onecfg.json
@@ -32,7 +32,7 @@
     ],
     ".gitignore": [
       {
-        "value": ["/zig-cache", "/zig-out", "/*.bin"]
+        "value": ["/models/llama2_*", "/zig-cache", "/zig-out"]
       }
     ],
     ".prettierignore": [

diff --git a/src/checkpoint.zig b/src/checkpoint.zig
@@ -35,6 +35,10 @@ pub fn init(allocator: std.mem.Allocator, cli: *const CLI) !Self {
 
     defer file.close();
 
+    return try readLegacy(allocator, file);
+}
+
+pub fn readLegacy(allocator: std.mem.Allocator, file: std.fs.File) !Self {
     const embedding_size: usize = @intCast(try file.reader().readIntLittle(i32));
     const ffn_hidden_size: usize = @intCast(try file.reader().readIntLittle(i32));
     const n_layers: usize = @intCast(try file.reader().readIntLittle(i32));

diff --git a/src/generator.zig b/src/generator.zig
@@ -108,13 +108,13 @@ test "generate tiny story" {
     defer arg_iterator.deinit();
 
     const cli = CLI{
-        .checkpoint_path = "stories260K.bin",
+        .checkpoint_path = "models/tinystories_260k/tinystories_260k_legacy.bin",
         .temperature = 1,
         .top_p = 0.9,
         .random_seed = 42,
         .n_steps = 10,
         .prompt = "There was",
-        .tokenizer_path = "tok512.bin",
+        .tokenizer_path = "models/tinystories_260k/tokenizer.bin",
         .chat = false,
         .system_prompt = "",
         .verbose = false,

diff --git a/src/tokenizer.zig b/src/tokenizer.zig
@@ -211,10 +211,13 @@ fn lessThan(context: void, lhs: VocabEntry, rhs: VocabEntry) bool {
     return std.mem.lessThan(u8, lhs.word, rhs.word);
 }
 
+const tokenizer_32k_path = "models/tinystories_15m/tokenizer.bin";
+const tokenizer_512_path = "models/tinystories_260k/tokenizer.bin";
+
 // https://github.com/karpathy/llama2.c/pull/226
 // https://github.com/karpathy/llama2.c/pull/297
 test "encode utf-8" {
-    const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
+    const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);
 
     defer tokenizer.deinit();
 
@@ -227,7 +230,7 @@ test "encode utf-8" {
 }
 
 test "encode empty string" {
-    const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
+    const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);
 
     defer tokenizer.deinit();
 
@@ -240,7 +243,7 @@ test "encode empty string" {
 }
 
 test "encode unknown codepoint" {
-    const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
+    const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);
 
     defer tokenizer.deinit();
 
@@ -253,7 +256,7 @@ test "encode unknown codepoint" {
 }
 
 test "encode single chars" {
-    const tokenizer = try Self.init(std.testing.allocator, "tok512.bin", 512);
+    const tokenizer = try Self.init(std.testing.allocator, tokenizer_512_path, 512);
 
     defer tokenizer.deinit();
 
@@ -267,7 +270,7 @@ test "encode single chars" {
 
 // https://github.com/facebookresearch/llama/blob/ea9f33d6d3ea8ed7d560d270986407fd6c2e52b7/example_text_completion.py
 test "meta encoding example 1" {
-    const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
+    const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);
 
     defer tokenizer.deinit();
 
@@ -280,7 +283,7 @@ test "meta encoding example 1" {
 }
 
 test "meta encoding example 2" {
-    const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
+    const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);
 
     defer tokenizer.deinit();
 
@@ -297,7 +300,7 @@ test "meta encoding example 2" {
 }
 
 test "meta encoding example 3" {
-    const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
+    const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);
 
     defer tokenizer.deinit();
 
@@ -314,7 +317,7 @@ test "meta encoding example 3" {
 }
 
 test "meta encoding example 4" {
-    const tokenizer = try Self.init(std.testing.allocator, "tokenizer.bin", 32000);
+    const tokenizer = try Self.init(std.testing.allocator, tokenizer_32k_path, 32000);
 
     defer tokenizer.deinit();
 

diff --git a/stories260K.bin b/stories260K.bin
diff --git a/test.sh b/test.sh
@@ -7,7 +7,10 @@ set -o noclobber # Prevents from overwriting existing files
 
 zig build
 
-actual_output=$(./zig-out/bin/llama2 stories260K.bin -z tok512.bin -t 0 -n 200)
+checkpoint_path="models/tinystories_260k/tinystories_260k_legacy.bin"
+tokenizer_path="models/tinystories_260k/tokenizer.bin"
+
+actual_output=$(./zig-out/bin/llama2 $checkpoint_path -z $tokenizer_path -t 0 -n 200)
 
 # Generated with llama2.c (https://github.com/karpathy/llama2.c/tree/7ac65cb2c2b169050747be92011b7bebdd1b4544)
 expected_output="Once upon a time, there was a little girl named Lily. She loved to play outside in the park. One day, she saw a big, red ball. She wanted to play with it, but it was too high.
@@ -20,7 +23,7 @@ if [ "$actual_output" != "$expected_output" ]; then
     exit 1
 fi
 
-actual_output=$(./zig-out/bin/llama2 stories260K.bin -z tok512.bin -t 1 -p 1 -s 42 -n 200)
+actual_output=$(./zig-out/bin/llama2 $checkpoint_path -z $tokenizer_path -t 1 -p 1 -s 42 -n 200)
 
 # Generated with llama2.c (https://github.com/karpathy/llama2.c/tree/7ac65cb2c2b169050747be92011b7bebdd1b4544)
 expected_output="Once upon a time, there was a big roof. The fox was ready to look for people inside. He saw a big rock near a big tree. The roof was very small and fun! He ate the roof too. He got a shiny stool, so he sicked the roof with his friend, the girl named Mia.
@@ -33,7 +36,7 @@ if [ "$actual_output" != "$expected_output" ]; then
     exit 1
 fi
 
-actual_output=$(./zig-out/bin/llama2 stories260K.bin -z tok512.bin -t 1 -p 0.95 -s 42 -n 200)
+actual_output=$(./zig-out/bin/llama2 $checkpoint_path -z $tokenizer_path -t 1 -p 0.95 -s 42 -n 200)
 
 # Generated with llama2.c (https://github.com/karpathy/llama2.c/tree/7ac65cb2c2b169050747be92011b7bebdd1b4544)
 expected_output="Once upon a time, there was a little boy named Timmy. Timmy loved going to the park with his mom. One day, Lily went outside to play outside in her pocket. He was scared and didn't know where to buy some colorful animals.
@@ -44,7 +47,7 @@ if [ "$actual_output" != "$expected_output" ]; then
     exit 1
 fi
 
-actual_output=$(./zig-out/bin/llama2 stories260K.bin -z tok512.bin -t 1 -p 0.95 -s 42 -n 200 -i "There was a big")
+actual_output=$(./zig-out/bin/llama2 $checkpoint_path -z $tokenizer_path -t 1 -p 0.95 -s 42 -n 200 -i "There was a big")
 
 # Generated with llama2.c (https://github.com/karpathy/llama2.c/tree/7ac65cb2c2b169050747be92011b7bebdd1b4544)
 expected_output="There was a big pretty grass. It was a long elephant. The cars wanted to tell him that as they spin before the amazing doll, just like it she was always okay.

diff --git a/tok512.bin b/tok512.bin
diff --git a/tokenizer.bin b/tokenizer.bin