diff --git a/README.md b/README.md
index 77c8612..0c4664f 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This project is a port of Andrej Karpathy's [llama2.c](https://github.com/karpat
## Usage
-Build and run the `llama2-generator` for text generation:
+Build and run `llama2-generator`:
```sh
zig build -Doptimize=ReleaseFast
@@ -23,10 +23,10 @@ Lily wanted to play with the ball, but it was too high up in the sky. She tried
Lily found a stick and tried to hit the ball. But the stick was too short. She tried again and again, but she couldn't reach it. She felt sad.
Suddenly, a kind man came by and saw Lily. He asked her what was wrong. Lily told him about the ball. The man smiled and said, "I have a useful idea!" He took out a long stick and used it to knock the ball down. Lily was so happy! She thanked the man and they played together in the sunshine.
-achieved: 726.974 tok/s
+achieved: 719.870 tok/s
```
-## Run Llama 2 from Hugging Face
+## Run Llama 2 7B from Hugging Face
Install `git-lfs` and clone the [Llama 2 7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) model from Hugging Face:
@@ -43,7 +43,7 @@ pip3 install -r requirements.txt
python3 convert_hf_model.py /path/to/Llama-2-7b-hf models/llama2_7b_hf
```
-Build and run the `llama2-generator` for text generation:
+Build and run `llama2-generator`:
```sh
zig build -Doptimize=ReleaseFast
@@ -55,7 +55,40 @@ The output on an Apple M1 Pro with 32 GB of memory:
```
Once Upon a Time in Hollywood is a 2019 American comedy-drama film written and directed by Quentin Tarantino
-achieved: 1.821 tok/s
+achieved: 1.800 tok/s
+```
+
+## Run Llama 2 7B Chat from Hugging Face
+
+Install `git-lfs` and clone the [Llama 2 7B Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) model from Hugging Face:
+
+```sh
+# Make sure you have git-lfs installed (https://git-lfs.com)
+git lfs install
+git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+```
+
+Install the necessary Python packages and convert the Hugging Face model:
+
+```sh
+pip3 install -r requirements.txt
+python3 convert_hf_model.py /path/to/Llama-2-7b-chat-hf models/llama2_7b_chat_hf
+```
+
+Build and run `llama2-chat`:
+
+```sh
+zig build -Doptimize=ReleaseFast
+./zig-out/bin/llama2-chat models/llama2_7b_chat_hf
+```
+
+The output on an Apple M1 Pro with 32 GB of memory:
+
+```
+Enter system prompt (optional):
+User: Hello
+Assistant: Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?
+User: ...
```
## Help
diff --git a/build.zig b/build.zig
index c04fcbc..ad8280f 100644
--- a/build.zig
+++ b/build.zig
@@ -4,13 +4,6 @@ pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});
- const chat_exe = b.addExecutable(.{
- .name = "llama2-chat",
- .root_source_file = .{ .path = "src/chat_main.zig" },
- .target = target,
- .optimize = optimize,
- });
-
const generator_exe = b.addExecutable(.{
.name = "llama2-generator",
.root_source_file = .{ .path = "src/generator_main.zig" },
@@ -18,63 +11,54 @@ pub fn build(b: *std.Build) void {
.optimize = optimize,
});
- const converter_exe = b.addExecutable(.{
- .name = "llama2-converter",
- .root_source_file = .{ .path = "src/converter_main.zig" },
+ const chat_exe = b.addExecutable(.{
+ .name = "llama2-chat",
+ .root_source_file = .{ .path = "src/chat_main.zig" },
.target = target,
.optimize = optimize,
});
const build_options = b.addOptions();
- chat_exe.addOptions("build_options", build_options);
generator_exe.addOptions("build_options", build_options);
- converter_exe.addOptions("build_options", build_options);
+ chat_exe.addOptions("build_options", build_options);
// This declares intent for the executable to be installed into the
// standard location when the user invokes the "install" step (the default
// step when running `zig build`).
- b.installArtifact(chat_exe);
b.installArtifact(generator_exe);
- b.installArtifact(converter_exe);
+ b.installArtifact(chat_exe);
// This *creates* a Run step in the build graph, to be executed when another
// step is evaluated that depends on it. The next line below will establish
// such a dependency.
- const run_chat_cmd = b.addRunArtifact(chat_exe);
const run_generator_cmd = b.addRunArtifact(generator_exe);
- const run_converter_cmd = b.addRunArtifact(converter_exe);
+ const run_chat_cmd = b.addRunArtifact(chat_exe);
// By making the run step depend on the install step, it will be run from the
// installation directory rather than directly from within the cache directory.
// This is not necessary, however, if the application depends on other installed
// files, this ensures they will be present and in the expected location.
- run_chat_cmd.step.dependOn(b.getInstallStep());
run_generator_cmd.step.dependOn(b.getInstallStep());
- run_converter_cmd.step.dependOn(b.getInstallStep());
+ run_chat_cmd.step.dependOn(b.getInstallStep());
// This allows the user to pass arguments to the application in the build
// command itself, like this: `zig build run -- arg1 arg2 etc`
if (b.args) |args| {
- run_chat_cmd.addArgs(args);
run_generator_cmd.addArgs(args);
- run_converter_cmd.addArgs(args);
+ run_chat_cmd.addArgs(args);
}
// This creates a build step. It will be visible in the `zig build --help` menu,
// and can be selected like this: `zig build run`
// This will evaluate the `run` step rather than the default, which is "install".
- const run_chat_step = b.step("run-chat", "Run the chat");
-
- run_chat_step.dependOn(&run_chat_cmd.step);
-
const run_generator_step = b.step("run-generator", "Run the generator");
run_generator_step.dependOn(&run_generator_cmd.step);
- const run_converter_step = b.step("run-converter", "Run the converter");
+ const run_chat_step = b.step("run-chat", "Run the chat");
- run_converter_step.dependOn(&run_converter_cmd.step);
+ run_chat_step.dependOn(&run_chat_cmd.step);
const test_step = b.step("test", "Run unit tests");
diff --git a/src/attention.zig b/src/attention.zig
index 43c7df3..2065bbe 100644
--- a/src/attention.zig
+++ b/src/attention.zig
@@ -2,111 +2,91 @@ const Self = @This();
const std = @import("std");
const Checkpoint = @import("checkpoint.zig");
-const math = @import("./math.zig");
-const Tensor = @import("./tensor.zig").Tensor;
+const math = @import("math.zig");
+const simd = @import("simd.zig");
+const Vector = @import("vector.zig");
-allocator: std.mem.Allocator,
checkpoint: Checkpoint,
head_size: usize,
head_size_sqrt: f32,
-input_buffer: Tensor(2),
-output_buffer: Tensor(1),
-query_buffer: Tensor(2),
-key_cache: Tensor(4),
-value_cache: Tensor(4),
+input: Vector,
+output: Vector,
+multi_query: Vector,
+key_cache: []const []const Vector,
+value_cache: []const []const Vector,
scores: []f32,
-pub fn init(allocator: std.mem.Allocator, checkpoint: Checkpoint, sequence_length: usize) !Self {
- const embedding_size = checkpoint.embedding_size;
- const n_attention_heads = checkpoint.n_attention_heads;
- const head_size: usize = embedding_size / n_attention_heads;
- const input_buffer = try Tensor(2).init(allocator, [_]usize{ n_attention_heads, head_size });
-
- errdefer input_buffer.deinit();
-
- const output_buffer = try Tensor(1).init(allocator, [_]usize{embedding_size});
-
- errdefer output_buffer.deinit();
-
- const query_buffer = try Tensor(2).init(allocator, [_]usize{ n_attention_heads, head_size });
-
- errdefer query_buffer.deinit();
-
- const n_layers = checkpoint.n_layers;
- const n_attention_query_groups = checkpoint.n_attention_query_groups;
-
- const key_cache = try Tensor(4).init(
- allocator,
- [_]usize{ n_layers, sequence_length, n_attention_query_groups, head_size },
- );
-
- errdefer key_cache.deinit();
-
- const value_cache = try Tensor(4).init(
- allocator,
- [_]usize{ n_layers, sequence_length, n_attention_query_groups, head_size },
- );
-
- errdefer value_cache.deinit();
+pub fn createLeaky(
+ allocator: std.mem.Allocator,
+ checkpoint: Checkpoint,
+ sequence_length: usize,
+) !Self {
+ const head_size = checkpoint.embedding_size / checkpoint.n_attention_heads;
+ const key_cache = try allocator.alloc([]Vector, checkpoint.n_layers);
+
+ for (key_cache) |*layer| {
+ layer.* = try Vector.createMultipleLeaky(
+ allocator,
+ sequence_length,
+ checkpoint.n_attention_query_groups * head_size,
+ );
+ }
- const scores = try allocator.alloc(f32, sequence_length);
+ const value_cache = try allocator.alloc([]Vector, checkpoint.n_layers);
- errdefer allocator.free(scores);
+ for (value_cache) |*layer| {
+ layer.* = try Vector.createMultipleLeaky(
+ allocator,
+ sequence_length,
+ checkpoint.n_attention_query_groups * head_size,
+ );
+ }
return .{
- .allocator = allocator,
.checkpoint = checkpoint,
.head_size = head_size,
.head_size_sqrt = std.math.sqrt(@as(f32, @floatFromInt(head_size))),
- .input_buffer = input_buffer,
- .output_buffer = output_buffer,
- .query_buffer = query_buffer,
+ .input = try Vector.createLeaky(allocator, checkpoint.embedding_size),
+ .output = try Vector.createLeaky(allocator, checkpoint.embedding_size),
+ .multi_query = try Vector.createLeaky(allocator, checkpoint.embedding_size),
.key_cache = key_cache,
.value_cache = value_cache,
- .scores = scores,
+ .scores = try allocator.alloc(f32, sequence_length),
};
}
-pub fn deinit(self: Self) void {
- self.input_buffer.deinit();
- self.output_buffer.deinit();
- self.query_buffer.deinit();
- self.key_cache.deinit();
- self.value_cache.deinit();
- self.allocator.free(self.scores);
-}
-
-pub fn forward(self: Self, layer: usize, position: usize) void {
- const weights = self.checkpoint.weights;
- const query_matrix = weights.attention_query_matrices.slice(layer);
- const key_matrix = weights.attention_key_matrices.slice(layer);
- const value_matrix = weights.attention_value_matrices.slice(layer);
- const output_matrix = weights.attention_output_matrices.slice(layer);
- const key_buffer = self.key_cache.slice(layer).slice(position);
- const value_buffer = self.value_cache.slice(layer).slice(position);
+pub fn forward(self: Self, layer: usize, position: usize) !void {
+ const query_weight = self.checkpoint.attention_query_weights[layer];
+ const key_weight = self.checkpoint.attention_key_weights[layer];
+ const value_weight = self.checkpoint.attention_value_weights[layer];
+ const output_weight = self.checkpoint.attention_output_weights[layer];
+ const multi_key = self.key_cache[layer][position];
+ const multi_value = self.value_cache[layer][position];
- query_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.query_buffer);
- key_matrix.computeMatrixVectorMultiplication(self.input_buffer, key_buffer);
- value_matrix.computeMatrixVectorMultiplication(self.input_buffer, value_buffer);
+ try query_weight.multiplyVector(self.input, self.multi_query);
+ try key_weight.multiplyVector(self.input, multi_key);
+ try value_weight.multiplyVector(self.input, multi_value);
- self.computeRoPE(position, key_buffer);
+ self.computeRoPE(position, multi_key.values);
for (0..self.checkpoint.n_attention_heads) |head| {
- self.computeGQA(layer, position, head);
+ try self.computeGQA(layer, position, head);
}
- output_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.output_buffer);
+ try output_weight.multiplyVector(self.input, self.output);
}
// Rotary positional embeddings: https://arxiv.org/abs/2104.09864
-fn computeRoPE(self: Self, position: usize, key_buffer: Tensor(2)) void {
+fn computeRoPE(self: Self, position: usize, multi_key_values: []f32) void {
@setFloatMode(.Optimized);
- std.debug.assert(self.query_buffer.values.len % key_buffer.values.len == 0);
+ const multi_query_values = self.multi_query.values;
+
+ std.debug.assert(multi_query_values.len % multi_key_values.len == 0);
var index: usize = 0;
- while (index < self.query_buffer.values.len) : (index += 2) {
+ while (index < multi_query_values.len) : (index += 2) {
const head: f32 = @floatFromInt(index % self.head_size);
const frequency =
@@ -116,27 +96,27 @@ fn computeRoPE(self: Self, position: usize, key_buffer: Tensor(2)) void {
const real_rotation_value: f32 = std.math.cos(rotation_scaling_factor);
const imag_rotation_value: f32 = std.math.sin(rotation_scaling_factor);
- const q_0 = self.query_buffer.values[index];
- const q_1 = self.query_buffer.values[index + 1];
+ const q_0 = multi_query_values[index];
+ const q_1 = multi_query_values[index + 1];
- self.query_buffer.values[index] = q_0 * real_rotation_value - q_1 * imag_rotation_value;
- self.query_buffer.values[index + 1] = q_0 * imag_rotation_value + q_1 * real_rotation_value;
+ multi_query_values[index] = q_0 * real_rotation_value - q_1 * imag_rotation_value;
+ multi_query_values[index + 1] = q_0 * imag_rotation_value + q_1 * real_rotation_value;
- if (index < key_buffer.values.len) {
- const k_0 = key_buffer.values[index];
- const k_1 = key_buffer.values[index + 1];
+ if (index < multi_key_values.len) {
+ const k_0 = multi_key_values[index];
+ const k_1 = multi_key_values[index + 1];
- key_buffer.values[index] = k_0 * real_rotation_value - k_1 * imag_rotation_value;
- key_buffer.values[index + 1] = k_0 * imag_rotation_value + k_1 * real_rotation_value;
+ multi_key_values[index] = k_0 * real_rotation_value - k_1 * imag_rotation_value;
+ multi_key_values[index + 1] = k_0 * imag_rotation_value + k_1 * real_rotation_value;
}
}
}
// Grouped-query attention: https://arxiv.org/abs/2305.13245v1
-fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) void {
+fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) !void {
@setFloatMode(.Optimized);
- const query_vector = self.query_buffer.slice(head);
+ const query_values = self.multi_query.values[head * self.head_size ..][0..self.head_size];
const query_group =
head / (self.checkpoint.n_attention_heads / self.checkpoint.n_attention_query_groups);
@@ -144,23 +124,26 @@ fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) vo
const next_position = current_position + 1;
for (0..next_position) |position| {
- const key_vector = self.key_cache.slice(layer).slice(position).slice(query_group);
+ const multi_key = self.key_cache[layer][position];
+ const key_values = multi_key.values[query_group * self.head_size ..][0..self.head_size];
- self.scores[position] = query_vector.computeScalarProduct(key_vector) / self.head_size_sqrt;
+ self.scores[position] =
+ try simd.computeScalarProduct(query_values, key_values) / self.head_size_sqrt;
}
math.softmax(self.scores[0..next_position]);
- const attention_buffer = self.input_buffer.slice(head);
+ const attention_values = self.input.values[head * self.head_size ..][0..self.head_size];
- @memset(attention_buffer.values, 0);
+ @memset(attention_values, 0);
for (0..next_position) |position| {
- const value_vector = self.value_cache.slice(layer).slice(position).slice(query_group);
+ const multi_value = self.value_cache[layer][position];
+ const value_values = multi_value.values[query_group * self.head_size ..][0..self.head_size];
const weight = self.scores[position];
for (0..self.head_size) |index| {
- attention_buffer.values[index] += value_vector.values[index] * weight;
+ attention_values[index] += value_values[index] * weight;
}
}
}
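
The refactor replaces the generic `Tensor(n)` buffers with flat `Vector` slices backed by plain `[]f32`. Since `src/vector.zig` itself is outside this diff, the following is only a minimal sketch of the interface its call sites imply: the names `createLeaky`, `createMultipleLeaky`, `computeScalarProduct`, and the `values` field appear in the hunks above, while the bodies and exact signatures here are assumptions.

```zig
// Sketch of src/vector.zig as implied by its call sites in this diff.
// Bodies and signatures are assumptions, not the repository's actual code.
const Self = @This();

const std = @import("std");
const simd = @import("simd.zig");

values: []f32,

pub fn createLeaky(allocator: std.mem.Allocator, n_values: usize) !Self {
    // "Leaky" because nothing is freed here; callers pass an arena allocator
    // and release everything at once via arena.deinit().
    return .{ .values = try allocator.alloc(f32, n_values) };
}

pub fn createMultipleLeaky(
    allocator: std.mem.Allocator,
    n_vectors: usize,
    n_values: usize,
) ![]Self {
    const vectors = try allocator.alloc(Self, n_vectors);

    for (vectors) |*vector| {
        vector.* = try createLeaky(allocator, n_values);
    }

    return vectors;
}

pub fn computeScalarProduct(self: Self, other: Self) !f32 {
    return simd.computeScalarProduct(self.values, other.values);
}
```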
diff --git a/src/chat.zig b/src/chat.zig
index f7290e5..1953973 100644
--- a/src/chat.zig
+++ b/src/chat.zig
@@ -7,43 +7,25 @@ const Sampler = @import("sampler.zig");
const Tokenizer = @import("tokenizer.zig");
const Transformer = @import("transformer.zig");
-allocator: std.mem.Allocator,
transformer: Transformer,
tokenizer: Tokenizer,
sampler: Sampler,
system_prompt: []const u8,
user_prompt: []const u8,
-pub fn init(allocator: std.mem.Allocator, args: ChatArgs) !Self {
- const transformer = try Transformer.init(allocator, args.model_path, args.sequence_length);
-
- errdefer transformer.deinit();
-
+pub fn createLeaky(allocator: std.mem.Allocator, args: ChatArgs) !Self {
+ const transformer = try Transformer.createLeaky(allocator, args.model_path, args.sequence_length);
const vocab_size = transformer.checkpoint.vocab_size;
- const tokenizer = try Tokenizer.init(allocator, args.model_path, vocab_size);
-
- errdefer tokenizer.deinit();
-
- const sampler = try Sampler.init(allocator, args, vocab_size);
-
- errdefer sampler.deinit();
return .{
- .allocator = allocator,
.transformer = transformer,
- .tokenizer = tokenizer,
- .sampler = sampler,
+ .tokenizer = try Tokenizer.readLeaky(allocator, args.model_path, vocab_size),
+ .sampler = try Sampler.createLeaky(allocator, args, vocab_size),
.system_prompt = args.system_prompt,
.user_prompt = args.user_prompt,
};
}
-pub fn deinit(self: Self) void {
- self.transformer.deinit();
- self.tokenizer.deinit();
- self.sampler.deinit();
-}
-
const system_prompt_template_start = "<<SYS>>\n";
const system_prompt_template_close = "\n<</SYS>>\n\n";
const user_prompt_template_start = "[INST] ";
@@ -68,7 +50,7 @@ pub fn start(self: *Self, allocator: std.mem.Allocator) !void {
};
for (0..self.transformer.sequence_length) |position| {
- self.transformer.forward(token, position);
+ try self.transformer.forward(token, position);
if (token == bos_token and user_turn) {
var user_prompt = std.ArrayList(u8).init(allocator);
@@ -129,7 +111,7 @@ pub fn start(self: *Self, allocator: std.mem.Allocator) !void {
user_prompt_tokens_index += 1;
if (next_token == 0) {
- next_token = self.sampler.sample(self.transformer.output_buffer.values);
+ next_token = self.sampler.sample(self.transformer.output.values);
}
if (next_token == eos_token) {
diff --git a/src/chat_args.zig b/src/chat_args.zig
index 040bda4..b38e9c7 100644
--- a/src/chat_args.zig
+++ b/src/chat_args.zig
@@ -2,7 +2,6 @@ const Self = @This();
const std = @import("std");
-arg_iterator: std.process.ArgIterator,
model_path: []const u8,
temperature: f32,
top_p: f32,
@@ -20,11 +19,9 @@ const Option = enum {
user_prompt,
};
-pub fn init(allocator: std.mem.Allocator) !Self {
+pub fn createLeaky(allocator: std.mem.Allocator) !Self {
var arg_iterator = try std.process.argsWithAllocator(allocator);
- errdefer arg_iterator.deinit();
-
_ = arg_iterator.next().?;
const model_path = arg_iterator.next() orelse try help(1);
@@ -78,7 +75,6 @@ pub fn init(allocator: std.mem.Allocator) !Self {
}
return .{
- .arg_iterator = arg_iterator,
.model_path = model_path,
.temperature = @max(@min(temperature orelse 1, 1), 0),
.top_p = @max(@min(top_p orelse 0.9, 1), 0),
@@ -89,10 +85,6 @@ pub fn init(allocator: std.mem.Allocator) !Self {
};
}
-pub fn deinit(self: *Self) void {
- self.arg_iterator.deinit();
-}
-
fn help(exit_status: u8) !noreturn {
const console = if (exit_status == 0)
std.io.getStdOut().writer()
diff --git a/src/chat_main.zig b/src/chat_main.zig
index 4f35682..fffe5ed 100644
--- a/src/chat_main.zig
+++ b/src/chat_main.zig
@@ -3,15 +3,13 @@ const Chat = @import("chat.zig");
const ChatArgs = @import("chat_args.zig");
pub fn main() !void {
- const allocator = std.heap.page_allocator;
+ var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
- var args = try ChatArgs.init(allocator);
+ defer arena.deinit();
- defer args.deinit();
+ const args = try ChatArgs.createLeaky(arena.allocator());
- var chat = try Chat.init(allocator, args);
+ var chat = try Chat.createLeaky(arena.allocator(), args);
- defer chat.deinit();
-
- try chat.start(allocator);
+ try chat.start(arena.allocator());
}
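
The `*Leaky` naming convention together with the arena in `main` is what makes all the removed `deinit` methods unnecessary: every allocation is drawn from the arena and released in a single `arena.deinit()` call. A minimal standalone sketch of the pattern (illustrative code, not part of the repository):

```zig
const std = @import("std");

// Illustrative only: a "leaky" helper allocates without a matching free,
// because the arena that backs the allocator owns the memory.
fn createLeakyBuffer(allocator: std.mem.Allocator, len: usize) ![]f32 {
    return allocator.alloc(f32, len); // no free; the arena owns it
}

pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);

    defer arena.deinit(); // frees every "leaky" allocation at once

    const buffer = try createLeakyBuffer(arena.allocator(), 1024);

    @memset(buffer, 0);
}
```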
diff --git a/src/checkpoint.zig b/src/checkpoint.zig
index fe5da8a..b7e6993 100644
--- a/src/checkpoint.zig
+++ b/src/checkpoint.zig
@@ -1,9 +1,9 @@
const Self = @This();
const std = @import("std");
-const Tensor = @import("./tensor.zig").Tensor;
+const Matrix = @import("matrix.zig");
+const Vector = @import("vector.zig");
-allocator: std.mem.Allocator,
embedding_size: usize,
ffn_hidden_size: usize,
n_layers: usize,
@@ -11,55 +11,21 @@ n_attention_heads: usize,
n_attention_query_groups: usize,
vocab_size: usize,
max_sequence_length: usize,
-shared_output_matrix: bool,
-weights: struct {
- token_embedding_vectors: Tensor(2),
- attention_norm_vectors: Tensor(2),
- attention_query_matrices: Tensor(3),
- attention_key_matrices: Tensor(3),
- attention_value_matrices: Tensor(3),
- attention_output_matrices: Tensor(3),
- ffn_norm_vectors: Tensor(2),
- ffn_gate_matrices: Tensor(3),
- ffn_down_matrices: Tensor(3),
- ffn_up_matrices: Tensor(3),
- output_norm_vector: Tensor(1),
- output_matrix: Tensor(2),
-},
-
-pub fn init(allocator: std.mem.Allocator, model_path: []const u8) !Self {
- const v1_path = try std.fs.path.join(
- allocator,
- &[_][]const u8{ model_path, "checkpoint_v1.bin" },
- );
-
- defer allocator.free(v1_path);
-
- const v1_file = std.fs.cwd().openFile(v1_path, .{}) catch null;
-
- defer if (v1_file) |file| file.close();
-
- if (v1_file) |file| return try readV1(allocator, file);
-
- const legacy_path = try std.fs.path.join(
- allocator,
- &[_][]const u8{ model_path, "checkpoint_legacy.bin" },
- );
-
- defer allocator.free(legacy_path);
-
- const legacy_file = std.fs.cwd().openFile(legacy_path, .{}) catch null;
-
- defer if (legacy_file) |file| file.close();
-
- if (legacy_file) |file| return try readLegacy(allocator, file);
-
- return error.CheckpointFileNotFound;
-}
-
-// https://github.com/karpathy/llama2.c/blob/d9862069e7ef665fe6309e3c17398ded2f121bf5/export.py#L132
-pub fn writeV1(self: Self, allocator: std.mem.Allocator, model_path: []const u8) !void {
+token_embedding_weights: []const Vector,
+attention_norm_weights: []const Vector,
+attention_query_weights: []const Matrix,
+attention_key_weights: []const Matrix,
+attention_value_weights: []const Matrix,
+attention_output_weights: []const Matrix,
+ffn_norm_weights: []const Vector,
+ffn_gate_weights: []const Matrix,
+ffn_down_weights: []const Matrix,
+ffn_up_weights: []const Matrix,
+output_norm_weight: Vector,
+output_weight: Matrix,
+
+pub fn readLeaky(allocator: std.mem.Allocator, model_path: []const u8) !Self {
const path = try std.fs.path.join(
allocator,
&[_][]const u8{ model_path, "checkpoint_v1.bin" },
@@ -67,49 +33,15 @@ pub fn writeV1(self: Self, allocator: std.mem.Allocator, model_path: []const u8)
defer allocator.free(path);
- const file = try std.fs.cwd().createFile(path, .{ .truncate = true });
+ const file = try std.fs.cwd().openFile(path, .{});
defer file.close();
- try file.writer().writeIntLittle(u32, 0x616b3432);
- try file.writer().writeIntLittle(i32, 1);
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.embedding_size)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.ffn_hidden_size)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.n_layers)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.n_attention_heads)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.n_attention_query_groups)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.vocab_size)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.max_sequence_length)));
- try file.writer().writeIntLittle(u8, @as(u8, @intFromBool(self.shared_output_matrix)));
- try file.writer().writeByteNTimes(0, 256 - try file.getPos());
- try self.weights.attention_norm_vectors.write(file);
- try self.weights.ffn_norm_vectors.write(file);
- try self.weights.output_norm_vector.write(file);
- try self.weights.token_embedding_vectors.write(file);
- try self.weights.attention_query_matrices.write(file);
- try self.weights.attention_key_matrices.write(file);
- try self.weights.attention_value_matrices.write(file);
- try self.weights.attention_output_matrices.write(file);
- try self.weights.ffn_gate_matrices.write(file);
- try self.weights.ffn_down_matrices.write(file);
- try self.weights.ffn_up_matrices.write(file);
-
- if (!self.shared_output_matrix) {
- try self.weights.output_matrix.write(file);
- }
-}
-
-// https://github.com/karpathy/llama2.c/blob/d9862069e7ef665fe6309e3c17398ded2f121bf5/export.py#L132
-fn readV1(allocator: std.mem.Allocator, file: std.fs.File) !Self {
- const magic = try file.reader().readIntLittle(u32);
-
- if (magic != 0x616b3432) {
+ if (try file.reader().readIntLittle(u32) != 0x616b3432) {
return error.InvalidMagic;
}
- const version = try file.reader().readIntLittle(i32);
-
- if (version != 1) {
+ if (try file.reader().readIntLittle(i32) != 1) {
return error.InvalidVersion;
}
@@ -124,111 +56,93 @@ fn readV1(allocator: std.mem.Allocator, file: std.fs.File) !Self {
try file.seekTo(256);
- const attention_norm_vectors = try Tensor(2).init(
+ const attention_norm_weights = try Vector.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, embedding_size },
+ file,
+ n_layers,
+ embedding_size,
);
- errdefer attention_norm_vectors.deinit();
- try attention_norm_vectors.read(file);
-
- const ffn_norm_vectors = try Tensor(2).init(
+ const ffn_norm_weights = try Vector.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, embedding_size },
+ file,
+ n_layers,
+ embedding_size,
);
- errdefer ffn_norm_vectors.deinit();
- try ffn_norm_vectors.read(file);
+ const output_norm_weight = try Vector.readLeaky(allocator, file, embedding_size);
- const output_norm_vector = try Tensor(1).init(
+ const token_embedding_weights = try Vector.readMultipleLeaky(
allocator,
- [_]usize{embedding_size},
+ file,
+ vocab_size,
+ embedding_size,
);
- errdefer output_norm_vector.deinit();
- try output_norm_vector.read(file);
-
- const token_embedding_vectors = try Tensor(2).init(
- allocator,
- [_]usize{ vocab_size, embedding_size },
- );
-
- errdefer token_embedding_vectors.deinit();
- try token_embedding_vectors.read(file);
-
- const attention_query_matrices = try Tensor(3).init(
+ const attention_query_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, embedding_size, embedding_size },
+ file,
+ n_layers,
+ embedding_size,
+ embedding_size,
);
- errdefer attention_query_matrices.deinit();
- try attention_query_matrices.read(file);
-
const attention_head_size: usize = embedding_size / n_attention_heads;
- const attention_key_matrices = try Tensor(3).init(
+ const attention_key_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size },
+ file,
+ n_layers,
+ n_attention_query_groups * attention_head_size,
+ embedding_size,
);
- errdefer attention_key_matrices.deinit();
- try attention_key_matrices.read(file);
-
- const attention_value_matrices = try Tensor(3).init(
+ const attention_value_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size },
+ file,
+ n_layers,
+ n_attention_query_groups * attention_head_size,
+ embedding_size,
);
- errdefer attention_value_matrices.deinit();
- try attention_value_matrices.read(file);
-
- const attention_output_matrices = try Tensor(3).init(
+ const attention_output_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, embedding_size, embedding_size },
+ file,
+ n_layers,
+ embedding_size,
+ embedding_size,
);
- errdefer attention_output_matrices.deinit();
- try attention_output_matrices.read(file);
-
- const ffn_gate_matrices = try Tensor(3).init(
+ const ffn_gate_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, ffn_hidden_size, embedding_size },
+ file,
+ n_layers,
+ ffn_hidden_size,
+ embedding_size,
);
- errdefer ffn_gate_matrices.deinit();
- try ffn_gate_matrices.read(file);
-
- const ffn_down_matrices = try Tensor(3).init(
+ const ffn_down_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, embedding_size, ffn_hidden_size },
+ file,
+ n_layers,
+ embedding_size,
+ ffn_hidden_size,
);
- errdefer ffn_down_matrices.deinit();
- try ffn_down_matrices.read(file);
-
- const ffn_up_matrices = try Tensor(3).init(
+ const ffn_up_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, ffn_hidden_size, embedding_size },
+ file,
+ n_layers,
+ ffn_hidden_size,
+ embedding_size,
);
- errdefer ffn_up_matrices.deinit();
- try ffn_up_matrices.read(file);
-
- const output_matrix = if (shared_output_matrix)
- token_embedding_vectors
+ const output_weight = if (shared_output_matrix)
+ Matrix{ .rows = token_embedding_weights }
else
- try Tensor(2).init(allocator, [_]usize{ vocab_size, embedding_size });
-
- errdefer if (!shared_output_matrix) {
- output_matrix.deinit();
- };
-
- if (!shared_output_matrix) {
- try output_matrix.read(file);
- }
+ try Matrix.readLeaky(allocator, file, vocab_size, embedding_size);
return .{
- .allocator = allocator,
.embedding_size = embedding_size,
.ffn_hidden_size = ffn_hidden_size,
.n_layers = n_layers,
@@ -236,187 +150,18 @@ fn readV1(allocator: std.mem.Allocator, file: std.fs.File) !Self {
.n_attention_query_groups = n_attention_query_groups,
.vocab_size = vocab_size,
.max_sequence_length = max_sequence_length,
- .shared_output_matrix = shared_output_matrix,
- .weights = .{
- .token_embedding_vectors = token_embedding_vectors,
- .attention_norm_vectors = attention_norm_vectors,
- .attention_query_matrices = attention_query_matrices,
- .attention_key_matrices = attention_key_matrices,
- .attention_value_matrices = attention_value_matrices,
- .attention_output_matrices = attention_output_matrices,
- .ffn_norm_vectors = ffn_norm_vectors,
- .ffn_gate_matrices = ffn_gate_matrices,
- .ffn_down_matrices = ffn_down_matrices,
- .ffn_up_matrices = ffn_up_matrices,
- .output_norm_vector = output_norm_vector,
- .output_matrix = output_matrix,
- },
+ .token_embedding_weights = token_embedding_weights,
+ .attention_norm_weights = attention_norm_weights,
+ .attention_query_weights = attention_query_weights,
+ .attention_key_weights = attention_key_weights,
+ .attention_value_weights = attention_value_weights,
+ .attention_output_weights = attention_output_weights,
+ .ffn_norm_weights = ffn_norm_weights,
+ .ffn_gate_weights = ffn_gate_weights,
+ .ffn_down_weights = ffn_down_weights,
+ .ffn_up_weights = ffn_up_weights,
+ .output_norm_weight = output_norm_weight,
+ .output_weight = output_weight,
};
}
-
-// https://github.com/karpathy/llama2.c/blob/d9862069e7ef665fe6309e3c17398ded2f121bf5/export.py#L75
-fn readLegacy(allocator: std.mem.Allocator, file: std.fs.File) !Self {
- const embedding_size: usize = @intCast(try file.reader().readIntLittle(i32));
- const ffn_hidden_size: usize = @intCast(try file.reader().readIntLittle(i32));
- const n_layers: usize = @intCast(try file.reader().readIntLittle(i32));
- const n_attention_heads: usize = @intCast(try file.reader().readIntLittle(i32));
- const n_attention_query_groups: usize = @intCast(try file.reader().readIntLittle(i32));
-
- // https://github.com/karpathy/llama2.c/blob/35deb5e0fa55f0a257040bcf1624ed8386e63dc7/run.c#L153
- const signed_vocab_size = try file.reader().readIntLittle(i32);
- const shared_output_matrix = signed_vocab_size > 0;
-
- const vocab_size: usize = @abs(signed_vocab_size);
- const max_sequence_length: usize = @intCast(try file.reader().readIntLittle(i32));
-
- const token_embedding_vectors = try Tensor(2).init(
- allocator,
- [_]usize{ vocab_size, embedding_size },
- );
-
- errdefer token_embedding_vectors.deinit();
- try token_embedding_vectors.read(file);
-
- const attention_norm_vectors = try Tensor(2).init(
- allocator,
- [_]usize{ n_layers, embedding_size },
- );
-
- errdefer attention_norm_vectors.deinit();
- try attention_norm_vectors.read(file);
-
- const attention_query_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, embedding_size, embedding_size },
- );
-
- errdefer attention_query_matrices.deinit();
- try attention_query_matrices.read(file);
-
- const attention_head_size: usize = embedding_size / n_attention_heads;
-
- const attention_key_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size },
- );
-
- errdefer attention_key_matrices.deinit();
- try attention_key_matrices.read(file);
-
- const attention_value_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size },
- );
-
- errdefer attention_value_matrices.deinit();
- try attention_value_matrices.read(file);
-
- const attention_output_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, embedding_size, embedding_size },
- );
-
- errdefer attention_output_matrices.deinit();
- try attention_output_matrices.read(file);
-
- const ffn_norm_vectors = try Tensor(2).init(
- allocator,
- [_]usize{ n_layers, embedding_size },
- );
-
- errdefer ffn_norm_vectors.deinit();
- try ffn_norm_vectors.read(file);
-
- const ffn_gate_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, ffn_hidden_size, embedding_size },
- );
-
- errdefer ffn_gate_matrices.deinit();
- try ffn_gate_matrices.read(file);
-
- const ffn_down_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, embedding_size, ffn_hidden_size },
- );
-
- errdefer ffn_down_matrices.deinit();
- try ffn_down_matrices.read(file);
-
- const ffn_up_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, ffn_hidden_size, embedding_size },
- );
-
- errdefer ffn_up_matrices.deinit();
- try ffn_up_matrices.read(file);
-
- const output_norm_vector = try Tensor(1).init(
- allocator,
- [_]usize{embedding_size},
- );
-
- errdefer output_norm_vector.deinit();
- try output_norm_vector.read(file);
-
- try file.seekBy(@intCast(max_sequence_length * attention_head_size * @sizeOf(f32)));
-
- const output_matrix = if (shared_output_matrix)
- token_embedding_vectors
- else
- try Tensor(2).init(allocator, [_]usize{ vocab_size, embedding_size });
-
- errdefer if (!shared_output_matrix) {
- output_matrix.deinit();
- };
-
- if (!shared_output_matrix) {
- try output_matrix.read(file);
- }
-
- return .{
- .allocator = allocator,
- .embedding_size = embedding_size,
- .ffn_hidden_size = ffn_hidden_size,
- .n_layers = n_layers,
- .n_attention_heads = n_attention_heads,
- .n_attention_query_groups = n_attention_query_groups,
- .vocab_size = vocab_size,
- .max_sequence_length = max_sequence_length,
- .shared_output_matrix = shared_output_matrix,
-
- .weights = .{
- .token_embedding_vectors = token_embedding_vectors,
- .attention_norm_vectors = attention_norm_vectors,
- .attention_query_matrices = attention_query_matrices,
- .attention_key_matrices = attention_key_matrices,
- .attention_value_matrices = attention_value_matrices,
- .attention_output_matrices = attention_output_matrices,
- .ffn_norm_vectors = ffn_norm_vectors,
- .ffn_gate_matrices = ffn_gate_matrices,
- .ffn_down_matrices = ffn_down_matrices,
- .ffn_up_matrices = ffn_up_matrices,
- .output_norm_vector = output_norm_vector,
- .output_matrix = output_matrix,
- },
- };
-}
-
-pub fn deinit(self: Self) void {
- self.weights.token_embedding_vectors.deinit();
- self.weights.attention_norm_vectors.deinit();
- self.weights.attention_query_matrices.deinit();
- self.weights.attention_key_matrices.deinit();
- self.weights.attention_value_matrices.deinit();
- self.weights.attention_output_matrices.deinit();
- self.weights.ffn_norm_vectors.deinit();
- self.weights.ffn_gate_matrices.deinit();
- self.weights.ffn_down_matrices.deinit();
- self.weights.ffn_up_matrices.deinit();
- self.weights.output_norm_vector.deinit();
-
- if (!self.shared_output_matrix) {
- self.weights.output_matrix.deinit();
- }
-}
diff --git a/src/converter_args.zig b/src/converter_args.zig
deleted file mode 100644
index c086b7c..0000000
--- a/src/converter_args.zig
+++ /dev/null
@@ -1,40 +0,0 @@
-const Self = @This();
-
-const std = @import("std");
-
-arg_iterator: std.process.ArgIterator,
-model_path: []const u8,
-
-pub fn init(allocator: std.mem.Allocator) !Self {
- var arg_iterator = try std.process.argsWithAllocator(allocator);
-
- errdefer arg_iterator.deinit();
-
- _ = arg_iterator.next().?;
-
- const model_path = arg_iterator.next() orelse try help(1);
-
- while (arg_iterator.next()) |arg| {
- try help(if (std.mem.eql(u8, arg, "--help")) 0 else 1);
- }
-
- return .{ .arg_iterator = arg_iterator, .model_path = model_path };
-}
-
-pub fn deinit(self: *Self) void {
- self.arg_iterator.deinit();
-}
-
-fn help(exit_status: u8) !noreturn {
- const console = if (exit_status == 0)
- std.io.getStdOut().writer()
- else
- std.io.getStdErr().writer();
-
- try console.print("Usage: llama2-converter [options]\n\n", .{});
-
- try console.print("Options:\n", .{});
- try console.print(" --help\n", .{});
-
- std.process.exit(exit_status);
-}
diff --git a/src/converter_main.zig b/src/converter_main.zig
deleted file mode 100644
index eeba13f..0000000
--- a/src/converter_main.zig
+++ /dev/null
@@ -1,17 +0,0 @@
-const std = @import("std");
-const Checkpoint = @import("checkpoint.zig");
-const ConverterArgs = @import("converter_args.zig");
-
-pub fn main() !void {
- const allocator = std.heap.page_allocator;
-
- var args = try ConverterArgs.init(allocator);
-
- defer args.deinit();
-
- const checkpoint = try Checkpoint.init(allocator, args.model_path);
-
- defer checkpoint.deinit();
-
- try checkpoint.writeV1(allocator, args.model_path);
-}
diff --git a/src/ffn.zig b/src/ffn.zig
index 936c137..ba943a8 100644
--- a/src/ffn.zig
+++ b/src/ffn.zig
@@ -2,66 +2,40 @@ const Self = @This();
const std = @import("std");
const Checkpoint = @import("checkpoint.zig");
-const Tensor = @import("./tensor.zig").Tensor;
+const Vector = @import("vector.zig");
-allocator: std.mem.Allocator,
checkpoint: Checkpoint,
-input_buffer: Tensor(1),
-gate_buffer: Tensor(1),
-hidden_buffer: Tensor(1),
-output_buffer: Tensor(1),
-
-pub fn init(allocator: std.mem.Allocator, checkpoint: Checkpoint) !Self {
- const input_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.embedding_size});
-
- errdefer input_buffer.deinit();
-
- const gate_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.ffn_hidden_size});
-
- errdefer gate_buffer.deinit();
-
- const hidden_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.ffn_hidden_size});
-
- errdefer hidden_buffer.deinit();
-
- const output_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.embedding_size});
-
- errdefer output_buffer.deinit();
+input: Vector,
+gate: Vector,
+hidden: Vector,
+output: Vector,
+pub fn createLeaky(allocator: std.mem.Allocator, checkpoint: Checkpoint) !Self {
return .{
- .allocator = allocator,
.checkpoint = checkpoint,
- .input_buffer = input_buffer,
- .gate_buffer = gate_buffer,
- .hidden_buffer = hidden_buffer,
- .output_buffer = output_buffer,
+ .input = try Vector.createLeaky(allocator, checkpoint.embedding_size),
+ .gate = try Vector.createLeaky(allocator, checkpoint.ffn_hidden_size),
+ .hidden = try Vector.createLeaky(allocator, checkpoint.ffn_hidden_size),
+ .output = try Vector.createLeaky(allocator, checkpoint.embedding_size),
};
}
-pub fn deinit(self: Self) void {
- self.input_buffer.deinit();
- self.gate_buffer.deinit();
- self.hidden_buffer.deinit();
- self.output_buffer.deinit();
-}
-
// SwiGLU activation function: https://arxiv.org/abs/2002.05202
-pub fn forward(self: Self, layer: usize) void {
+pub fn forward(self: Self, layer: usize) !void {
@setFloatMode(.Optimized);
- const weights = self.checkpoint.weights;
- const gate_matrix = weights.ffn_gate_matrices.slice(layer);
- const up_matrix = weights.ffn_up_matrices.slice(layer);
- const down_matrix = weights.ffn_down_matrices.slice(layer);
+ const gate_weight = self.checkpoint.ffn_gate_weights[layer];
+ const up_weight = self.checkpoint.ffn_up_weights[layer];
+ const down_weight = self.checkpoint.ffn_down_weights[layer];
- gate_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.gate_buffer);
- up_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.hidden_buffer);
+ try gate_weight.multiplyVector(self.input, self.gate);
+ try up_weight.multiplyVector(self.input, self.hidden);
for (0..self.checkpoint.ffn_hidden_size) |index| {
- self.hidden_buffer.values[index] *= swish(self.gate_buffer.values[index]);
+ self.hidden.values[index] *= swish(self.gate.values[index]);
}
- down_matrix.computeMatrixVectorMultiplication(self.hidden_buffer, self.output_buffer);
+ try down_weight.multiplyVector(self.hidden, self.output);
}
// Swish activation function: https://arxiv.org/abs/1710.05941
diff --git a/src/generator.zig b/src/generator.zig
index 1dc62fd..66f30aa 100644
--- a/src/generator.zig
+++ b/src/generator.zig
@@ -7,46 +7,31 @@ const Sampler = @import("sampler.zig");
const Tokenizer = @import("tokenizer.zig");
const Transformer = @import("transformer.zig");
-allocator: std.mem.Allocator,
transformer: Transformer,
tokenizer: Tokenizer,
sampler: Sampler,
prompt_tokens: []usize,
verbose: bool,
-pub fn init(allocator: std.mem.Allocator, args: GeneratorArgs) !Self {
- const transformer = try Transformer.init(allocator, args.model_path, args.sequence_length);
-
- errdefer transformer.deinit();
+pub fn createLeaky(allocator: std.mem.Allocator, args: GeneratorArgs) !Self {
+ const transformer = try Transformer.createLeaky(
+ allocator,
+ args.model_path,
+ args.sequence_length,
+ );
const vocab_size = transformer.checkpoint.vocab_size;
- const tokenizer = try Tokenizer.init(allocator, args.model_path, vocab_size);
-
- errdefer tokenizer.deinit();
-
- const sampler = try Sampler.init(allocator, args, vocab_size);
-
- errdefer sampler.deinit();
-
- const prompt_tokens = try tokenizer.encode(allocator, args.prompt);
+ const tokenizer = try Tokenizer.readLeaky(allocator, args.model_path, vocab_size);
return .{
- .allocator = allocator,
.transformer = transformer,
.tokenizer = tokenizer,
- .sampler = sampler,
- .prompt_tokens = prompt_tokens,
+ .sampler = try Sampler.createLeaky(allocator, args, vocab_size),
+ .prompt_tokens = try tokenizer.encode(allocator, args.prompt),
.verbose = args.verbose,
};
}
-pub fn deinit(self: Self) void {
- self.transformer.deinit();
- self.tokenizer.deinit();
- self.sampler.deinit();
- self.allocator.free(self.prompt_tokens);
-}
-
const bos_token = 1; // beginning of sequence
const eos_token = 2; // end of sequence
@@ -64,7 +49,7 @@ pub fn generate(self: *Self, writer: anytype) !void {
start_time = std.time.milliTimestamp();
}
- self.transformer.forward(token, position);
+ try self.transformer.forward(token, position);
if (start_time > 0) {
total_time += std.time.milliTimestamp() - start_time;
@@ -74,7 +59,7 @@ pub fn generate(self: *Self, writer: anytype) !void {
next_token = self.prompt_tokens[prompt_tokens_index];
prompt_tokens_index += 1;
} else {
- next_token = self.sampler.sample(self.transformer.output_buffer.values);
+ next_token = self.sampler.sample(self.transformer.output.values);
}
if (next_token == bos_token or next_token == eos_token) {
@@ -99,16 +84,13 @@ pub fn generate(self: *Self, writer: anytype) !void {
}
test "generate tiny story" {
- var output = std.ArrayList(u8).init(std.testing.allocator);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer output.deinit();
+ defer arena.deinit();
- var arg_iterator = try std.process.argsWithAllocator(std.testing.allocator);
-
- defer arg_iterator.deinit();
+ var output = std.ArrayList(u8).init(arena.allocator());
const args = GeneratorArgs{
- .arg_iterator = arg_iterator,
.model_path = "models/tinystories_260k",
.temperature = 1,
.top_p = 0.9,
@@ -118,9 +100,7 @@ test "generate tiny story" {
.verbose = false,
};
- var generator = try Self.init(std.testing.allocator, args);
-
- defer generator.deinit();
+ var generator = try Self.createLeaky(arena.allocator(), args);
try generator.generate(output.writer());
diff --git a/src/generator_args.zig b/src/generator_args.zig
index b95fe05..57c93d0 100644
--- a/src/generator_args.zig
+++ b/src/generator_args.zig
@@ -2,7 +2,6 @@ const Self = @This();
const std = @import("std");
-arg_iterator: std.process.ArgIterator,
model_path: []const u8,
temperature: f32,
top_p: f32,
@@ -13,11 +12,9 @@ verbose: bool,
const Option = enum { temperature, top_p, random_seed, sequence_length, prompt };
-pub fn init(allocator: std.mem.Allocator) !Self {
+pub fn createLeaky(allocator: std.mem.Allocator) !Self {
var arg_iterator = try std.process.argsWithAllocator(allocator);
- errdefer arg_iterator.deinit();
-
_ = arg_iterator.next().?;
const model_path = arg_iterator.next() orelse try help(1);
@@ -69,7 +66,6 @@ pub fn init(allocator: std.mem.Allocator) !Self {
}
return .{
- .arg_iterator = arg_iterator,
.model_path = model_path,
.temperature = @max(@min(temperature orelse 1, 1), 0),
.top_p = @max(@min(top_p orelse 0.9, 1), 0),
@@ -80,10 +76,6 @@ pub fn init(allocator: std.mem.Allocator) !Self {
};
}
-pub fn deinit(self: *Self) void {
- self.arg_iterator.deinit();
-}
-
fn help(exit_status: u8) !noreturn {
const console = if (exit_status == 0)
std.io.getStdOut().writer()
diff --git a/src/generator_main.zig b/src/generator_main.zig
index 6b420ad..2bf605d 100644
--- a/src/generator_main.zig
+++ b/src/generator_main.zig
@@ -3,15 +3,13 @@ const Generator = @import("generator.zig");
const GeneratorArgs = @import("generator_args.zig");
pub fn main() !void {
- const allocator = std.heap.page_allocator;
+ var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
- var args = try GeneratorArgs.init(allocator);
+ defer arena.deinit();
- defer args.deinit();
+ const args = try GeneratorArgs.createLeaky(arena.allocator());
- var generator = try Generator.init(allocator, args);
-
- defer generator.deinit();
+ var generator = try Generator.createLeaky(arena.allocator(), args);
try generator.generate(std.io.getStdOut().writer());
}
diff --git a/src/matrix.zig b/src/matrix.zig
new file mode 100644
index 0000000..ad372b5
--- /dev/null
+++ b/src/matrix.zig
@@ -0,0 +1,39 @@
+const Self = @This();
+
+const std = @import("std");
+const Vector = @import("vector.zig");
+
+rows: []const Vector,
+
+pub fn readLeaky(
+ allocator: std.mem.Allocator,
+ file: std.fs.File,
+ m_rows: usize,
+ n_cols: usize,
+) !Self {
+ return .{ .rows = try Vector.readMultipleLeaky(allocator, file, m_rows, n_cols) };
+}
+
+pub fn readMultipleLeaky(
+ allocator: std.mem.Allocator,
+ file: std.fs.File,
+ n_matrices: usize,
+ m_rows: usize,
+ n_cols: usize,
+) ![]Self {
+ const matrices = try allocator.alloc(Self, n_matrices);
+
+ for (matrices) |*matrix| {
+ matrix.* = try readLeaky(allocator, file, m_rows, n_cols);
+ }
+
+ return matrices;
+}
+
+pub fn multiplyVector(self: Self, input: Vector, output: Vector) !void {
+ std.debug.assert(self.rows.len == output.values.len);
+
+ for (output.values, 0..) |*value, index| {
+ value.* = try self.rows[index].computeScalarProduct(input);
+ }
+}
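
`multiplyVector` computes each output element as the dot product of one weight row with the input vector. A hypothetical usage sketch follows; it relies on the `Vector.createLeaky`/`createMultipleLeaky` signatures inferred earlier, and the literal values are made up for illustration.

```zig
// Hypothetical usage of the new Matrix/Vector modules (not part of the diff).
const std = @import("std");
const Matrix = @import("matrix.zig");
const Vector = @import("vector.zig");

test "matrix-vector multiplication sketch" {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);

    defer arena.deinit();

    const allocator = arena.allocator();

    const input = try Vector.createLeaky(allocator, 4);
    const output = try Vector.createLeaky(allocator, 2);
    const rows = try Vector.createMultipleLeaky(allocator, 2, 4);

    @memcpy(input.values, &[_]f32{ 1, 2, 3, 4 });
    @memcpy(rows[0].values, &[_]f32{ 1, 0, 0, 0 });
    @memcpy(rows[1].values, &[_]f32{ 0, 1, 0, 0 });

    const matrix = Matrix{ .rows = rows };

    try matrix.multiplyVector(input, output);

    // Each output value is row · input, so output.values is { 1, 2 }.
    try std.testing.expectApproxEqAbs(@as(f32, 1), output.values[0], 1e-6);
    try std.testing.expectApproxEqAbs(@as(f32, 2), output.values[1], 1e-6);
}
```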
diff --git a/src/quantized_tensor.zig b/src/quantized_tensor.zig
deleted file mode 100644
index b41bd64..0000000
--- a/src/quantized_tensor.zig
+++ /dev/null
@@ -1,117 +0,0 @@
-const std = @import("std");
-
-pub fn QuantizedTensor(comptime n_dims: comptime_int) type {
- comptime if (n_dims < 1) @compileError("n_dims < 1");
-
- return struct {
- const Self = @This();
-
- allocator: ?std.mem.Allocator,
- sub_dims: [n_dims - 1]usize,
- group_size: usize,
- values: []i8,
- scaling_factors: []f32,
-
- pub fn init(allocator: std.mem.Allocator, dims: [n_dims]usize, group_size: usize) !Self {
- const n_values = @reduce(.Mul, @as(@Vector(n_dims, usize), dims));
-
- if (n_values % group_size != 0) {
- return error.InvalidGroupSize;
- }
-
- const n_groups = n_values / group_size;
-
- return .{
- .allocator = allocator,
- .sub_dims = dims[1..].*,
- .group_size = group_size,
- .values = try allocator.alloc(i8, n_values),
- .scaling_factors = try allocator.alloc(f32, n_groups),
- };
- }
-
- pub fn deinit(self: Self) void {
- if (self.allocator) |allocator| {
- allocator.free(self.values);
- allocator.free(self.scaling_factors);
- }
- }
-
- pub fn slice(self: Self, index: usize) !QuantizedTensor(n_dims - 1) {
- comptime if (n_dims < 2) @compileError("n_dims < 2");
-
- const n_sub_values = @reduce(.Mul, @as(@Vector(n_dims - 1, usize), self.sub_dims));
-
- if (n_sub_values % self.group_size != 0) {
- return error.InvalidGroupSize;
- }
-
- const n_sub_groups = n_sub_values / self.group_size;
-
- return .{
- .allocator = null,
- .sub_dims = self.sub_dims[1..].*,
- .group_size = self.group_size,
- .values = self.values[index * n_sub_values ..][0..n_sub_values],
- .scaling_factors = self.scaling_factors[index * n_sub_groups ..][0..n_sub_groups],
- };
- }
-
- pub fn computeMatrixVectorMultiplication(
- self: Self,
- input: anytype,
- output: anytype,
- ) !void {
- for (output.values, 0..) |*value, index| {
- value.* = try (try self.slice(index)).computeScalarProduct(&input);
- }
- }
-
- fn computeScalarProduct(self: Self, other: anytype) !f32 {
- // https://github.com/karpathy/llama2.c/pull/312#issuecomment-1684140683
- if (self.group_size == 32) {
- return _computeScalarProduct(32, self, other);
- }
-
- if (self.group_size == 16) {
- return _computeScalarProduct(16, self, other);
- }
-
- if (self.group_size == 8) {
- return _computeScalarProduct(8, self, other);
- }
-
- if (self.group_size == 4) {
- return _computeScalarProduct(4, self, other);
- }
-
- return error.UnsupportedGroupSize;
- }
- };
-}
-
-fn _computeScalarProduct(
- comptime vector_size: comptime_int,
- input_1: anytype,
- input_2: anytype,
-) f32 {
- @setFloatMode(.Optimized);
-
- std.debug.assert(input_1.values.len == input_2.values.len);
- std.debug.assert(input_1.scaling_factors.len == input_2.scaling_factors.len);
-
- var output_value: f32 = 0;
- var index: usize = 0;
-
- while (index < input_1.values.len) : (index += vector_size) {
- const values: @Vector(vector_size, i32) =
- @as(@Vector(vector_size, i8), input_1.values[index..][0..vector_size].*) *
- @as(@Vector(vector_size, i8), input_2.values[index..][0..vector_size].*);
-
- output_value += @as(f32, @floatFromInt(@reduce(.Add, values))) *
- input_1.scaling_factors[index / vector_size] *
- input_2.scaling_factors[index / vector_size];
- }
-
- return output_value;
-}
diff --git a/src/sampler.zig b/src/sampler.zig
index 6001921..c9e74fa 100644
--- a/src/sampler.zig
+++ b/src/sampler.zig
@@ -4,29 +4,20 @@ const builtin = @import("builtin");
const std = @import("std");
const math = @import("math.zig");
-allocator: std.mem.Allocator,
-probability_index_pairs_buffer: []ProbabilityIndexPair,
+probability_index_pairs: []ProbabilityIndexPair,
+rng_state: u64,
temperature: f32,
top_p: f32,
-rng_state: u64,
-
-pub fn init(allocator: std.mem.Allocator, args: anytype, vocab_size: usize) !Self {
- const probability_index_pairs_buffer =
- try allocator.alloc(ProbabilityIndexPair, vocab_size);
+pub fn createLeaky(allocator: std.mem.Allocator, args: anytype, vocab_size: usize) !Self {
return .{
- .allocator = allocator,
- .probability_index_pairs_buffer = probability_index_pairs_buffer,
+ .probability_index_pairs = try allocator.alloc(ProbabilityIndexPair, vocab_size),
+ .rng_state = args.random_seed,
.temperature = args.temperature,
.top_p = args.top_p,
- .rng_state = args.random_seed,
};
}
-pub fn deinit(self: Self) void {
- self.allocator.free(self.probability_index_pairs_buffer);
-}
-
pub fn sample(self: *Self, probability_distribution: []f32) usize {
if (self.temperature == 0) {
return math.argmax(probability_distribution);
@@ -42,11 +33,7 @@ pub fn sample(self: *Self, probability_distribution: []f32) usize {
return self.sampleMultinomial(probability_distribution);
}
- return self.sampleNucleus(
- probability_distribution,
- self.top_p,
- self.probability_index_pairs_buffer,
- );
+ return self.sampleNucleus(probability_distribution);
}
const tolerance: comptime_float = std.math.sqrt(std.math.floatEps(f32));
@@ -82,31 +69,26 @@ fn sampleMultinomial(self: *Self, probability_distribution: []const f32) usize {
const ProbabilityIndexPair = struct { probability: f32, index: usize };
// Nucleus sampling: https://arxiv.org/abs/1904.09751
-fn sampleNucleus(
- self: *Self,
- probability_distribution: []const f32,
- top_p: f32,
- probability_index_pairs_buffer: []ProbabilityIndexPair,
-) usize {
+fn sampleNucleus(self: *Self, probability_distribution: []const f32) usize {
@setFloatMode(.Optimized);
std.debug.assert(probability_distribution.len > 0);
// https://github.com/karpathy/llama2.c/commit/d421a95b2bfe593b2d9e5c147f3efc8d128afe0e
var probability_threshold: f32 =
- (1 - top_p) / @as(f32, @floatFromInt(probability_distribution.len - 1));
+ (1 - self.top_p) / @as(f32, @floatFromInt(probability_distribution.len - 1));
var n_probability_index_pairs: usize = 0;
for (probability_distribution, 0..) |probability, index| {
- if (probability_threshold < probability) {
- probability_index_pairs_buffer[n_probability_index_pairs].probability = probability;
- probability_index_pairs_buffer[n_probability_index_pairs].index = index;
+ if (probability >= probability_threshold) {
+ self.probability_index_pairs[n_probability_index_pairs].probability = probability;
+ self.probability_index_pairs[n_probability_index_pairs].index = index;
n_probability_index_pairs += 1;
}
}
- var probability_index_pairs = probability_index_pairs_buffer[0..n_probability_index_pairs];
+ var probability_index_pairs = self.probability_index_pairs[0..n_probability_index_pairs];
std.sort.block(ProbabilityIndexPair, probability_index_pairs, {}, lessThan);
@@ -115,7 +97,7 @@ fn sampleNucleus(
for (probability_index_pairs, 0..) |probability_index_pair, index| {
cumulative_probability += probability_index_pair.probability;
- if (cumulative_probability > top_p) {
+ if (cumulative_probability > self.top_p) {
probability_index_pairs = probability_index_pairs[0 .. index + 1];
break;
diff --git a/src/simd.zig b/src/simd.zig
index 37c637f..539e4ff 100644
--- a/src/simd.zig
+++ b/src/simd.zig
@@ -2,96 +2,104 @@ const std = @import("std");
// Pre-normalization using RMSNorm: https://arxiv.org/abs/1910.07467
pub fn computeRMSNorm(
- comptime TValue: type,
- comptime vector_size: comptime_int,
- input_values: []const TValue,
- weight_values: []const TValue,
- output_values: []TValue,
-) void {
+ input_values: []const f32,
+ weight_values: []const f32,
+ output_values: []f32,
+) !void {
@setFloatMode(.Optimized);
- var rms_scaling_factor = computeScalarProduct(TValue, vector_size, input_values, input_values);
+ var scaling_factor = try computeScalarProduct(input_values, input_values);
- rms_scaling_factor /= @floatFromInt(input_values.len);
- rms_scaling_factor += 1e-5;
- rms_scaling_factor = 1 / std.math.sqrt(rms_scaling_factor);
+ scaling_factor /= @floatFromInt(input_values.len);
+ scaling_factor += 1e-5;
+ scaling_factor = 1 / std.math.sqrt(scaling_factor);
- computeVectorMultiplication(
- TValue,
- vector_size,
- rms_scaling_factor,
- input_values,
- weight_values,
- output_values,
- );
+ try computeVectorMultiplication(scaling_factor, input_values, weight_values, output_values);
}
-pub fn computeScalarProduct(
- comptime TValue: type,
- comptime vector_size: comptime_int,
- values_1: []const TValue,
- values_2: []const TValue,
-) f32 {
+pub fn computeScalarProduct(input_values_1: []const f32, input_values_2: []const f32) !f32 {
@setFloatMode(.Optimized);
- std.debug.assert(values_1.len == values_2.len);
- std.debug.assert(values_1.len % vector_size == 0);
+ std.debug.assert(input_values_1.len == input_values_2.len);
+
+ comptime var vector_len = std.atomic.cache_line / @sizeOf(f32);
+
+ inline while (vector_len >= 4) : (vector_len /= 2) {
+ if (input_values_1.len % vector_len == 0) {
+ var output_values: @Vector(vector_len, f32) = @splat(0);
+ var index: usize = 0;
- var output_values: @Vector(vector_size, f32) = @splat(0.0);
- var index: usize = 0;
+ while (index < input_values_1.len) : (index += vector_len) {
+ output_values +=
+ @as(@Vector(vector_len, f32), input_values_1[index..][0..vector_len].*) *
+ @as(@Vector(vector_len, f32), input_values_2[index..][0..vector_len].*);
+ }
- while (index < values_1.len) : (index += vector_size) {
- output_values +=
- @as(@Vector(vector_size, f32), values_1[index..][0..vector_size].*) *
- @as(@Vector(vector_size, f32), values_2[index..][0..vector_size].*);
+ return @reduce(.Add, output_values);
+ }
}
- return @reduce(.Add, output_values);
+ return error.UnsupportedVectorSize;
}
pub fn computeVectorAddition(
- comptime TValue: type,
- comptime vector_size: comptime_int,
- input_values_1: []const TValue,
- input_values_2: []const TValue,
- output_values: []TValue,
-) void {
+ input_values_1: []const f32,
+ input_values_2: []const f32,
+ output_values: []f32,
+) !void {
@setFloatMode(.Optimized);
std.debug.assert(input_values_1.len == input_values_2.len);
- std.debug.assert(input_values_1.len % vector_size == 0);
+ std.debug.assert(input_values_1.len == output_values.len);
+
+ comptime var vector_len = std.atomic.cache_line / @sizeOf(f32);
- var index: usize = 0;
+ inline while (vector_len >= 4) : (vector_len /= 2) {
+ if (input_values_1.len % vector_len == 0) {
+ var index: usize = 0;
- while (index < input_values_1.len) : (index += vector_size) {
- output_values[index..][0..vector_size].* =
- @as(@Vector(vector_size, TValue), input_values_1[index..][0..vector_size].*) +
- @as(@Vector(vector_size, TValue), input_values_2[index..][0..vector_size].*);
+ while (index < input_values_1.len) : (index += vector_len) {
+ output_values[index..][0..vector_len].* =
+ @as(@Vector(vector_len, f32), input_values_1[index..][0..vector_len].*) +
+ @as(@Vector(vector_len, f32), input_values_2[index..][0..vector_len].*);
+ }
+
+ return;
+ }
}
+
+ return error.UnsupportedVectorSize;
}
pub fn computeVectorMultiplication(
- comptime TValue: type,
- comptime vector_size: comptime_int,
scaling_factor: f32,
- input_values_1: []const TValue,
- input_values_2: []const TValue,
- output_values: []TValue,
-) void {
+ input_values_1: []const f32,
+ input_values_2: []const f32,
+ output_values: []f32,
+) !void {
@setFloatMode(.Optimized);
std.debug.assert(input_values_1.len == input_values_2.len);
std.debug.assert(input_values_1.len == output_values.len);
- std.debug.assert(input_values_1.len % vector_size == 0);
- const scaling_factors: @Vector(vector_size, f32) = @splat(scaling_factor);
+ comptime var vector_len = std.atomic.cache_line / @sizeOf(f32);
+
+ inline while (vector_len >= 4) : (vector_len /= 2) {
+ if (input_values_1.len % vector_len == 0) {
+ const scaling_factors: @Vector(vector_len, f32) = @splat(scaling_factor);
- var index: usize = 0;
+ var index: usize = 0;
- while (index < input_values_1.len) : (index += vector_size) {
- output_values[index..][0..vector_size].* =
- scaling_factors *
- @as(@Vector(vector_size, TValue), input_values_1[index..][0..vector_size].*) *
- @as(@Vector(vector_size, TValue), input_values_2[index..][0..vector_size].*);
+ while (index < input_values_1.len) : (index += vector_len) {
+ output_values[index..][0..vector_len].* =
+ scaling_factors *
+ @as(@Vector(vector_len, f32), input_values_1[index..][0..vector_len].*) *
+ @as(@Vector(vector_len, f32), input_values_2[index..][0..vector_len].*);
+ }
+
+ return;
+ }
}
+
+ return error.UnsupportedVectorSize;
}
diff --git a/src/tensor.zig b/src/tensor.zig
deleted file mode 100644
index 2874bec..0000000
--- a/src/tensor.zig
+++ /dev/null
@@ -1,93 +0,0 @@
-const std = @import("std");
-const simd = @import("simd.zig");
-
-pub fn Tensor(comptime n_dims: comptime_int) type {
- comptime if (n_dims < 1) @compileError("n_dims < 1");
-
- return struct {
- const Self = @This();
-
- allocator: ?std.mem.Allocator,
- sub_dims: [n_dims - 1]usize,
- values: []f32,
-
- pub fn init(allocator: std.mem.Allocator, dims: [n_dims]usize) !Self {
- const n_values = @reduce(.Mul, @as(@Vector(n_dims, usize), dims));
-
- return .{
- .allocator = allocator,
- .sub_dims = dims[1..].*,
- .values = try allocator.alloc(f32, n_values),
- };
- }
-
- pub fn deinit(self: Self) void {
- if (self.allocator) |allocator| {
- allocator.free(self.values);
- }
- }
-
- pub fn read(self: Self, file: std.fs.File) !void {
- const values: [*]u8 = @ptrCast(self.values);
-
- try file.reader().readNoEof(values[0 .. self.values.len * @sizeOf(f32)]);
- }
-
- pub fn write(self: Self, file: std.fs.File) !void {
- const values: [*]u8 = @ptrCast(self.values);
-
- try file.writer().writeAll(values[0 .. self.values.len * @sizeOf(f32)]);
- }
-
- pub fn slice(self: Self, index: usize) Tensor(n_dims - 1) {
- comptime if (n_dims < 2) @compileError("n_dims < 2");
-
- const n_sub_values = @reduce(.Mul, @as(@Vector(n_dims - 1, usize), self.sub_dims));
-
- return .{
- .allocator = null,
- .sub_dims = self.sub_dims[1..].*,
- .values = self.values[index * n_sub_values ..][0..n_sub_values],
- };
- }
-
- pub fn computeMatrixVectorMultiplication(self: Self, input: anytype, output: anytype) void {
- for (output.values, 0..) |*value, index| {
- value.* = self.slice(index).computeScalarProduct(input);
- }
- }
-
- pub fn computeRMSNorm(self: Self, weight: anytype, output: anytype) void {
- if (self.values.len % 32 == 0)
- simd.computeRMSNorm(f32, 32, self.values, weight.values, output.values)
- else if (self.values.len % 16 == 0)
- simd.computeRMSNorm(f32, 16, self.values, weight.values, output.values)
- else if (self.values.len % 8 == 0)
- simd.computeRMSNorm(f32, 8, self.values, weight.values, output.values)
- else
- simd.computeRMSNorm(f32, 4, self.values, weight.values, output.values);
- }
-
- pub fn computeScalarProduct(self: Self, other: anytype) f32 {
- return if (self.values.len % 32 == 0)
- simd.computeScalarProduct(f32, 32, self.values, other.values)
- else if (self.values.len % 16 == 0)
- simd.computeScalarProduct(f32, 16, self.values, other.values)
- else if (self.values.len % 8 == 0)
- simd.computeScalarProduct(f32, 8, self.values, other.values)
- else
- simd.computeScalarProduct(f32, 4, self.values, other.values);
- }
-
- pub fn computeVectorAddition(self: Self, other: anytype) void {
- if (self.values.len % 32 == 0)
- simd.computeVectorAddition(f32, 32, self.values, other.values, self.values)
- else if (self.values.len % 16 == 0)
- simd.computeVectorAddition(f32, 16, self.values, other.values, self.values)
- else if (self.values.len % 8 == 0)
- simd.computeVectorAddition(f32, 8, self.values, other.values, self.values)
- else
- simd.computeVectorAddition(f32, 4, self.values, other.values, self.values);
- }
- };
-}
diff --git a/src/tokenizer.zig b/src/tokenizer.zig
index 055161a..735f483 100644
--- a/src/tokenizer.zig
+++ b/src/tokenizer.zig
@@ -2,25 +2,12 @@ const Self = @This();
const std = @import("std");
-allocator: std.mem.Allocator,
max_word_length: usize,
vocab: []const []const u8,
word_scores: []const f32,
sorted_vocab: []const VocabEntry,
-pub fn init(allocator: std.mem.Allocator, model_path: []const u8, vocab_size: usize) !Self {
- var vocab = try allocator.alloc([]u8, vocab_size);
-
- errdefer for (vocab) |word| {
- allocator.free(word);
- };
-
- errdefer allocator.free(vocab);
-
- var word_scores = try allocator.alloc(f32, vocab_size);
-
- errdefer allocator.free(word_scores);
-
+pub fn readLeaky(allocator: std.mem.Allocator, model_path: []const u8, vocab_size: usize) !Self {
const path = try std.fs.path.join(allocator, &[_][]const u8{ model_path, "tokenizer.bin" });
defer allocator.free(path);
@@ -29,41 +16,30 @@ pub fn init(allocator: std.mem.Allocator, model_path: []const u8, vocab_size: us
defer file.close();
- const reader = file.reader();
- const max_word_length = try reader.readIntLittle(u32);
+ const max_word_length = try file.reader().readIntLittle(u32);
+
+ var vocab = try allocator.alloc([]u8, vocab_size);
+ var word_scores = try allocator.alloc(f32, vocab_size);
- for (word_scores, 0..) |*word_score, word_index| {
- word_score.* = @bitCast(try reader.readIntLittle(u32));
+ for (word_scores, 0..) |*word_score, index| {
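+ // Each score is stored as the raw bits of an f32; read a u32 and bit-cast it.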
+ word_score.* = @bitCast(try file.reader().readIntLittle(u32));
- const word_length = try reader.readIntLittle(u32);
+ const word_length = try file.reader().readIntLittle(u32);
const word = try allocator.alloc(u8, word_length);
- try reader.readNoEof(word);
+ try file.reader().readNoEof(word);
- vocab[word_index] = word;
+ vocab[index] = word;
}
- const sorted_vocab = try sortVocab(allocator, vocab);
-
return .{
- .allocator = allocator,
.max_word_length = max_word_length,
.vocab = vocab,
.word_scores = word_scores,
- .sorted_vocab = sorted_vocab,
+ .sorted_vocab = try sortVocab(allocator, vocab),
};
}
-pub fn deinit(self: Self) void {
- for (self.vocab) |word| {
- self.allocator.free(word);
- }
-
- self.allocator.free(self.vocab);
- self.allocator.free(self.word_scores);
- self.allocator.free(self.sorted_vocab);
-}
-
pub fn encode(self: Self, allocator: std.mem.Allocator, text: []const u8) ![]usize {
var double_word_buffer = try allocator.alloc(u8, self.max_word_length * 2);
@@ -100,10 +76,10 @@ fn encodeCodepoints(self: Self, allocator: std.mem.Allocator, text: []const u8)
var text_view = try std.unicode.Utf8View.init(text);
var text_iterator = text_view.iterator();
- var token_index: usize = 0;
+ var index: usize = 0;
- while (text_iterator.nextCodepointSlice()) |codepoints| : (token_index += 1) {
- if (token_index == 0) {
+ while (text_iterator.nextCodepointSlice()) |codepoints| : (index += 1) {
+ if (index == 0) {
// https://github.com/karpathy/llama2.c/blob/7ac65cb2c2b169050747be92011b7bebdd1b4544/run.c#L483
try tokens.append(self.lookupToken(" ") orelse return error.BadVocab);
}
@@ -127,12 +103,12 @@ fn mergeBestWordPair(self: Self, tokens: []usize, double_word_buffer: []u8) bool
}
var best_token: ?usize = null;
- var best_token_index: ?usize = null;
+ var best_index: ?usize = null;
var best_word_score = -std.math.floatMax(f32);
- for (0..tokens.len - 1) |token_index| {
- const word1 = self.vocab[tokens[token_index]];
- const word2 = self.vocab[tokens[token_index + 1]];
+ for (0..tokens.len - 1) |index| {
+ const word1 = self.vocab[tokens[index]];
+ const word2 = self.vocab[tokens[index + 1]];
@memcpy(double_word_buffer[0..word1.len], word1);
@memcpy(double_word_buffer[word1.len .. word1.len + word2.len], word2);
@@ -144,19 +120,19 @@ fn mergeBestWordPair(self: Self, tokens: []usize, double_word_buffer: []u8) bool
if (word_score > best_word_score) {
best_token = token;
- best_token_index = token_index;
+ best_index = index;
best_word_score = word_score;
}
}
- if (best_token_index) |token_index| {
+ if (best_index) |index| {
std.mem.copyForwards(
usize,
- tokens[token_index + 1 .. tokens.len - 1],
- tokens[token_index + 2 ..],
+ tokens[index + 1 .. tokens.len - 1],
+ tokens[index + 2 ..],
);
- tokens[token_index] = best_token.?;
+ tokens[index] = best_token.?;
return true;
}
@@ -217,118 +193,110 @@ const tinystories_260k_path = "models/tinystories_260k";
// https://github.com/karpathy/llama2.c/pull/226
// https://github.com/karpathy/llama2.c/pull/297
test "encode utf-8" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 365, 1691, 1018, 3963, 669, 29871, 31409, 30607, 30437, 30564 };
- const actual = try tokenizer.encode(std.testing.allocator, "Lets try ö & 株式会社");
-
- defer std.testing.allocator.free(actual);
+ const actual = try tokenizer.encode(arena.allocator(), "Lets try ö & 株式会社");
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "encode empty string" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{};
- const actual = try tokenizer.encode(std.testing.allocator, "");
-
- defer std.testing.allocator.free(actual);
+ const actual = try tokenizer.encode(arena.allocator(), "");
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "encode unknown codepoint" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 29871, 243, 149, 145, 154, 243, 150, 147, 144 };
- const actual = try tokenizer.encode(std.testing.allocator, "𒎗𓐍");
-
- defer std.testing.allocator.free(actual);
+ const actual = try tokenizer.encode(arena.allocator(), "𒎗𓐍");
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "encode single chars" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_260k_path, 512);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_260k_path, 512);
const expected = [_]usize{ 261, 430, 429, 418, 411, 431, 428, 415 };
- const actual = try tokenizer.encode(std.testing.allocator, "abcdefgh");
-
- defer std.testing.allocator.free(actual);
+ const actual = try tokenizer.encode(arena.allocator(), "abcdefgh");
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
// https://github.com/facebookresearch/llama/blob/ea9f33d6d3ea8ed7d560d270986407fd6c2e52b7/example_text_completion.py
test "meta encoding example 1" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 306, 4658, 278, 6593, 310, 2834, 338 };
- const actual = try tokenizer.encode(std.testing.allocator, "I believe the meaning of life is");
-
- defer std.testing.allocator.free(actual);
+ const actual = try tokenizer.encode(arena.allocator(), "I believe the meaning of life is");
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "meta encoding example 2" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 3439, 17632, 1925, 29892, 278, 6368, 310, 14215, 537, 5922, 393, 29871 };
const actual = try tokenizer.encode(
- std.testing.allocator,
+ arena.allocator(),
"Simply put, the theory of relativity states that ",
);
- defer std.testing.allocator.free(actual);
-
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "meta encoding example 3" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 319, 11473, 2643, 378, 629, 271, 18099, 278, 3815, 373, 278, 6826, 29901, 13, 13, 4706, 6324, 14332, 29892, 13, 13, 4706, 306, 925, 29871 };
const actual = try tokenizer.encode(
- std.testing.allocator,
+ arena.allocator(),
"A brief message congratulating the team on the launch:\n\n Hi everyone,\n\n I just ",
);
- defer std.testing.allocator.free(actual);
-
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "meta encoding example 4" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 4103, 9632, 4223, 304, 5176, 29901, 13, 13, 4706, 7205, 4932, 357, 1149, 301, 449, 276, 316, 2778, 13, 4706, 1236, 407, 837, 524, 1149, 6042, 354, 772, 440, 29878, 1318, 13, 4706, 715, 1878, 330, 3055, 1725, 1149, 330, 3055, 1725, 4639, 28754, 13, 4706, 923, 968, 1149 };
const actual = try tokenizer.encode(
- std.testing.allocator,
+ arena.allocator(),
"Translate English to French:\n\n sea otter => loutre de mer\n peppermint => menthe poivrée\n plush girafe => girafe peluche\n cheese =>",
);
- defer std.testing.allocator.free(actual);
-
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
diff --git a/src/transformer.zig b/src/transformer.zig
index e9d7efb..2d2d64b 100644
--- a/src/transformer.zig
+++ b/src/transformer.zig
@@ -4,88 +4,54 @@ const std = @import("std");
const Attention = @import("attention.zig");
const Checkpoint = @import("checkpoint.zig");
const FFN = @import("ffn.zig");
-const Tensor = @import("./tensor.zig").Tensor;
+const Vector = @import("vector.zig");
-allocator: std.mem.Allocator,
checkpoint: Checkpoint,
sequence_length: usize,
attention: Attention,
ffn: FFN,
-hidden_buffer: Tensor(1),
-output_buffer: Tensor(1),
+hidden: Vector,
+output: Vector,
-pub fn init(
+pub fn createLeaky(
allocator: std.mem.Allocator,
model_path: []const u8,
custom_sequence_length: usize,
) !Self {
- const checkpoint = try Checkpoint.init(allocator, model_path);
-
- errdefer checkpoint.deinit();
+ const checkpoint = try Checkpoint.readLeaky(allocator, model_path);
const sequence_length = if (custom_sequence_length == 0)
checkpoint.max_sequence_length
else
- custom_sequence_length;
-
- const attention = try Attention.init(allocator, checkpoint, sequence_length);
-
- errdefer attention.deinit();
-
- const ffn = try FFN.init(allocator, checkpoint);
-
- errdefer ffn.deinit();
-
- const hidden_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.embedding_size});
-
- errdefer hidden_buffer.deinit();
-
- const output_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.vocab_size});
-
- errdefer output_buffer.deinit();
+ @min(custom_sequence_length, checkpoint.max_sequence_length);
return .{
- .allocator = allocator,
.checkpoint = checkpoint,
.sequence_length = sequence_length,
- .attention = attention,
- .ffn = ffn,
- .hidden_buffer = hidden_buffer,
- .output_buffer = output_buffer,
+ .attention = try Attention.createLeaky(allocator, checkpoint, sequence_length),
+ .ffn = try FFN.createLeaky(allocator, checkpoint),
+ .hidden = try Vector.createLeaky(allocator, checkpoint.embedding_size),
+ .output = try Vector.createLeaky(allocator, checkpoint.vocab_size),
};
}
-pub fn deinit(self: Self) void {
- self.checkpoint.deinit();
- self.attention.deinit();
- self.ffn.deinit();
- self.hidden_buffer.deinit();
- self.output_buffer.deinit();
-}
-
-pub fn forward(self: Self, token: usize, position: usize) void {
- const weights = self.checkpoint.weights;
+pub fn forward(self: Self, token: usize, position: usize) !void {
+ const token_embedding_weight = self.checkpoint.token_embedding_weights[token];
- @memcpy(self.hidden_buffer.values, weights.token_embedding_vectors.slice(token).values);
+ @memcpy(self.hidden.values, token_embedding_weight.values);
for (0..self.checkpoint.n_layers) |layer| {
- self.hidden_buffer.computeRMSNorm(
- weights.attention_norm_vectors.slice(layer),
- self.attention.input_buffer,
- );
-
- self.attention.forward(layer, position);
- self.hidden_buffer.computeVectorAddition(self.attention.output_buffer);
-
- self.hidden_buffer.computeRMSNorm(
- weights.ffn_norm_vectors.slice(layer),
- self.ffn.input_buffer,
- );
-
- self.ffn.forward(layer);
- self.hidden_buffer.computeVectorAddition(self.ffn.output_buffer);
+ const attention_norm_weight = self.checkpoint.attention_norm_weights[layer];
+ const ffn_norm_weight = self.checkpoint.ffn_norm_weights[layer];
+
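+ // Pre-norm block: RMSNorm -> attention -> residual add, then RMSNorm -> FFN -> residual add.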
+ try self.hidden.computeRMSNorm(attention_norm_weight, self.attention.input);
+ try self.attention.forward(layer, position);
+ try self.hidden.addVector(self.attention.output);
+ try self.hidden.computeRMSNorm(ffn_norm_weight, self.ffn.input);
+ try self.ffn.forward(layer);
+ try self.hidden.addVector(self.ffn.output);
}
- self.hidden_buffer.computeRMSNorm(weights.output_norm_vector, self.hidden_buffer);
- weights.output_matrix.computeMatrixVectorMultiplication(self.hidden_buffer, self.output_buffer);
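+ // Final RMSNorm of the hidden state, then project it to vocabulary logits.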
+ try self.hidden.computeRMSNorm(self.checkpoint.output_norm_weight, self.hidden);
+ try self.checkpoint.output_weight.multiplyVector(self.hidden, self.output);
}
diff --git a/src/vector.zig b/src/vector.zig
new file mode 100644
index 0000000..3ce2a7d
--- /dev/null
+++ b/src/vector.zig
@@ -0,0 +1,60 @@
+const Self = @This();
+
+const std = @import("std");
+const simd = @import("simd.zig");
+
+values: []f32,
+
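+// "Leaky" allocations are never freed here; callers (e.g. the tests) pair them with an arena allocator.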
+pub fn createLeaky(allocator: std.mem.Allocator, n_values: usize) !Self {
+ return .{ .values = try allocator.alignedAlloc(f32, std.atomic.cache_line, n_values) };
+}
+
+pub fn createMultipleLeaky(
+ allocator: std.mem.Allocator,
+ n_vectors: usize,
+ n_values: usize,
+) ![]Self {
+ const vectors = try allocator.alloc(Self, n_vectors);
+
+ for (vectors) |*vector| {
+ vector.* = try createLeaky(allocator, n_values);
+ }
+
+ return vectors;
+}
+
+pub fn readLeaky(allocator: std.mem.Allocator, file: std.fs.File, n_values: usize) !Self {
+ const vector = try createLeaky(allocator, n_values);
+ const bytes: [*]u8 = @ptrCast(vector.values);
+
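+ // Read the raw f32 bytes directly into the aligned buffer.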
+ try file.reader().readNoEof(bytes[0 .. vector.values.len * @sizeOf(f32)]);
+
+ return vector;
+}
+
+pub fn readMultipleLeaky(
+ allocator: std.mem.Allocator,
+ file: std.fs.File,
+ n_vectors: usize,
+ n_values: usize,
+) ![]Self {
+ const vectors = try allocator.alloc(Self, n_vectors);
+
+ for (vectors) |*vector| {
+ vector.* = try readLeaky(allocator, file, n_values);
+ }
+
+ return vectors;
+}
+
+pub fn addVector(self: Self, other: Self) !void {
+ try simd.computeVectorAddition(self.values, other.values, self.values);
+}
+
+pub fn computeRMSNorm(self: Self, weight: Self, output: Self) !void {
+ try simd.computeRMSNorm(self.values, weight.values, output.values);
+}
+
+pub fn computeScalarProduct(self: Self, other: Self) !f32 {
+ return simd.computeScalarProduct(self.values, other.values);
+}