diff --git a/README.md b/README.md
index 77c8612..0c4664f 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This project is a port of Andrej Karpathy's [llama2.c](https://github.com/karpat
 
 ## Usage
 
-Build and run the `llama2-generator` for text generation:
+Build and run `llama2-generator`:
 
 ```sh
 zig build -Doptimize=ReleaseFast
@@ -23,10 +23,10 @@ Lily wanted to play with the ball, but it was too high up in the sky. She tried
 Lily found a stick and tried to hit the ball. But the stick was too short. She tried again and again, but she couldn't reach it. She felt sad. Suddenly, a kind man came by and saw Lily. He asked her what was wrong. Lily told him about the ball. The man smiled and said, "I have a useful idea!" He took out a long stick and used it to knock the ball down. Lily was so happy! She thanked the man and they played together in the sunshine.
 
-achieved: 726.974 tok/s
+achieved: 719.870 tok/s
 ```
 
-## Run Llama 2 from Hugging Face
+## Run Llama 2 7B from Hugging Face
 
 Install `git-lfs` and clone the [Llama 2 7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) model from Hugging Face:
@@ -43,7 +43,7 @@ pip3 install -r requirements.txt
 python3 convert_hf_model.py /path/to/Llama-2-7b-hf models/llama2_7b_hf
 
-Build and run the `llama2-generator` for text generation:
+Build and run `llama2-generator`:
 
 ```sh
 zig build -Doptimize=ReleaseFast
@@ -55,7 +55,40 @@ The output on an Apple M1 Pro with 32 GB of memory:
 
 ```
 Once Upon a Time in Hollywood is a 2019 American comedy-drama film written and directed by Quentin Tarantino
-achieved: 1.821 tok/s
+achieved: 1.800 tok/s
+```
+
+## Run Llama 2 7B Chat from Hugging Face
+
+Install `git-lfs` and clone the [Llama 2 7B Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) model from Hugging Face:
+
+```sh
+# Make sure you have git-lfs installed (https://git-lfs.com)
+git lfs install
+git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+```
+
+Install the necessary Python packages and convert the Hugging Face model:
+
+```sh
+pip3 install -r requirements.txt
+python3 convert_hf_model.py /path/to/Llama-2-7b-chat-hf models/llama2_7b_chat_hf
+```
+
+Build and run `llama2-chat`:
+
+```sh
+zig build -Doptimize=ReleaseFast
+./zig-out/bin/llama2-chat models/llama2_7b_chat_hf
+```
+
+The output on an Apple M1 Pro with 32 GB of memory:
+
+```
+Enter system prompt (optional):
+User: Hello
+Assistant: Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?
+User: ...
``` ## Help diff --git a/build.zig b/build.zig index c04fcbc..ad8280f 100644 --- a/build.zig +++ b/build.zig @@ -4,13 +4,6 @@ pub fn build(b: *std.Build) void { const target = b.standardTargetOptions(.{}); const optimize = b.standardOptimizeOption(.{}); - const chat_exe = b.addExecutable(.{ - .name = "llama2-chat", - .root_source_file = .{ .path = "src/chat_main.zig" }, - .target = target, - .optimize = optimize, - }); - const generator_exe = b.addExecutable(.{ .name = "llama2-generator", .root_source_file = .{ .path = "src/generator_main.zig" }, @@ -18,63 +11,54 @@ pub fn build(b: *std.Build) void { .optimize = optimize, }); - const converter_exe = b.addExecutable(.{ - .name = "llama2-converter", - .root_source_file = .{ .path = "src/converter_main.zig" }, + const chat_exe = b.addExecutable(.{ + .name = "llama2-chat", + .root_source_file = .{ .path = "src/chat_main.zig" }, .target = target, .optimize = optimize, }); const build_options = b.addOptions(); - chat_exe.addOptions("build_options", build_options); generator_exe.addOptions("build_options", build_options); - converter_exe.addOptions("build_options", build_options); + chat_exe.addOptions("build_options", build_options); // This declares intent for the executable to be installed into the // standard location when the user invokes the "install" step (the default // step when running `zig build`). - b.installArtifact(chat_exe); b.installArtifact(generator_exe); - b.installArtifact(converter_exe); + b.installArtifact(chat_exe); // This *creates* a Run step in the build graph, to be executed when another // step is evaluated that depends on it. The next line below will establish // such a dependency. - const run_chat_cmd = b.addRunArtifact(chat_exe); const run_generator_cmd = b.addRunArtifact(generator_exe); - const run_converter_cmd = b.addRunArtifact(converter_exe); + const run_chat_cmd = b.addRunArtifact(chat_exe); // By making the run step depend on the install step, it will be run from the // installation directory rather than directly from within the cache directory. // This is not necessary, however, if the application depends on other installed // files, this ensures they will be present and in the expected location. - run_chat_cmd.step.dependOn(b.getInstallStep()); run_generator_cmd.step.dependOn(b.getInstallStep()); - run_converter_cmd.step.dependOn(b.getInstallStep()); + run_chat_cmd.step.dependOn(b.getInstallStep()); // This allows the user to pass arguments to the application in the build // command itself, like this: `zig build run -- arg1 arg2 etc` if (b.args) |args| { - run_chat_cmd.addArgs(args); run_generator_cmd.addArgs(args); - run_converter_cmd.addArgs(args); + run_chat_cmd.addArgs(args); } // This creates a build step. It will be visible in the `zig build --help` menu, // and can be selected like this: `zig build run` // This will evaluate the `run` step rather than the default, which is "install". 
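    // As a sketch of how the steps declared below are used (the model paths are
    // illustrative placeholders taken from the README and tests), the generator
    // and chat executables can be built and launched through the build system:
    //
    //   zig build run-generator -- models/tinystories_260k
    //   zig build run-chat -- models/llama2_7b_chat_hf
    //
    // Everything after `--` is forwarded to the executable as its own arguments.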
- const run_chat_step = b.step("run-chat", "Run the chat"); - - run_chat_step.dependOn(&run_chat_cmd.step); - const run_generator_step = b.step("run-generator", "Run the generator"); run_generator_step.dependOn(&run_generator_cmd.step); - const run_converter_step = b.step("run-converter", "Run the converter"); + const run_chat_step = b.step("run-chat", "Run the chat"); - run_converter_step.dependOn(&run_converter_cmd.step); + run_chat_step.dependOn(&run_chat_cmd.step); const test_step = b.step("test", "Run unit tests"); diff --git a/src/attention.zig b/src/attention.zig index 43c7df3..2065bbe 100644 --- a/src/attention.zig +++ b/src/attention.zig @@ -2,111 +2,91 @@ const Self = @This(); const std = @import("std"); const Checkpoint = @import("checkpoint.zig"); -const math = @import("./math.zig"); -const Tensor = @import("./tensor.zig").Tensor; +const math = @import("math.zig"); +const simd = @import("simd.zig"); +const Vector = @import("vector.zig"); -allocator: std.mem.Allocator, checkpoint: Checkpoint, head_size: usize, head_size_sqrt: f32, -input_buffer: Tensor(2), -output_buffer: Tensor(1), -query_buffer: Tensor(2), -key_cache: Tensor(4), -value_cache: Tensor(4), +input: Vector, +output: Vector, +multi_query: Vector, +key_cache: []const []const Vector, +value_cache: []const []const Vector, scores: []f32, -pub fn init(allocator: std.mem.Allocator, checkpoint: Checkpoint, sequence_length: usize) !Self { - const embedding_size = checkpoint.embedding_size; - const n_attention_heads = checkpoint.n_attention_heads; - const head_size: usize = embedding_size / n_attention_heads; - const input_buffer = try Tensor(2).init(allocator, [_]usize{ n_attention_heads, head_size }); - - errdefer input_buffer.deinit(); - - const output_buffer = try Tensor(1).init(allocator, [_]usize{embedding_size}); - - errdefer output_buffer.deinit(); - - const query_buffer = try Tensor(2).init(allocator, [_]usize{ n_attention_heads, head_size }); - - errdefer query_buffer.deinit(); - - const n_layers = checkpoint.n_layers; - const n_attention_query_groups = checkpoint.n_attention_query_groups; - - const key_cache = try Tensor(4).init( - allocator, - [_]usize{ n_layers, sequence_length, n_attention_query_groups, head_size }, - ); - - errdefer key_cache.deinit(); - - const value_cache = try Tensor(4).init( - allocator, - [_]usize{ n_layers, sequence_length, n_attention_query_groups, head_size }, - ); - - errdefer value_cache.deinit(); +pub fn createLeaky( + allocator: std.mem.Allocator, + checkpoint: Checkpoint, + sequence_length: usize, +) !Self { + const head_size = checkpoint.embedding_size / checkpoint.n_attention_heads; + const key_cache = try allocator.alloc([]Vector, checkpoint.n_layers); + + for (key_cache) |*layer| { + layer.* = try Vector.createMultipleLeaky( + allocator, + sequence_length, + checkpoint.n_attention_query_groups * head_size, + ); + } - const scores = try allocator.alloc(f32, sequence_length); + const value_cache = try allocator.alloc([]Vector, checkpoint.n_layers); - errdefer allocator.free(scores); + for (value_cache) |*layer| { + layer.* = try Vector.createMultipleLeaky( + allocator, + sequence_length, + checkpoint.n_attention_query_groups * head_size, + ); + } return .{ - .allocator = allocator, .checkpoint = checkpoint, .head_size = head_size, .head_size_sqrt = std.math.sqrt(@as(f32, @floatFromInt(head_size))), - .input_buffer = input_buffer, - .output_buffer = output_buffer, - .query_buffer = query_buffer, + .input = try Vector.createLeaky(allocator, checkpoint.embedding_size), + .output = 
try Vector.createLeaky(allocator, checkpoint.embedding_size), + .multi_query = try Vector.createLeaky(allocator, checkpoint.embedding_size), .key_cache = key_cache, .value_cache = value_cache, - .scores = scores, + .scores = try allocator.alloc(f32, sequence_length), }; } -pub fn deinit(self: Self) void { - self.input_buffer.deinit(); - self.output_buffer.deinit(); - self.query_buffer.deinit(); - self.key_cache.deinit(); - self.value_cache.deinit(); - self.allocator.free(self.scores); -} - -pub fn forward(self: Self, layer: usize, position: usize) void { - const weights = self.checkpoint.weights; - const query_matrix = weights.attention_query_matrices.slice(layer); - const key_matrix = weights.attention_key_matrices.slice(layer); - const value_matrix = weights.attention_value_matrices.slice(layer); - const output_matrix = weights.attention_output_matrices.slice(layer); - const key_buffer = self.key_cache.slice(layer).slice(position); - const value_buffer = self.value_cache.slice(layer).slice(position); +pub fn forward(self: Self, layer: usize, position: usize) !void { + const query_weight = self.checkpoint.attention_query_weights[layer]; + const key_weight = self.checkpoint.attention_key_weights[layer]; + const value_weight = self.checkpoint.attention_value_weights[layer]; + const output_weight = self.checkpoint.attention_output_weights[layer]; + const multi_key = self.key_cache[layer][position]; + const multi_value = self.value_cache[layer][position]; - query_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.query_buffer); - key_matrix.computeMatrixVectorMultiplication(self.input_buffer, key_buffer); - value_matrix.computeMatrixVectorMultiplication(self.input_buffer, value_buffer); + try query_weight.multiplyVector(self.input, self.multi_query); + try key_weight.multiplyVector(self.input, multi_key); + try value_weight.multiplyVector(self.input, multi_value); - self.computeRoPE(position, key_buffer); + self.computeRoPE(position, multi_key.values); for (0..self.checkpoint.n_attention_heads) |head| { - self.computeGQA(layer, position, head); + try self.computeGQA(layer, position, head); } - output_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.output_buffer); + try output_weight.multiplyVector(self.input, self.output); } // Rotary positional embeddings: https://arxiv.org/abs/2104.09864 -fn computeRoPE(self: Self, position: usize, key_buffer: Tensor(2)) void { +fn computeRoPE(self: Self, position: usize, multi_key_values: []f32) void { @setFloatMode(.Optimized); - std.debug.assert(self.query_buffer.values.len % key_buffer.values.len == 0); + const multi_query_values = self.multi_query.values; + + std.debug.assert(multi_query_values.len % multi_key_values.len == 0); var index: usize = 0; - while (index < self.query_buffer.values.len) : (index += 2) { + while (index < multi_query_values.len) : (index += 2) { const head: f32 = @floatFromInt(index % self.head_size); const frequency = @@ -116,27 +96,27 @@ fn computeRoPE(self: Self, position: usize, key_buffer: Tensor(2)) void { const real_rotation_value: f32 = std.math.cos(rotation_scaling_factor); const imag_rotation_value: f32 = std.math.sin(rotation_scaling_factor); - const q_0 = self.query_buffer.values[index]; - const q_1 = self.query_buffer.values[index + 1]; + const q_0 = multi_query_values[index]; + const q_1 = multi_query_values[index + 1]; - self.query_buffer.values[index] = q_0 * real_rotation_value - q_1 * imag_rotation_value; - self.query_buffer.values[index + 1] = q_0 * imag_rotation_value + q_1 * 
real_rotation_value; + multi_query_values[index] = q_0 * real_rotation_value - q_1 * imag_rotation_value; + multi_query_values[index + 1] = q_0 * imag_rotation_value + q_1 * real_rotation_value; - if (index < key_buffer.values.len) { - const k_0 = key_buffer.values[index]; - const k_1 = key_buffer.values[index + 1]; + if (index < multi_key_values.len) { + const k_0 = multi_key_values[index]; + const k_1 = multi_key_values[index + 1]; - key_buffer.values[index] = k_0 * real_rotation_value - k_1 * imag_rotation_value; - key_buffer.values[index + 1] = k_0 * imag_rotation_value + k_1 * real_rotation_value; + multi_key_values[index] = k_0 * real_rotation_value - k_1 * imag_rotation_value; + multi_key_values[index + 1] = k_0 * imag_rotation_value + k_1 * real_rotation_value; } } } // Grouped-query attention: https://arxiv.org/abs/2305.13245v1 -fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) void { +fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) !void { @setFloatMode(.Optimized); - const query_vector = self.query_buffer.slice(head); + const query_values = self.multi_query.values[head * self.head_size ..][0..self.head_size]; const query_group = head / (self.checkpoint.n_attention_heads / self.checkpoint.n_attention_query_groups); @@ -144,23 +124,26 @@ fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) vo const next_position = current_position + 1; for (0..next_position) |position| { - const key_vector = self.key_cache.slice(layer).slice(position).slice(query_group); + const multi_key = self.key_cache[layer][position]; + const key_values = multi_key.values[query_group * self.head_size ..][0..self.head_size]; - self.scores[position] = query_vector.computeScalarProduct(key_vector) / self.head_size_sqrt; + self.scores[position] = + try simd.computeScalarProduct(query_values, key_values) / self.head_size_sqrt; } math.softmax(self.scores[0..next_position]); - const attention_buffer = self.input_buffer.slice(head); + const attention_values = self.input.values[head * self.head_size ..][0..self.head_size]; - @memset(attention_buffer.values, 0); + @memset(attention_values, 0); for (0..next_position) |position| { - const value_vector = self.value_cache.slice(layer).slice(position).slice(query_group); + const multi_value = self.value_cache[layer][position]; + const value_values = multi_value.values[query_group * self.head_size ..][0..self.head_size]; const weight = self.scores[position]; for (0..self.head_size) |index| { - attention_buffer.values[index] += value_vector.values[index] * weight; + attention_values[index] += value_values[index] * weight; } } } diff --git a/src/chat.zig b/src/chat.zig index f7290e5..1953973 100644 --- a/src/chat.zig +++ b/src/chat.zig @@ -7,43 +7,25 @@ const Sampler = @import("sampler.zig"); const Tokenizer = @import("tokenizer.zig"); const Transformer = @import("transformer.zig"); -allocator: std.mem.Allocator, transformer: Transformer, tokenizer: Tokenizer, sampler: Sampler, system_prompt: []const u8, user_prompt: []const u8, -pub fn init(allocator: std.mem.Allocator, args: ChatArgs) !Self { - const transformer = try Transformer.init(allocator, args.model_path, args.sequence_length); - - errdefer transformer.deinit(); - +pub fn createLeaky(allocator: std.mem.Allocator, args: ChatArgs) !Self { + const transformer = try Transformer.createLeaky(allocator, args.model_path, args.sequence_length); const vocab_size = transformer.checkpoint.vocab_size; - const tokenizer = try 
Tokenizer.init(allocator, args.model_path, vocab_size); - - errdefer tokenizer.deinit(); - - const sampler = try Sampler.init(allocator, args, vocab_size); - - errdefer sampler.deinit(); return .{ - .allocator = allocator, .transformer = transformer, - .tokenizer = tokenizer, - .sampler = sampler, + .tokenizer = try Tokenizer.readLeaky(allocator, args.model_path, vocab_size), + .sampler = try Sampler.createLeaky(allocator, args, vocab_size), .system_prompt = args.system_prompt, .user_prompt = args.user_prompt, }; } -pub fn deinit(self: Self) void { - self.transformer.deinit(); - self.tokenizer.deinit(); - self.sampler.deinit(); -} - const system_prompt_template_start = "<>\n"; const system_prompt_template_close = "\n<>\n\n"; const user_prompt_template_start = "[INST] "; @@ -68,7 +50,7 @@ pub fn start(self: *Self, allocator: std.mem.Allocator) !void { }; for (0..self.transformer.sequence_length) |position| { - self.transformer.forward(token, position); + try self.transformer.forward(token, position); if (token == bos_token and user_turn) { var user_prompt = std.ArrayList(u8).init(allocator); @@ -129,7 +111,7 @@ pub fn start(self: *Self, allocator: std.mem.Allocator) !void { user_prompt_tokens_index += 1; if (next_token == 0) { - next_token = self.sampler.sample(self.transformer.output_buffer.values); + next_token = self.sampler.sample(self.transformer.output.values); } if (next_token == eos_token) { diff --git a/src/chat_args.zig b/src/chat_args.zig index 040bda4..b38e9c7 100644 --- a/src/chat_args.zig +++ b/src/chat_args.zig @@ -2,7 +2,6 @@ const Self = @This(); const std = @import("std"); -arg_iterator: std.process.ArgIterator, model_path: []const u8, temperature: f32, top_p: f32, @@ -20,11 +19,9 @@ const Option = enum { user_prompt, }; -pub fn init(allocator: std.mem.Allocator) !Self { +pub fn createLeaky(allocator: std.mem.Allocator) !Self { var arg_iterator = try std.process.argsWithAllocator(allocator); - errdefer arg_iterator.deinit(); - _ = arg_iterator.next().?; const model_path = arg_iterator.next() orelse try help(1); @@ -78,7 +75,6 @@ pub fn init(allocator: std.mem.Allocator) !Self { } return .{ - .arg_iterator = arg_iterator, .model_path = model_path, .temperature = @max(@min(temperature orelse 1, 1), 0), .top_p = @max(@min(top_p orelse 0.9, 1), 0), @@ -89,10 +85,6 @@ pub fn init(allocator: std.mem.Allocator) !Self { }; } -pub fn deinit(self: *Self) void { - self.arg_iterator.deinit(); -} - fn help(exit_status: u8) !noreturn { const console = if (exit_status == 0) std.io.getStdOut().writer() diff --git a/src/chat_main.zig b/src/chat_main.zig index 4f35682..fffe5ed 100644 --- a/src/chat_main.zig +++ b/src/chat_main.zig @@ -3,15 +3,13 @@ const Chat = @import("chat.zig"); const ChatArgs = @import("chat_args.zig"); pub fn main() !void { - const allocator = std.heap.page_allocator; + var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); - var args = try ChatArgs.init(allocator); + defer arena.deinit(); - defer args.deinit(); + const args = try ChatArgs.createLeaky(arena.allocator()); - var chat = try Chat.init(allocator, args); + var chat = try Chat.createLeaky(arena.allocator(), args); - defer chat.deinit(); - - try chat.start(allocator); + try chat.start(arena.allocator()); } diff --git a/src/checkpoint.zig b/src/checkpoint.zig index fe5da8a..b7e6993 100644 --- a/src/checkpoint.zig +++ b/src/checkpoint.zig @@ -1,9 +1,9 @@ const Self = @This(); const std = @import("std"); -const Tensor = @import("./tensor.zig").Tensor; +const Matrix = @import("matrix.zig"); +const 
Vector = @import("vector.zig"); -allocator: std.mem.Allocator, embedding_size: usize, ffn_hidden_size: usize, n_layers: usize, @@ -11,55 +11,21 @@ n_attention_heads: usize, n_attention_query_groups: usize, vocab_size: usize, max_sequence_length: usize, -shared_output_matrix: bool, -weights: struct { - token_embedding_vectors: Tensor(2), - attention_norm_vectors: Tensor(2), - attention_query_matrices: Tensor(3), - attention_key_matrices: Tensor(3), - attention_value_matrices: Tensor(3), - attention_output_matrices: Tensor(3), - ffn_norm_vectors: Tensor(2), - ffn_gate_matrices: Tensor(3), - ffn_down_matrices: Tensor(3), - ffn_up_matrices: Tensor(3), - output_norm_vector: Tensor(1), - output_matrix: Tensor(2), -}, - -pub fn init(allocator: std.mem.Allocator, model_path: []const u8) !Self { - const v1_path = try std.fs.path.join( - allocator, - &[_][]const u8{ model_path, "checkpoint_v1.bin" }, - ); - - defer allocator.free(v1_path); - - const v1_file = std.fs.cwd().openFile(v1_path, .{}) catch null; - - defer if (v1_file) |file| file.close(); - - if (v1_file) |file| return try readV1(allocator, file); - - const legacy_path = try std.fs.path.join( - allocator, - &[_][]const u8{ model_path, "checkpoint_legacy.bin" }, - ); - - defer allocator.free(legacy_path); - - const legacy_file = std.fs.cwd().openFile(legacy_path, .{}) catch null; - - defer if (legacy_file) |file| file.close(); - - if (legacy_file) |file| return try readLegacy(allocator, file); - - return error.CheckpointFileNotFound; -} - -// https://github.com/karpathy/llama2.c/blob/d9862069e7ef665fe6309e3c17398ded2f121bf5/export.py#L132 -pub fn writeV1(self: Self, allocator: std.mem.Allocator, model_path: []const u8) !void { +token_embedding_weights: []const Vector, +attention_norm_weights: []const Vector, +attention_query_weights: []const Matrix, +attention_key_weights: []const Matrix, +attention_value_weights: []const Matrix, +attention_output_weights: []const Matrix, +ffn_norm_weights: []const Vector, +ffn_gate_weights: []const Matrix, +ffn_down_weights: []const Matrix, +ffn_up_weights: []const Matrix, +output_norm_weight: Vector, +output_weight: Matrix, + +pub fn readLeaky(allocator: std.mem.Allocator, model_path: []const u8) !Self { const path = try std.fs.path.join( allocator, &[_][]const u8{ model_path, "checkpoint_v1.bin" }, @@ -67,49 +33,15 @@ pub fn writeV1(self: Self, allocator: std.mem.Allocator, model_path: []const u8) defer allocator.free(path); - const file = try std.fs.cwd().createFile(path, .{ .truncate = true }); + const file = try std.fs.cwd().openFile(path, .{}); defer file.close(); - try file.writer().writeIntLittle(u32, 0x616b3432); - try file.writer().writeIntLittle(i32, 1); - try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.embedding_size))); - try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.ffn_hidden_size))); - try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.n_layers))); - try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.n_attention_heads))); - try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.n_attention_query_groups))); - try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.vocab_size))); - try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.max_sequence_length))); - try file.writer().writeIntLittle(u8, @as(u8, @intFromBool(self.shared_output_matrix))); - try file.writer().writeByteNTimes(0, 256 - try file.getPos()); - try self.weights.attention_norm_vectors.write(file); - try self.weights.ffn_norm_vectors.write(file); - try 
self.weights.output_norm_vector.write(file); - try self.weights.token_embedding_vectors.write(file); - try self.weights.attention_query_matrices.write(file); - try self.weights.attention_key_matrices.write(file); - try self.weights.attention_value_matrices.write(file); - try self.weights.attention_output_matrices.write(file); - try self.weights.ffn_gate_matrices.write(file); - try self.weights.ffn_down_matrices.write(file); - try self.weights.ffn_up_matrices.write(file); - - if (!self.shared_output_matrix) { - try self.weights.output_matrix.write(file); - } -} - -// https://github.com/karpathy/llama2.c/blob/d9862069e7ef665fe6309e3c17398ded2f121bf5/export.py#L132 -fn readV1(allocator: std.mem.Allocator, file: std.fs.File) !Self { - const magic = try file.reader().readIntLittle(u32); - - if (magic != 0x616b3432) { + if (try file.reader().readIntLittle(u32) != 0x616b3432) { return error.InvalidMagic; } - const version = try file.reader().readIntLittle(i32); - - if (version != 1) { + if (try file.reader().readIntLittle(i32) != 1) { return error.InvalidVersion; } @@ -124,111 +56,93 @@ fn readV1(allocator: std.mem.Allocator, file: std.fs.File) !Self { try file.seekTo(256); - const attention_norm_vectors = try Tensor(2).init( + const attention_norm_weights = try Vector.readMultipleLeaky( allocator, - [_]usize{ n_layers, embedding_size }, + file, + n_layers, + embedding_size, ); - errdefer attention_norm_vectors.deinit(); - try attention_norm_vectors.read(file); - - const ffn_norm_vectors = try Tensor(2).init( + const ffn_norm_weights = try Vector.readMultipleLeaky( allocator, - [_]usize{ n_layers, embedding_size }, + file, + n_layers, + embedding_size, ); - errdefer ffn_norm_vectors.deinit(); - try ffn_norm_vectors.read(file); + const output_norm_weight = try Vector.readLeaky(allocator, file, embedding_size); - const output_norm_vector = try Tensor(1).init( + const token_embedding_weights = try Vector.readMultipleLeaky( allocator, - [_]usize{embedding_size}, + file, + vocab_size, + embedding_size, ); - errdefer output_norm_vector.deinit(); - try output_norm_vector.read(file); - - const token_embedding_vectors = try Tensor(2).init( - allocator, - [_]usize{ vocab_size, embedding_size }, - ); - - errdefer token_embedding_vectors.deinit(); - try token_embedding_vectors.read(file); - - const attention_query_matrices = try Tensor(3).init( + const attention_query_weights = try Matrix.readMultipleLeaky( allocator, - [_]usize{ n_layers, embedding_size, embedding_size }, + file, + n_layers, + embedding_size, + embedding_size, ); - errdefer attention_query_matrices.deinit(); - try attention_query_matrices.read(file); - const attention_head_size: usize = embedding_size / n_attention_heads; - const attention_key_matrices = try Tensor(3).init( + const attention_key_weights = try Matrix.readMultipleLeaky( allocator, - [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size }, + file, + n_layers, + n_attention_query_groups * attention_head_size, + embedding_size, ); - errdefer attention_key_matrices.deinit(); - try attention_key_matrices.read(file); - - const attention_value_matrices = try Tensor(3).init( + const attention_value_weights = try Matrix.readMultipleLeaky( allocator, - [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size }, + file, + n_layers, + n_attention_query_groups * attention_head_size, + embedding_size, ); - errdefer attention_value_matrices.deinit(); - try attention_value_matrices.read(file); - - const attention_output_matrices = try 
Tensor(3).init( + const attention_output_weights = try Matrix.readMultipleLeaky( allocator, - [_]usize{ n_layers, embedding_size, embedding_size }, + file, + n_layers, + embedding_size, + embedding_size, ); - errdefer attention_output_matrices.deinit(); - try attention_output_matrices.read(file); - - const ffn_gate_matrices = try Tensor(3).init( + const ffn_gate_weights = try Matrix.readMultipleLeaky( allocator, - [_]usize{ n_layers, ffn_hidden_size, embedding_size }, + file, + n_layers, + ffn_hidden_size, + embedding_size, ); - errdefer ffn_gate_matrices.deinit(); - try ffn_gate_matrices.read(file); - - const ffn_down_matrices = try Tensor(3).init( + const ffn_down_weights = try Matrix.readMultipleLeaky( allocator, - [_]usize{ n_layers, embedding_size, ffn_hidden_size }, + file, + n_layers, + embedding_size, + ffn_hidden_size, ); - errdefer ffn_down_matrices.deinit(); - try ffn_down_matrices.read(file); - - const ffn_up_matrices = try Tensor(3).init( + const ffn_up_weights = try Matrix.readMultipleLeaky( allocator, - [_]usize{ n_layers, ffn_hidden_size, embedding_size }, + file, + n_layers, + ffn_hidden_size, + embedding_size, ); - errdefer ffn_up_matrices.deinit(); - try ffn_up_matrices.read(file); - - const output_matrix = if (shared_output_matrix) - token_embedding_vectors + const output_weight = if (shared_output_matrix) + Matrix{ .rows = token_embedding_weights } else - try Tensor(2).init(allocator, [_]usize{ vocab_size, embedding_size }); - - errdefer if (!shared_output_matrix) { - output_matrix.deinit(); - }; - - if (!shared_output_matrix) { - try output_matrix.read(file); - } + try Matrix.readLeaky(allocator, file, vocab_size, embedding_size); return .{ - .allocator = allocator, .embedding_size = embedding_size, .ffn_hidden_size = ffn_hidden_size, .n_layers = n_layers, @@ -236,187 +150,18 @@ fn readV1(allocator: std.mem.Allocator, file: std.fs.File) !Self { .n_attention_query_groups = n_attention_query_groups, .vocab_size = vocab_size, .max_sequence_length = max_sequence_length, - .shared_output_matrix = shared_output_matrix, - .weights = .{ - .token_embedding_vectors = token_embedding_vectors, - .attention_norm_vectors = attention_norm_vectors, - .attention_query_matrices = attention_query_matrices, - .attention_key_matrices = attention_key_matrices, - .attention_value_matrices = attention_value_matrices, - .attention_output_matrices = attention_output_matrices, - .ffn_norm_vectors = ffn_norm_vectors, - .ffn_gate_matrices = ffn_gate_matrices, - .ffn_down_matrices = ffn_down_matrices, - .ffn_up_matrices = ffn_up_matrices, - .output_norm_vector = output_norm_vector, - .output_matrix = output_matrix, - }, + .token_embedding_weights = token_embedding_weights, + .attention_norm_weights = attention_norm_weights, + .attention_query_weights = attention_query_weights, + .attention_key_weights = attention_key_weights, + .attention_value_weights = attention_value_weights, + .attention_output_weights = attention_output_weights, + .ffn_norm_weights = ffn_norm_weights, + .ffn_gate_weights = ffn_gate_weights, + .ffn_down_weights = ffn_down_weights, + .ffn_up_weights = ffn_up_weights, + .output_norm_weight = output_norm_weight, + .output_weight = output_weight, }; } - -// https://github.com/karpathy/llama2.c/blob/d9862069e7ef665fe6309e3c17398ded2f121bf5/export.py#L75 -fn readLegacy(allocator: std.mem.Allocator, file: std.fs.File) !Self { - const embedding_size: usize = @intCast(try file.reader().readIntLittle(i32)); - const ffn_hidden_size: usize = @intCast(try file.reader().readIntLittle(i32)); 
- const n_layers: usize = @intCast(try file.reader().readIntLittle(i32)); - const n_attention_heads: usize = @intCast(try file.reader().readIntLittle(i32)); - const n_attention_query_groups: usize = @intCast(try file.reader().readIntLittle(i32)); - - // https://github.com/karpathy/llama2.c/blob/35deb5e0fa55f0a257040bcf1624ed8386e63dc7/run.c#L153 - const signed_vocab_size = try file.reader().readIntLittle(i32); - const shared_output_matrix = signed_vocab_size > 0; - - const vocab_size: usize = @abs(signed_vocab_size); - const max_sequence_length: usize = @intCast(try file.reader().readIntLittle(i32)); - - const token_embedding_vectors = try Tensor(2).init( - allocator, - [_]usize{ vocab_size, embedding_size }, - ); - - errdefer token_embedding_vectors.deinit(); - try token_embedding_vectors.read(file); - - const attention_norm_vectors = try Tensor(2).init( - allocator, - [_]usize{ n_layers, embedding_size }, - ); - - errdefer attention_norm_vectors.deinit(); - try attention_norm_vectors.read(file); - - const attention_query_matrices = try Tensor(3).init( - allocator, - [_]usize{ n_layers, embedding_size, embedding_size }, - ); - - errdefer attention_query_matrices.deinit(); - try attention_query_matrices.read(file); - - const attention_head_size: usize = embedding_size / n_attention_heads; - - const attention_key_matrices = try Tensor(3).init( - allocator, - [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size }, - ); - - errdefer attention_key_matrices.deinit(); - try attention_key_matrices.read(file); - - const attention_value_matrices = try Tensor(3).init( - allocator, - [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size }, - ); - - errdefer attention_value_matrices.deinit(); - try attention_value_matrices.read(file); - - const attention_output_matrices = try Tensor(3).init( - allocator, - [_]usize{ n_layers, embedding_size, embedding_size }, - ); - - errdefer attention_output_matrices.deinit(); - try attention_output_matrices.read(file); - - const ffn_norm_vectors = try Tensor(2).init( - allocator, - [_]usize{ n_layers, embedding_size }, - ); - - errdefer ffn_norm_vectors.deinit(); - try ffn_norm_vectors.read(file); - - const ffn_gate_matrices = try Tensor(3).init( - allocator, - [_]usize{ n_layers, ffn_hidden_size, embedding_size }, - ); - - errdefer ffn_gate_matrices.deinit(); - try ffn_gate_matrices.read(file); - - const ffn_down_matrices = try Tensor(3).init( - allocator, - [_]usize{ n_layers, embedding_size, ffn_hidden_size }, - ); - - errdefer ffn_down_matrices.deinit(); - try ffn_down_matrices.read(file); - - const ffn_up_matrices = try Tensor(3).init( - allocator, - [_]usize{ n_layers, ffn_hidden_size, embedding_size }, - ); - - errdefer ffn_up_matrices.deinit(); - try ffn_up_matrices.read(file); - - const output_norm_vector = try Tensor(1).init( - allocator, - [_]usize{embedding_size}, - ); - - errdefer output_norm_vector.deinit(); - try output_norm_vector.read(file); - - try file.seekBy(@intCast(max_sequence_length * attention_head_size * @sizeOf(f32))); - - const output_matrix = if (shared_output_matrix) - token_embedding_vectors - else - try Tensor(2).init(allocator, [_]usize{ vocab_size, embedding_size }); - - errdefer if (!shared_output_matrix) { - output_matrix.deinit(); - }; - - if (!shared_output_matrix) { - try output_matrix.read(file); - } - - return .{ - .allocator = allocator, - .embedding_size = embedding_size, - .ffn_hidden_size = ffn_hidden_size, - .n_layers = n_layers, - .n_attention_heads = 
n_attention_heads, - .n_attention_query_groups = n_attention_query_groups, - .vocab_size = vocab_size, - .max_sequence_length = max_sequence_length, - .shared_output_matrix = shared_output_matrix, - - .weights = .{ - .token_embedding_vectors = token_embedding_vectors, - .attention_norm_vectors = attention_norm_vectors, - .attention_query_matrices = attention_query_matrices, - .attention_key_matrices = attention_key_matrices, - .attention_value_matrices = attention_value_matrices, - .attention_output_matrices = attention_output_matrices, - .ffn_norm_vectors = ffn_norm_vectors, - .ffn_gate_matrices = ffn_gate_matrices, - .ffn_down_matrices = ffn_down_matrices, - .ffn_up_matrices = ffn_up_matrices, - .output_norm_vector = output_norm_vector, - .output_matrix = output_matrix, - }, - }; -} - -pub fn deinit(self: Self) void { - self.weights.token_embedding_vectors.deinit(); - self.weights.attention_norm_vectors.deinit(); - self.weights.attention_query_matrices.deinit(); - self.weights.attention_key_matrices.deinit(); - self.weights.attention_value_matrices.deinit(); - self.weights.attention_output_matrices.deinit(); - self.weights.ffn_norm_vectors.deinit(); - self.weights.ffn_gate_matrices.deinit(); - self.weights.ffn_down_matrices.deinit(); - self.weights.ffn_up_matrices.deinit(); - self.weights.output_norm_vector.deinit(); - - if (!self.shared_output_matrix) { - self.weights.output_matrix.deinit(); - } -} diff --git a/src/converter_args.zig b/src/converter_args.zig deleted file mode 100644 index c086b7c..0000000 --- a/src/converter_args.zig +++ /dev/null @@ -1,40 +0,0 @@ -const Self = @This(); - -const std = @import("std"); - -arg_iterator: std.process.ArgIterator, -model_path: []const u8, - -pub fn init(allocator: std.mem.Allocator) !Self { - var arg_iterator = try std.process.argsWithAllocator(allocator); - - errdefer arg_iterator.deinit(); - - _ = arg_iterator.next().?; - - const model_path = arg_iterator.next() orelse try help(1); - - while (arg_iterator.next()) |arg| { - try help(if (std.mem.eql(u8, arg, "--help")) 0 else 1); - } - - return .{ .arg_iterator = arg_iterator, .model_path = model_path }; -} - -pub fn deinit(self: *Self) void { - self.arg_iterator.deinit(); -} - -fn help(exit_status: u8) !noreturn { - const console = if (exit_status == 0) - std.io.getStdOut().writer() - else - std.io.getStdErr().writer(); - - try console.print("Usage: llama2-converter [options]\n\n", .{}); - - try console.print("Options:\n", .{}); - try console.print(" --help\n", .{}); - - std.process.exit(exit_status); -} diff --git a/src/converter_main.zig b/src/converter_main.zig deleted file mode 100644 index eeba13f..0000000 --- a/src/converter_main.zig +++ /dev/null @@ -1,17 +0,0 @@ -const std = @import("std"); -const Checkpoint = @import("checkpoint.zig"); -const ConverterArgs = @import("converter_args.zig"); - -pub fn main() !void { - const allocator = std.heap.page_allocator; - - var args = try ConverterArgs.init(allocator); - - defer args.deinit(); - - const checkpoint = try Checkpoint.init(allocator, args.model_path); - - defer checkpoint.deinit(); - - try checkpoint.writeV1(allocator, args.model_path); -} diff --git a/src/ffn.zig b/src/ffn.zig index 936c137..ba943a8 100644 --- a/src/ffn.zig +++ b/src/ffn.zig @@ -2,66 +2,40 @@ const Self = @This(); const std = @import("std"); const Checkpoint = @import("checkpoint.zig"); -const Tensor = @import("./tensor.zig").Tensor; +const Vector = @import("vector.zig"); -allocator: std.mem.Allocator, checkpoint: Checkpoint, -input_buffer: Tensor(1), -gate_buffer: 
Tensor(1), -hidden_buffer: Tensor(1), -output_buffer: Tensor(1), - -pub fn init(allocator: std.mem.Allocator, checkpoint: Checkpoint) !Self { - const input_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.embedding_size}); - - errdefer input_buffer.deinit(); - - const gate_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.ffn_hidden_size}); - - errdefer gate_buffer.deinit(); - - const hidden_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.ffn_hidden_size}); - - errdefer hidden_buffer.deinit(); - - const output_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.embedding_size}); - - errdefer output_buffer.deinit(); +input: Vector, +gate: Vector, +hidden: Vector, +output: Vector, +pub fn createLeaky(allocator: std.mem.Allocator, checkpoint: Checkpoint) !Self { return .{ - .allocator = allocator, .checkpoint = checkpoint, - .input_buffer = input_buffer, - .gate_buffer = gate_buffer, - .hidden_buffer = hidden_buffer, - .output_buffer = output_buffer, + .input = try Vector.createLeaky(allocator, checkpoint.embedding_size), + .gate = try Vector.createLeaky(allocator, checkpoint.ffn_hidden_size), + .hidden = try Vector.createLeaky(allocator, checkpoint.ffn_hidden_size), + .output = try Vector.createLeaky(allocator, checkpoint.embedding_size), }; } -pub fn deinit(self: Self) void { - self.input_buffer.deinit(); - self.gate_buffer.deinit(); - self.hidden_buffer.deinit(); - self.output_buffer.deinit(); -} - // SwiGLU activation function: https://arxiv.org/abs/2002.05202 -pub fn forward(self: Self, layer: usize) void { +pub fn forward(self: Self, layer: usize) !void { @setFloatMode(.Optimized); - const weights = self.checkpoint.weights; - const gate_matrix = weights.ffn_gate_matrices.slice(layer); - const up_matrix = weights.ffn_up_matrices.slice(layer); - const down_matrix = weights.ffn_down_matrices.slice(layer); + const gate_weight = self.checkpoint.ffn_gate_weights[layer]; + const up_weight = self.checkpoint.ffn_up_weights[layer]; + const down_weight = self.checkpoint.ffn_down_weights[layer]; - gate_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.gate_buffer); - up_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.hidden_buffer); + try gate_weight.multiplyVector(self.input, self.gate); + try up_weight.multiplyVector(self.input, self.hidden); for (0..self.checkpoint.ffn_hidden_size) |index| { - self.hidden_buffer.values[index] *= swish(self.gate_buffer.values[index]); + self.hidden.values[index] *= swish(self.gate.values[index]); } - down_matrix.computeMatrixVectorMultiplication(self.hidden_buffer, self.output_buffer); + try down_weight.multiplyVector(self.hidden, self.output); } // Swish activation function: https://arxiv.org/abs/1710.05941 diff --git a/src/generator.zig b/src/generator.zig index 1dc62fd..66f30aa 100644 --- a/src/generator.zig +++ b/src/generator.zig @@ -7,46 +7,31 @@ const Sampler = @import("sampler.zig"); const Tokenizer = @import("tokenizer.zig"); const Transformer = @import("transformer.zig"); -allocator: std.mem.Allocator, transformer: Transformer, tokenizer: Tokenizer, sampler: Sampler, prompt_tokens: []usize, verbose: bool, -pub fn init(allocator: std.mem.Allocator, args: GeneratorArgs) !Self { - const transformer = try Transformer.init(allocator, args.model_path, args.sequence_length); - - errdefer transformer.deinit(); +pub fn createLeaky(allocator: std.mem.Allocator, args: GeneratorArgs) !Self { + const transformer = try Transformer.createLeaky( + allocator, + args.model_path, + args.sequence_length, 
+ ); const vocab_size = transformer.checkpoint.vocab_size; - const tokenizer = try Tokenizer.init(allocator, args.model_path, vocab_size); - - errdefer tokenizer.deinit(); - - const sampler = try Sampler.init(allocator, args, vocab_size); - - errdefer sampler.deinit(); - - const prompt_tokens = try tokenizer.encode(allocator, args.prompt); + const tokenizer = try Tokenizer.readLeaky(allocator, args.model_path, vocab_size); return .{ - .allocator = allocator, .transformer = transformer, .tokenizer = tokenizer, - .sampler = sampler, - .prompt_tokens = prompt_tokens, + .sampler = try Sampler.createLeaky(allocator, args, vocab_size), + .prompt_tokens = try tokenizer.encode(allocator, args.prompt), .verbose = args.verbose, }; } -pub fn deinit(self: Self) void { - self.transformer.deinit(); - self.tokenizer.deinit(); - self.sampler.deinit(); - self.allocator.free(self.prompt_tokens); -} - const bos_token = 1; // beginning of sequence const eos_token = 2; // end of sequence @@ -64,7 +49,7 @@ pub fn generate(self: *Self, writer: anytype) !void { start_time = std.time.milliTimestamp(); } - self.transformer.forward(token, position); + try self.transformer.forward(token, position); if (start_time > 0) { total_time += std.time.milliTimestamp() - start_time; @@ -74,7 +59,7 @@ pub fn generate(self: *Self, writer: anytype) !void { next_token = self.prompt_tokens[prompt_tokens_index]; prompt_tokens_index += 1; } else { - next_token = self.sampler.sample(self.transformer.output_buffer.values); + next_token = self.sampler.sample(self.transformer.output.values); } if (next_token == bos_token or next_token == eos_token) { @@ -99,16 +84,13 @@ pub fn generate(self: *Self, writer: anytype) !void { } test "generate tiny story" { - var output = std.ArrayList(u8).init(std.testing.allocator); + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); - defer output.deinit(); + defer arena.deinit(); - var arg_iterator = try std.process.argsWithAllocator(std.testing.allocator); - - defer arg_iterator.deinit(); + var output = std.ArrayList(u8).init(arena.allocator()); const args = GeneratorArgs{ - .arg_iterator = arg_iterator, .model_path = "models/tinystories_260k", .temperature = 1, .top_p = 0.9, @@ -118,9 +100,7 @@ test "generate tiny story" { .verbose = false, }; - var generator = try Self.init(std.testing.allocator, args); - - defer generator.deinit(); + var generator = try Self.createLeaky(arena.allocator(), args); try generator.generate(output.writer()); diff --git a/src/generator_args.zig b/src/generator_args.zig index b95fe05..57c93d0 100644 --- a/src/generator_args.zig +++ b/src/generator_args.zig @@ -2,7 +2,6 @@ const Self = @This(); const std = @import("std"); -arg_iterator: std.process.ArgIterator, model_path: []const u8, temperature: f32, top_p: f32, @@ -13,11 +12,9 @@ verbose: bool, const Option = enum { temperature, top_p, random_seed, sequence_length, prompt }; -pub fn init(allocator: std.mem.Allocator) !Self { +pub fn createLeaky(allocator: std.mem.Allocator) !Self { var arg_iterator = try std.process.argsWithAllocator(allocator); - errdefer arg_iterator.deinit(); - _ = arg_iterator.next().?; const model_path = arg_iterator.next() orelse try help(1); @@ -69,7 +66,6 @@ pub fn init(allocator: std.mem.Allocator) !Self { } return .{ - .arg_iterator = arg_iterator, .model_path = model_path, .temperature = @max(@min(temperature orelse 1, 1), 0), .top_p = @max(@min(top_p orelse 0.9, 1), 0), @@ -80,10 +76,6 @@ pub fn init(allocator: std.mem.Allocator) !Self { }; } -pub fn deinit(self: *Self) void { - 
self.arg_iterator.deinit(); -} - fn help(exit_status: u8) !noreturn { const console = if (exit_status == 0) std.io.getStdOut().writer() diff --git a/src/generator_main.zig b/src/generator_main.zig index 6b420ad..2bf605d 100644 --- a/src/generator_main.zig +++ b/src/generator_main.zig @@ -3,15 +3,13 @@ const Generator = @import("generator.zig"); const GeneratorArgs = @import("generator_args.zig"); pub fn main() !void { - const allocator = std.heap.page_allocator; + var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); - var args = try GeneratorArgs.init(allocator); + defer arena.deinit(); - defer args.deinit(); + const args = try GeneratorArgs.createLeaky(arena.allocator()); - var generator = try Generator.init(allocator, args); - - defer generator.deinit(); + var generator = try Generator.createLeaky(arena.allocator(), args); try generator.generate(std.io.getStdOut().writer()); } diff --git a/src/matrix.zig b/src/matrix.zig new file mode 100644 index 0000000..ad372b5 --- /dev/null +++ b/src/matrix.zig @@ -0,0 +1,39 @@ +const Self = @This(); + +const std = @import("std"); +const Vector = @import("vector.zig"); + +rows: []const Vector, + +pub fn readLeaky( + allocator: std.mem.Allocator, + file: std.fs.File, + m_rows: usize, + n_cols: usize, +) !Self { + return .{ .rows = try Vector.readMultipleLeaky(allocator, file, m_rows, n_cols) }; +} + +pub fn readMultipleLeaky( + allocator: std.mem.Allocator, + file: std.fs.File, + n_matrices: usize, + m_rows: usize, + n_cols: usize, +) ![]Self { + const matrices = try allocator.alloc(Self, n_matrices); + + for (matrices) |*matrix| { + matrix.* = try readLeaky(allocator, file, m_rows, n_cols); + } + + return matrices; +} + +pub fn multiplyVector(self: Self, input: Vector, output: Vector) !void { + std.debug.assert(self.rows.len == output.values.len); + + for (output.values, 0..) 
|*value, index| { + value.* = try self.rows[index].computeScalarProduct(input); + } +} diff --git a/src/quantized_tensor.zig b/src/quantized_tensor.zig deleted file mode 100644 index b41bd64..0000000 --- a/src/quantized_tensor.zig +++ /dev/null @@ -1,117 +0,0 @@ -const std = @import("std"); - -pub fn QuantizedTensor(comptime n_dims: comptime_int) type { - comptime if (n_dims < 1) @compileError("n_dims < 1"); - - return struct { - const Self = @This(); - - allocator: ?std.mem.Allocator, - sub_dims: [n_dims - 1]usize, - group_size: usize, - values: []i8, - scaling_factors: []f32, - - pub fn init(allocator: std.mem.Allocator, dims: [n_dims]usize, group_size: usize) !Self { - const n_values = @reduce(.Mul, @as(@Vector(n_dims, usize), dims)); - - if (n_values % group_size != 0) { - return error.InvalidGroupSize; - } - - const n_groups = n_values / group_size; - - return .{ - .allocator = allocator, - .sub_dims = dims[1..].*, - .group_size = group_size, - .values = try allocator.alloc(i8, n_values), - .scaling_factors = try allocator.alloc(f32, n_groups), - }; - } - - pub fn deinit(self: Self) void { - if (self.allocator) |allocator| { - allocator.free(self.values); - allocator.free(self.scaling_factors); - } - } - - pub fn slice(self: Self, index: usize) !QuantizedTensor(n_dims - 1) { - comptime if (n_dims < 2) @compileError("n_dims < 2"); - - const n_sub_values = @reduce(.Mul, @as(@Vector(n_dims - 1, usize), self.sub_dims)); - - if (n_sub_values % self.group_size != 0) { - return error.InvalidGroupSize; - } - - const n_sub_groups = n_sub_values / self.group_size; - - return .{ - .allocator = null, - .sub_dims = self.sub_dims[1..].*, - .group_size = self.group_size, - .values = self.values[index * n_sub_values ..][0..n_sub_values], - .scaling_factors = self.scaling_factors[index * n_sub_groups ..][0..n_sub_groups], - }; - } - - pub fn computeMatrixVectorMultiplication( - self: Self, - input: anytype, - output: anytype, - ) !void { - for (output.values, 0..) 
|*value, index| { - value.* = try (try self.slice(index)).computeScalarProduct(&input); - } - } - - fn computeScalarProduct(self: Self, other: anytype) !f32 { - // https://github.com/karpathy/llama2.c/pull/312#issuecomment-1684140683 - if (self.group_size == 32) { - return _computeScalarProduct(32, self, other); - } - - if (self.group_size == 16) { - return _computeScalarProduct(16, self, other); - } - - if (self.group_size == 8) { - return _computeScalarProduct(8, self, other); - } - - if (self.group_size == 4) { - return _computeScalarProduct(4, self, other); - } - - return error.UnsupportedGroupSize; - } - }; -} - -fn _computeScalarProduct( - comptime vector_size: comptime_int, - input_1: anytype, - input_2: anytype, -) f32 { - @setFloatMode(.Optimized); - - std.debug.assert(input_1.values.len == input_2.values.len); - std.debug.assert(input_1.scaling_factors.len == input_2.scaling_factors.len); - - var output_value: f32 = 0; - var index: usize = 0; - - while (index < input_1.values.len) : (index += vector_size) { - const values: @Vector(vector_size, i32) = - @as(@Vector(vector_size, i8), input_1.values[index..][0..vector_size].*) * - @as(@Vector(vector_size, i8), input_2.values[index..][0..vector_size].*); - - output_value += @as(f32, @floatFromInt(@reduce(.Add, values))) * - input_1.scaling_factors[index / vector_size] * - input_2.scaling_factors[index / vector_size]; - } - - return output_value; -} diff --git a/src/sampler.zig b/src/sampler.zig index 6001921..c9e74fa 100644 --- a/src/sampler.zig +++ b/src/sampler.zig @@ -4,29 +4,20 @@ const builtin = @import("builtin"); const std = @import("std"); const math = @import("math.zig"); -allocator: std.mem.Allocator, -probability_index_pairs_buffer: []ProbabilityIndexPair, +probability_index_pairs: []ProbabilityIndexPair, +rng_state: u64, temperature: f32, top_p: f32, -rng_state: u64, - -pub fn init(allocator: std.mem.Allocator, args: anytype, vocab_size: usize) !Self { - const probability_index_pairs_buffer = - try allocator.alloc(ProbabilityIndexPair, vocab_size); +pub fn createLeaky(allocator: std.mem.Allocator, args: anytype, vocab_size: usize) !Self { return .{ - .allocator = allocator, - .probability_index_pairs_buffer = probability_index_pairs_buffer, + .probability_index_pairs = try allocator.alloc(ProbabilityIndexPair, vocab_size), + .rng_state = args.random_seed, .temperature = args.temperature, .top_p = args.top_p, - .rng_state = args.random_seed, }; } -pub fn deinit(self: Self) void { - self.allocator.free(self.probability_index_pairs_buffer); -} - pub fn sample(self: *Self, probability_distribution: []f32) usize { if (self.temperature == 0) { return math.argmax(probability_distribution); @@ -42,11 +33,7 @@ pub fn sample(self: *Self, probability_distribution: []f32) usize { return self.sampleMultinomial(probability_distribution); } - return self.sampleNucleus( - probability_distribution, - self.top_p, - self.probability_index_pairs_buffer, - ); + return self.sampleNucleus(probability_distribution); } const tolerance: comptime_float = std.math.sqrt(std.math.floatEps(f32)); @@ -82,31 +69,26 @@ fn sampleMultinomial(self: *Self, probability_distribution: []const f32) usize { const ProbabilityIndexPair = struct { probability: f32, index: usize }; // Nucleus sampling: https://arxiv.org/abs/1904.09751 -fn sampleNucleus( - self: *Self, - probability_distribution: []const f32, - top_p: f32, - probability_index_pairs_buffer: []ProbabilityIndexPair, -) usize { +fn sampleNucleus(self: *Self, probability_distribution: []const f32) usize { 
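    // Outline of the steps below: the "nucleus" is the smallest set of
    // highest-probability tokens whose cumulative probability exceeds top_p.
    // Probabilities below (1 - top_p) / (n - 1) can never belong to that set,
    // so they are filtered out up front; the surviving probability/index pairs
    // are then sorted, truncated once their cumulative probability exceeds
    // top_p, and the next token is drawn from the remaining candidates.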
@setFloatMode(.Optimized); std.debug.assert(probability_distribution.len > 0); // https://github.com/karpathy/llama2.c/commit/d421a95b2bfe593b2d9e5c147f3efc8d128afe0e var probability_threshold: f32 = - (1 - top_p) / @as(f32, @floatFromInt(probability_distribution.len - 1)); + (1 - self.top_p) / @as(f32, @floatFromInt(probability_distribution.len - 1)); var n_probability_index_pairs: usize = 0; for (probability_distribution, 0..) |probability, index| { - if (probability_threshold < probability) { - probability_index_pairs_buffer[n_probability_index_pairs].probability = probability; - probability_index_pairs_buffer[n_probability_index_pairs].index = index; + if (probability >= probability_threshold) { + self.probability_index_pairs[n_probability_index_pairs].probability = probability; + self.probability_index_pairs[n_probability_index_pairs].index = index; n_probability_index_pairs += 1; } } - var probability_index_pairs = probability_index_pairs_buffer[0..n_probability_index_pairs]; + var probability_index_pairs = self.probability_index_pairs[0..n_probability_index_pairs]; std.sort.block(ProbabilityIndexPair, probability_index_pairs, {}, lessThan); @@ -115,7 +97,7 @@ fn sampleNucleus( for (probability_index_pairs, 0..) |probability_index_pair, index| { cumulative_probability += probability_index_pair.probability; - if (cumulative_probability > top_p) { + if (cumulative_probability > self.top_p) { probability_index_pairs = probability_index_pairs[0 .. index + 1]; break; diff --git a/src/simd.zig b/src/simd.zig index 37c637f..539e4ff 100644 --- a/src/simd.zig +++ b/src/simd.zig @@ -2,96 +2,104 @@ const std = @import("std"); // Pre-normalization using RMSNorm: https://arxiv.org/abs/1910.07467 pub fn computeRMSNorm( - comptime TValue: type, - comptime vector_size: comptime_int, - input_values: []const TValue, - weight_values: []const TValue, - output_values: []TValue, -) void { + input_values: []const f32, + weight_values: []const f32, + output_values: []f32, +) !void { @setFloatMode(.Optimized); - var rms_scaling_factor = computeScalarProduct(TValue, vector_size, input_values, input_values); + var scaling_factor = try computeScalarProduct(input_values, input_values); - rms_scaling_factor /= @floatFromInt(input_values.len); - rms_scaling_factor += 1e-5; - rms_scaling_factor = 1 / std.math.sqrt(rms_scaling_factor); + scaling_factor /= @floatFromInt(input_values.len); + scaling_factor += 1e-5; + scaling_factor = 1 / std.math.sqrt(scaling_factor); - computeVectorMultiplication( - TValue, - vector_size, - rms_scaling_factor, - input_values, - weight_values, - output_values, - ); + try computeVectorMultiplication(scaling_factor, input_values, weight_values, output_values); } -pub fn computeScalarProduct( - comptime TValue: type, - comptime vector_size: comptime_int, - values_1: []const TValue, - values_2: []const TValue, -) f32 { +pub fn computeScalarProduct(input_values_1: []const f32, input_values_2: []const f32) !f32 { @setFloatMode(.Optimized); - std.debug.assert(values_1.len == values_2.len); - std.debug.assert(values_1.len % vector_size == 0); + std.debug.assert(input_values_1.len == input_values_2.len); + + comptime var vector_len = std.atomic.cache_line / @sizeOf(f32); + + inline while (vector_len >= 4) : (vector_len /= 2) { + if (input_values_1.len % vector_len == 0) { + var output_values: @Vector(vector_len, f32) = @splat(0); + var index: usize = 0; - var output_values: @Vector(vector_size, f32) = @splat(0.0); - var index: usize = 0; + while (index < input_values_1.len) : (index += 
vector_len) { + output_values += + @as(@Vector(vector_len, f32), input_values_1[index..][0..vector_len].*) * + @as(@Vector(vector_len, f32), input_values_2[index..][0..vector_len].*); + } - while (index < values_1.len) : (index += vector_size) { - output_values += - @as(@Vector(vector_size, f32), values_1[index..][0..vector_size].*) * - @as(@Vector(vector_size, f32), values_2[index..][0..vector_size].*); + return @reduce(.Add, output_values); + } } - return @reduce(.Add, output_values); + return error.UnsupportedVectorSize; } pub fn computeVectorAddition( - comptime TValue: type, - comptime vector_size: comptime_int, - input_values_1: []const TValue, - input_values_2: []const TValue, - output_values: []TValue, -) void { + input_values_1: []const f32, + input_values_2: []const f32, + output_values: []f32, +) !void { @setFloatMode(.Optimized); std.debug.assert(input_values_1.len == input_values_2.len); - std.debug.assert(input_values_1.len % vector_size == 0); + std.debug.assert(input_values_1.len == output_values.len); + + comptime var vector_len = std.atomic.cache_line / @sizeOf(f32); - var index: usize = 0; + inline while (vector_len >= 4) : (vector_len /= 2) { + if (input_values_1.len % vector_len == 0) { + var index: usize = 0; - while (index < input_values_1.len) : (index += vector_size) { - output_values[index..][0..vector_size].* = - @as(@Vector(vector_size, TValue), input_values_1[index..][0..vector_size].*) + - @as(@Vector(vector_size, TValue), input_values_2[index..][0..vector_size].*); + while (index < input_values_1.len) : (index += vector_len) { + output_values[index..][0..vector_len].* = + @as(@Vector(vector_len, f32), input_values_1[index..][0..vector_len].*) + + @as(@Vector(vector_len, f32), input_values_2[index..][0..vector_len].*); + } + + return; + } } + + return error.UnsupportedVectorSize; } pub fn computeVectorMultiplication( - comptime TValue: type, - comptime vector_size: comptime_int, scaling_factor: f32, - input_values_1: []const TValue, - input_values_2: []const TValue, - output_values: []TValue, -) void { + input_values_1: []const f32, + input_values_2: []const f32, + output_values: []f32, +) !void { @setFloatMode(.Optimized); std.debug.assert(input_values_1.len == input_values_2.len); std.debug.assert(input_values_1.len == output_values.len); - std.debug.assert(input_values_1.len % vector_size == 0); - const scaling_factors: @Vector(vector_size, f32) = @splat(scaling_factor); + comptime var vector_len = std.atomic.cache_line / @sizeOf(f32); + + inline while (vector_len >= 4) : (vector_len /= 2) { + if (input_values_1.len % vector_len == 0) { + const scaling_factors: @Vector(vector_len, f32) = @splat(scaling_factor); - var index: usize = 0; + var index: usize = 0; - while (index < input_values_1.len) : (index += vector_size) { - output_values[index..][0..vector_size].* = - scaling_factors * - @as(@Vector(vector_size, TValue), input_values_1[index..][0..vector_size].*) * - @as(@Vector(vector_size, TValue), input_values_2[index..][0..vector_size].*); + while (index < input_values_1.len) : (index += vector_len) { + output_values[index..][0..vector_len].* = + scaling_factors * + @as(@Vector(vector_len, f32), input_values_1[index..][0..vector_len].*) * + @as(@Vector(vector_len, f32), input_values_2[index..][0..vector_len].*); + } + + return; + } } + + return error.UnsupportedVectorSize; } diff --git a/src/tensor.zig b/src/tensor.zig deleted file mode 100644 index 2874bec..0000000 --- a/src/tensor.zig +++ /dev/null @@ -1,93 +0,0 @@ -const std = @import("std"); -const 
simd = @import("simd.zig"); - -pub fn Tensor(comptime n_dims: comptime_int) type { - comptime if (n_dims < 1) @compileError("n_dims < 1"); - - return struct { - const Self = @This(); - - allocator: ?std.mem.Allocator, - sub_dims: [n_dims - 1]usize, - values: []f32, - - pub fn init(allocator: std.mem.Allocator, dims: [n_dims]usize) !Self { - const n_values = @reduce(.Mul, @as(@Vector(n_dims, usize), dims)); - - return .{ - .allocator = allocator, - .sub_dims = dims[1..].*, - .values = try allocator.alloc(f32, n_values), - }; - } - - pub fn deinit(self: Self) void { - if (self.allocator) |allocator| { - allocator.free(self.values); - } - } - - pub fn read(self: Self, file: std.fs.File) !void { - const values: [*]u8 = @ptrCast(self.values); - - try file.reader().readNoEof(values[0 .. self.values.len * @sizeOf(f32)]); - } - - pub fn write(self: Self, file: std.fs.File) !void { - const values: [*]u8 = @ptrCast(self.values); - - try file.writer().writeAll(values[0 .. self.values.len * @sizeOf(f32)]); - } - - pub fn slice(self: Self, index: usize) Tensor(n_dims - 1) { - comptime if (n_dims < 2) @compileError("n_dims < 2"); - - const n_sub_values = @reduce(.Mul, @as(@Vector(n_dims - 1, usize), self.sub_dims)); - - return .{ - .allocator = null, - .sub_dims = self.sub_dims[1..].*, - .values = self.values[index * n_sub_values ..][0..n_sub_values], - }; - } - - pub fn computeMatrixVectorMultiplication(self: Self, input: anytype, output: anytype) void { - for (output.values, 0..) |*value, index| { - value.* = self.slice(index).computeScalarProduct(input); - } - } - - pub fn computeRMSNorm(self: Self, weight: anytype, output: anytype) void { - if (self.values.len % 32 == 0) - simd.computeRMSNorm(f32, 32, self.values, weight.values, output.values) - else if (self.values.len % 16 == 0) - simd.computeRMSNorm(f32, 16, self.values, weight.values, output.values) - else if (self.values.len % 8 == 0) - simd.computeRMSNorm(f32, 8, self.values, weight.values, output.values) - else - simd.computeRMSNorm(f32, 4, self.values, weight.values, output.values); - } - - pub fn computeScalarProduct(self: Self, other: anytype) f32 { - return if (self.values.len % 32 == 0) - simd.computeScalarProduct(f32, 32, self.values, other.values) - else if (self.values.len % 16 == 0) - simd.computeScalarProduct(f32, 16, self.values, other.values) - else if (self.values.len % 8 == 0) - simd.computeScalarProduct(f32, 8, self.values, other.values) - else - simd.computeScalarProduct(f32, 4, self.values, other.values); - } - - pub fn computeVectorAddition(self: Self, other: anytype) void { - if (self.values.len % 32 == 0) - simd.computeVectorAddition(f32, 32, self.values, other.values, self.values) - else if (self.values.len % 16 == 0) - simd.computeVectorAddition(f32, 16, self.values, other.values, self.values) - else if (self.values.len % 8 == 0) - simd.computeVectorAddition(f32, 8, self.values, other.values, self.values) - else - simd.computeVectorAddition(f32, 4, self.values, other.values, self.values); - } - }; -} diff --git a/src/tokenizer.zig b/src/tokenizer.zig index 055161a..735f483 100644 --- a/src/tokenizer.zig +++ b/src/tokenizer.zig @@ -2,25 +2,12 @@ const Self = @This(); const std = @import("std"); -allocator: std.mem.Allocator, max_word_length: usize, vocab: []const []const u8, word_scores: []const f32, sorted_vocab: []const VocabEntry, -pub fn init(allocator: std.mem.Allocator, model_path: []const u8, vocab_size: usize) !Self { - var vocab = try allocator.alloc([]u8, vocab_size); - - errdefer for (vocab) |word| { - 
allocator.free(word); - }; - - errdefer allocator.free(vocab); - - var word_scores = try allocator.alloc(f32, vocab_size); - - errdefer allocator.free(word_scores); - +pub fn readLeaky(allocator: std.mem.Allocator, model_path: []const u8, vocab_size: usize) !Self { const path = try std.fs.path.join(allocator, &[_][]const u8{ model_path, "tokenizer.bin" }); defer allocator.free(path); @@ -29,41 +16,30 @@ pub fn init(allocator: std.mem.Allocator, model_path: []const u8, vocab_size: us defer file.close(); - const reader = file.reader(); - const max_word_length = try reader.readIntLittle(u32); + const max_word_length = try file.reader().readIntLittle(u32); + + var vocab = try allocator.alloc([]u8, vocab_size); + var word_scores = try allocator.alloc(f32, vocab_size); - for (word_scores, 0..) |*word_score, word_index| { - word_score.* = @bitCast(try reader.readIntLittle(u32)); + for (word_scores, 0..) |*word_score, index| { + word_score.* = @bitCast(try file.reader().readIntLittle(u32)); - const word_length = try reader.readIntLittle(u32); + const word_length = try file.reader().readIntLittle(u32); const word = try allocator.alloc(u8, word_length); - try reader.readNoEof(word); + try file.reader().readNoEof(word); - vocab[word_index] = word; + vocab[index] = word; } - const sorted_vocab = try sortVocab(allocator, vocab); - return .{ - .allocator = allocator, .max_word_length = max_word_length, .vocab = vocab, .word_scores = word_scores, - .sorted_vocab = sorted_vocab, + .sorted_vocab = try sortVocab(allocator, vocab), }; } -pub fn deinit(self: Self) void { - for (self.vocab) |word| { - self.allocator.free(word); - } - - self.allocator.free(self.vocab); - self.allocator.free(self.word_scores); - self.allocator.free(self.sorted_vocab); -} - pub fn encode(self: Self, allocator: std.mem.Allocator, text: []const u8) ![]usize { var double_word_buffer = try allocator.alloc(u8, self.max_word_length * 2); @@ -100,10 +76,10 @@ fn encodeCodepoints(self: Self, allocator: std.mem.Allocator, text: []const u8) var text_view = try std.unicode.Utf8View.init(text); var text_iterator = text_view.iterator(); - var token_index: usize = 0; + var index: usize = 0; - while (text_iterator.nextCodepointSlice()) |codepoints| : (token_index += 1) { - if (token_index == 0) { + while (text_iterator.nextCodepointSlice()) |codepoints| : (index += 1) { + if (index == 0) { // https://github.com/karpathy/llama2.c/blob/7ac65cb2c2b169050747be92011b7bebdd1b4544/run.c#L483 try tokens.append(self.lookupToken(" ") orelse return error.BadVocab); } @@ -127,12 +103,12 @@ fn mergeBestWordPair(self: Self, tokens: []usize, double_word_buffer: []u8) bool } var best_token: ?usize = null; - var best_token_index: ?usize = null; + var best_index: ?usize = null; var best_word_score = -std.math.floatMax(f32); - for (0..tokens.len - 1) |token_index| { - const word1 = self.vocab[tokens[token_index]]; - const word2 = self.vocab[tokens[token_index + 1]]; + for (0..tokens.len - 1) |index| { + const word1 = self.vocab[tokens[index]]; + const word2 = self.vocab[tokens[index + 1]]; @memcpy(double_word_buffer[0..word1.len], word1); @memcpy(double_word_buffer[word1.len .. 
word1.len + word2.len], word2); @@ -144,19 +120,19 @@ fn mergeBestWordPair(self: Self, tokens: []usize, double_word_buffer: []u8) bool if (word_score > best_word_score) { best_token = token; - best_token_index = token_index; + best_index = index; best_word_score = word_score; } } - if (best_token_index) |token_index| { + if (best_index) |index| { std.mem.copyForwards( usize, - tokens[token_index + 1 .. tokens.len - 1], - tokens[token_index + 2 ..], + tokens[index + 1 .. tokens.len - 1], + tokens[index + 2 ..], ); - tokens[token_index] = best_token.?; + tokens[index] = best_token.?; return true; } @@ -217,118 +193,110 @@ const tinystories_260k_path = "models/tinystories_260k"; // https://github.com/karpathy/llama2.c/pull/226 // https://github.com/karpathy/llama2.c/pull/297 test "encode utf-8" { - const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000); + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); - defer tokenizer.deinit(); + defer arena.deinit(); + const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000); const expected = [_]usize{ 365, 1691, 1018, 3963, 669, 29871, 31409, 30607, 30437, 30564 }; - const actual = try tokenizer.encode(std.testing.allocator, "Lets try ö & 株式会社"); - - defer std.testing.allocator.free(actual); + const actual = try tokenizer.encode(arena.allocator(), "Lets try ö & 株式会社"); try std.testing.expectEqualSlices(usize, expected[0..], actual); } test "encode empty string" { - const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000); + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); - defer tokenizer.deinit(); + defer arena.deinit(); + const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000); const expected = [_]usize{}; - const actual = try tokenizer.encode(std.testing.allocator, ""); - - defer std.testing.allocator.free(actual); + const actual = try tokenizer.encode(arena.allocator(), ""); try std.testing.expectEqualSlices(usize, expected[0..], actual); } test "encode unknown codepoint" { - const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000); + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); - defer tokenizer.deinit(); + defer arena.deinit(); + const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000); const expected = [_]usize{ 29871, 243, 149, 145, 154, 243, 150, 147, 144 }; - const actual = try tokenizer.encode(std.testing.allocator, "𒎗𓐍"); - - defer std.testing.allocator.free(actual); + const actual = try tokenizer.encode(arena.allocator(), "𒎗𓐍"); try std.testing.expectEqualSlices(usize, expected[0..], actual); } test "encode single chars" { - const tokenizer = try Self.init(std.testing.allocator, tinystories_260k_path, 512); + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); - defer tokenizer.deinit(); + defer arena.deinit(); + const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_260k_path, 512); const expected = [_]usize{ 261, 430, 429, 418, 411, 431, 428, 415 }; - const actual = try tokenizer.encode(std.testing.allocator, "abcdefgh"); - - defer std.testing.allocator.free(actual); + const actual = try tokenizer.encode(arena.allocator(), "abcdefgh"); try std.testing.expectEqualSlices(usize, expected[0..], actual); } // https://github.com/facebookresearch/llama/blob/ea9f33d6d3ea8ed7d560d270986407fd6c2e52b7/example_text_completion.py test "meta encoding example 1" { - const tokenizer = try 
Self.init(std.testing.allocator, tinystories_15m_path, 32000); + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); - defer tokenizer.deinit(); + defer arena.deinit(); + const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000); const expected = [_]usize{ 306, 4658, 278, 6593, 310, 2834, 338 }; - const actual = try tokenizer.encode(std.testing.allocator, "I believe the meaning of life is"); - - defer std.testing.allocator.free(actual); + const actual = try tokenizer.encode(arena.allocator(), "I believe the meaning of life is"); try std.testing.expectEqualSlices(usize, expected[0..], actual); } test "meta encoding example 2" { - const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000); + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); - defer tokenizer.deinit(); + defer arena.deinit(); + const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000); const expected = [_]usize{ 3439, 17632, 1925, 29892, 278, 6368, 310, 14215, 537, 5922, 393, 29871 }; const actual = try tokenizer.encode( - std.testing.allocator, + arena.allocator(), "Simply put, the theory of relativity states that ", ); - defer std.testing.allocator.free(actual); - try std.testing.expectEqualSlices(usize, expected[0..], actual); } test "meta encoding example 3" { - const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000); + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); - defer tokenizer.deinit(); + defer arena.deinit(); + const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000); const expected = [_]usize{ 319, 11473, 2643, 378, 629, 271, 18099, 278, 3815, 373, 278, 6826, 29901, 13, 13, 4706, 6324, 14332, 29892, 13, 13, 4706, 306, 925, 29871 }; const actual = try tokenizer.encode( - std.testing.allocator, + arena.allocator(), "A brief message congratulating the team on the launch:\n\n Hi everyone,\n\n I just ", ); - defer std.testing.allocator.free(actual); - try std.testing.expectEqualSlices(usize, expected[0..], actual); } test "meta encoding example 4" { - const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000); + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); - defer tokenizer.deinit(); + defer arena.deinit(); + const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000); const expected = [_]usize{ 4103, 9632, 4223, 304, 5176, 29901, 13, 13, 4706, 7205, 4932, 357, 1149, 301, 449, 276, 316, 2778, 13, 4706, 1236, 407, 837, 524, 1149, 6042, 354, 772, 440, 29878, 1318, 13, 4706, 715, 1878, 330, 3055, 1725, 1149, 330, 3055, 1725, 4639, 28754, 13, 4706, 923, 968, 1149 }; const actual = try tokenizer.encode( - std.testing.allocator, + arena.allocator(), "Translate English to French:\n\n sea otter => loutre de mer\n peppermint => menthe poivrée\n plush girafe => girafe peluche\n cheese =>", ); - defer std.testing.allocator.free(actual); - try std.testing.expectEqualSlices(usize, expected[0..], actual); } diff --git a/src/transformer.zig b/src/transformer.zig index e9d7efb..2d2d64b 100644 --- a/src/transformer.zig +++ b/src/transformer.zig @@ -4,88 +4,54 @@ const std = @import("std"); const Attention = @import("attention.zig"); const Checkpoint = @import("checkpoint.zig"); const FFN = @import("ffn.zig"); -const Tensor = @import("./tensor.zig").Tensor; +const Vector = @import("vector.zig"); -allocator: std.mem.Allocator, checkpoint: Checkpoint, sequence_length: usize, attention: 
Attention, ffn: FFN, -hidden_buffer: Tensor(1), -output_buffer: Tensor(1), +hidden: Vector, +output: Vector, -pub fn init( +pub fn createLeaky( allocator: std.mem.Allocator, model_path: []const u8, custom_sequence_length: usize, ) !Self { - const checkpoint = try Checkpoint.init(allocator, model_path); - - errdefer checkpoint.deinit(); + const checkpoint = try Checkpoint.readLeaky(allocator, model_path); const sequence_length = if (custom_sequence_length == 0) checkpoint.max_sequence_length else - custom_sequence_length; - - const attention = try Attention.init(allocator, checkpoint, sequence_length); - - errdefer attention.deinit(); - - const ffn = try FFN.init(allocator, checkpoint); - - errdefer ffn.deinit(); - - const hidden_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.embedding_size}); - - errdefer hidden_buffer.deinit(); - - const output_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.vocab_size}); - - errdefer output_buffer.deinit(); + @min(custom_sequence_length, checkpoint.max_sequence_length); return .{ - .allocator = allocator, .checkpoint = checkpoint, .sequence_length = sequence_length, - .attention = attention, - .ffn = ffn, - .hidden_buffer = hidden_buffer, - .output_buffer = output_buffer, + .attention = try Attention.createLeaky(allocator, checkpoint, sequence_length), + .ffn = try FFN.createLeaky(allocator, checkpoint), + .hidden = try Vector.createLeaky(allocator, checkpoint.embedding_size), + .output = try Vector.createLeaky(allocator, checkpoint.vocab_size), }; } -pub fn deinit(self: Self) void { - self.checkpoint.deinit(); - self.attention.deinit(); - self.ffn.deinit(); - self.hidden_buffer.deinit(); - self.output_buffer.deinit(); -} - -pub fn forward(self: Self, token: usize, position: usize) void { - const weights = self.checkpoint.weights; +pub fn forward(self: Self, token: usize, position: usize) !void { + const token_embedding_weight = self.checkpoint.token_embedding_weights[token]; - @memcpy(self.hidden_buffer.values, weights.token_embedding_vectors.slice(token).values); + @memcpy(self.hidden.values, token_embedding_weight.values); for (0..self.checkpoint.n_layers) |layer| { - self.hidden_buffer.computeRMSNorm( - weights.attention_norm_vectors.slice(layer), - self.attention.input_buffer, - ); - - self.attention.forward(layer, position); - self.hidden_buffer.computeVectorAddition(self.attention.output_buffer); - - self.hidden_buffer.computeRMSNorm( - weights.ffn_norm_vectors.slice(layer), - self.ffn.input_buffer, - ); - - self.ffn.forward(layer); - self.hidden_buffer.computeVectorAddition(self.ffn.output_buffer); + const attention_norm_weight = self.checkpoint.attention_norm_weights[layer]; + const ffn_norm_weight = self.checkpoint.ffn_norm_weights[layer]; + + try self.hidden.computeRMSNorm(attention_norm_weight, self.attention.input); + try self.attention.forward(layer, position); + try self.hidden.addVector(self.attention.output); + try self.hidden.computeRMSNorm(ffn_norm_weight, self.ffn.input); + try self.ffn.forward(layer); + try self.hidden.addVector(self.ffn.output); } - self.hidden_buffer.computeRMSNorm(weights.output_norm_vector, self.hidden_buffer); - weights.output_matrix.computeMatrixVectorMultiplication(self.hidden_buffer, self.output_buffer); + try self.hidden.computeRMSNorm(self.checkpoint.output_norm_weight, self.hidden); + try self.checkpoint.output_weight.multiplyVector(self.hidden, self.output); } diff --git a/src/vector.zig b/src/vector.zig new file mode 100644 index 0000000..3ce2a7d --- /dev/null +++ b/src/vector.zig 
@@ -0,0 +1,60 @@ +const Self = @This(); + +const std = @import("std"); +const simd = @import("simd.zig"); + +values: []f32, + +pub fn createLeaky(allocator: std.mem.Allocator, n_values: usize) !Self { + return .{ .values = try allocator.alignedAlloc(f32, std.atomic.cache_line, n_values) }; +} + +pub fn createMultipleLeaky( + allocator: std.mem.Allocator, + n_vectors: usize, + n_values: usize, +) ![]Self { + const vectors = try allocator.alloc(Self, n_vectors); + + for (vectors) |*vector| { + vector.* = try createLeaky(allocator, n_values); + } + + return vectors; +} + +pub fn readLeaky(allocator: std.mem.Allocator, file: std.fs.File, n_values: usize) !Self { + const vector = try createLeaky(allocator, n_values); + const bytes: [*]u8 = @ptrCast(vector.values); + + try file.reader().readNoEof(bytes[0 .. vector.values.len * @sizeOf(f32)]); + + return vector; +} + +pub fn readMultipleLeaky( + allocator: std.mem.Allocator, + file: std.fs.File, + n_vectors: usize, + n_values: usize, +) ![]Self { + const vectors = try allocator.alloc(Self, n_vectors); + + for (vectors) |*vector| { + vector.* = try readLeaky(allocator, file, n_values); + } + + return vectors; +} + +pub fn addVector(self: Self, other: Self) !void { + try simd.computeVectorAddition(self.values, other.values, self.values); +} + +pub fn computeRMSNorm(self: Self, weight: Self, output: Self) !void { + try simd.computeRMSNorm(self.values, weight.values, output.values); +} + +pub fn computeScalarProduct(self: Self, other: Self) !f32 { + return simd.computeScalarProduct(self.values, other.values); +}
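
For orientation, here is a minimal usage sketch of the `Vector` API added above. It is not part of the diff: the file name and the `main` entry point are hypothetical, and it assumes the sketch sits in `src/` next to `vector.zig`. It illustrates the pattern the `*Leaky` constructors are designed for, where every buffer comes from an arena and a single `arena.deinit()` releases everything, which is why the former `deinit` methods could be removed.

```zig
// usage_sketch.zig (hypothetical): place next to src/vector.zig to compile.
const std = @import("std");
const Vector = @import("vector.zig");

pub fn main() !void {
    // The *Leaky constructors never free; an arena reclaims everything at once.
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();

    const allocator = arena.allocator();

    // A length of 16 is divisible by every vector width the simd helpers try
    // (std.atomic.cache_line / @sizeOf(f32), halved down to 4), so none of the
    // calls below returns error.UnsupportedVectorSize.
    const input = try Vector.createLeaky(allocator, 16);
    const weight = try Vector.createLeaky(allocator, 16);
    const hidden = try Vector.createLeaky(allocator, 16);

    @memset(input.values, 2);
    @memset(weight.values, 1);

    // RMSNorm into hidden, then a residual add, mirroring the per-layer
    // sequence in transformer.zig's forward pass.
    try input.computeRMSNorm(weight, hidden);
    try hidden.addVector(input);

    std.debug.print("scalar product: {d}\n", .{try hidden.computeScalarProduct(input)});
}
```

The tokenizer tests above use the same pattern, wrapping `std.testing.allocator` in a `std.heap.ArenaAllocator` so that the leaky allocations are reclaimed when the arena is deinitialized.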