Implemented Arena Allocator, dropped legacy file format, aligned vectors to cache line (no performance benefit observed)
clebert committed Oct 22, 2023
1 parent d5be75c commit af6c25c
Showing 21 changed files with 511 additions and 1,094 deletions.
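For context on the "Arena Allocator" and cache-line alignment mentioned in the commit message, the snippet below is a minimal sketch of the standard `std.heap.ArenaAllocator` pattern in Zig. It is not code from this commit; the buffer sizes and the 64-byte alignment value are assumptions chosen only for illustration.

```zig
const std = @import("std");

pub fn main() !void {
    // An arena owns every allocation made through it and frees them all in
    // a single deinit() call, so helpers that allocate through it can skip
    // individual frees and errdefer cleanup.
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();

    const allocator = arena.allocator();

    // Plain allocation through the arena (size is an assumption).
    const scores = try allocator.alloc(f32, 2048);

    // Allocation aligned to a 64-byte cache line (size is an assumption).
    const values = try allocator.alignedAlloc(f32, 64, 4096);

    std.debug.print("scores: {d} floats, values: {d} floats\n", .{ scores.len, values.len });
}
```

Because the arena releases everything at once, per-object `deinit` functions become unnecessary, which appears to be what the `createLeaky` naming introduced in this commit reflects.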
43 changes: 38 additions & 5 deletions README.md
@@ -8,7 +8,7 @@ This project is a port of Andrej Karpathy's [llama2.c](https://github.com/karpat

## Usage

Build and run the `llama2-generator` for text generation:
Build and run `llama2-generator`:

```sh
zig build -Doptimize=ReleaseFast
@@ -23,10 +23,10 @@ Lily wanted to play with the ball, but it was too high up in the sky. She tried
Lily found a stick and tried to hit the ball. But the stick was too short. She tried again and again, but she couldn't reach it. She felt sad.
Suddenly, a kind man came by and saw Lily. He asked her what was wrong. Lily told him about the ball. The man smiled and said, "I have a useful idea!" He took out a long stick and used it to knock the ball down. Lily was so happy! She thanked the man and they played together in the sunshine.
achieved: 726.974 tok/s
achieved: 719.870 tok/s
```

## Run Llama 2 from Hugging Face
## Run Llama 2 7B from Hugging Face

Install `git-lfs` and clone the [Llama 2 7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) model from Hugging Face:

@@ -43,7 +43,7 @@ pip3 install -r requirements.txt
python3 convert_hf_model.py /path/to/Llama-2-7b-hf models/llama2_7b_hf
```

Build and run the `llama2-generator` for text generation:
Build and run `llama2-generator`:

```sh
zig build -Doptimize=ReleaseFast
@@ -55,7 +55,40 @@ The output on an Apple M1 Pro with 32 GB of memory:
```
Once Upon a Time in Hollywood is a 2019 American comedy-drama film written and directed by Quentin Tarantino
achieved: 1.821 tok/s
achieved: 1.800 tok/s
```

## Run Llama 2 7B Chat from Hugging Face

Install `git-lfs` and clone the [Llama 2 7B Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) model from Hugging Face:

```sh
# Make sure you have git-lfs installed (https://git-lfs.com)
git lfs install
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
```

Install the necessary Python packages and convert the Hugging Face model:

```sh
pip3 install -r requirements.txt
python3 convert_hf_model.py /path/to/Llama-2-7b-chat-hf models/llama2_7b_chat_hf
```

Build and run `llama2-chat`:

```sh
zig build -Doptimize=ReleaseFast
./zig-out/bin/llama2-chat models/llama2_7b_chat_hf
```

The output on an Apple M1 Pro with 32 GB of memory:

```
Enter system prompt (optional):
User: Hello
Assistant: Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?
User: ...
```

## Help
36 changes: 10 additions & 26 deletions build.zig
@@ -4,77 +4,61 @@ pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});

const chat_exe = b.addExecutable(.{
.name = "llama2-chat",
.root_source_file = .{ .path = "src/chat_main.zig" },
.target = target,
.optimize = optimize,
});

const generator_exe = b.addExecutable(.{
.name = "llama2-generator",
.root_source_file = .{ .path = "src/generator_main.zig" },
.target = target,
.optimize = optimize,
});

const converter_exe = b.addExecutable(.{
.name = "llama2-converter",
.root_source_file = .{ .path = "src/converter_main.zig" },
const chat_exe = b.addExecutable(.{
.name = "llama2-chat",
.root_source_file = .{ .path = "src/chat_main.zig" },
.target = target,
.optimize = optimize,
});

const build_options = b.addOptions();

chat_exe.addOptions("build_options", build_options);
generator_exe.addOptions("build_options", build_options);
converter_exe.addOptions("build_options", build_options);
chat_exe.addOptions("build_options", build_options);

// This declares intent for the executable to be installed into the
// standard location when the user invokes the "install" step (the default
// step when running `zig build`).
b.installArtifact(chat_exe);
b.installArtifact(generator_exe);
b.installArtifact(converter_exe);
b.installArtifact(chat_exe);

// This *creates* a Run step in the build graph, to be executed when another
// step is evaluated that depends on it. The next line below will establish
// such a dependency.
const run_chat_cmd = b.addRunArtifact(chat_exe);
const run_generator_cmd = b.addRunArtifact(generator_exe);
const run_converter_cmd = b.addRunArtifact(converter_exe);
const run_chat_cmd = b.addRunArtifact(chat_exe);

// By making the run step depend on the install step, it will be run from the
// installation directory rather than directly from within the cache directory.
// This is not necessary, however, if the application depends on other installed
// files, this ensures they will be present and in the expected location.
run_chat_cmd.step.dependOn(b.getInstallStep());
run_generator_cmd.step.dependOn(b.getInstallStep());
run_converter_cmd.step.dependOn(b.getInstallStep());
run_chat_cmd.step.dependOn(b.getInstallStep());

// This allows the user to pass arguments to the application in the build
// command itself, like this: `zig build run -- arg1 arg2 etc`
if (b.args) |args| {
run_chat_cmd.addArgs(args);
run_generator_cmd.addArgs(args);
run_converter_cmd.addArgs(args);
run_chat_cmd.addArgs(args);
}

// This creates a build step. It will be visible in the `zig build --help` menu,
// and can be selected like this: `zig build run`
// This will evaluate the `run` step rather than the default, which is "install".
const run_chat_step = b.step("run-chat", "Run the chat");

run_chat_step.dependOn(&run_chat_cmd.step);

const run_generator_step = b.step("run-generator", "Run the generator");

run_generator_step.dependOn(&run_generator_cmd.step);

const run_converter_step = b.step("run-converter", "Run the converter");
const run_chat_step = b.step("run-chat", "Run the chat");

run_converter_step.dependOn(&run_converter_cmd.step);
run_chat_step.dependOn(&run_chat_cmd.step);

const test_step = b.step("test", "Run unit tests");

163 changes: 73 additions & 90 deletions src/attention.zig
@@ -2,111 +2,91 @@ const Self = @This();

const std = @import("std");
const Checkpoint = @import("checkpoint.zig");
const math = @import("./math.zig");
const Tensor = @import("./tensor.zig").Tensor;
const math = @import("math.zig");
const simd = @import("simd.zig");
const Vector = @import("vector.zig");

allocator: std.mem.Allocator,
checkpoint: Checkpoint,
head_size: usize,
head_size_sqrt: f32,
input_buffer: Tensor(2),
output_buffer: Tensor(1),
query_buffer: Tensor(2),
key_cache: Tensor(4),
value_cache: Tensor(4),
input: Vector,
output: Vector,
multi_query: Vector,
key_cache: []const []const Vector,
value_cache: []const []const Vector,
scores: []f32,

pub fn init(allocator: std.mem.Allocator, checkpoint: Checkpoint, sequence_length: usize) !Self {
const embedding_size = checkpoint.embedding_size;
const n_attention_heads = checkpoint.n_attention_heads;
const head_size: usize = embedding_size / n_attention_heads;
const input_buffer = try Tensor(2).init(allocator, [_]usize{ n_attention_heads, head_size });

errdefer input_buffer.deinit();

const output_buffer = try Tensor(1).init(allocator, [_]usize{embedding_size});

errdefer output_buffer.deinit();

const query_buffer = try Tensor(2).init(allocator, [_]usize{ n_attention_heads, head_size });

errdefer query_buffer.deinit();

const n_layers = checkpoint.n_layers;
const n_attention_query_groups = checkpoint.n_attention_query_groups;

const key_cache = try Tensor(4).init(
allocator,
[_]usize{ n_layers, sequence_length, n_attention_query_groups, head_size },
);

errdefer key_cache.deinit();

const value_cache = try Tensor(4).init(
allocator,
[_]usize{ n_layers, sequence_length, n_attention_query_groups, head_size },
);

errdefer value_cache.deinit();
pub fn createLeaky(
allocator: std.mem.Allocator,
checkpoint: Checkpoint,
sequence_length: usize,
) !Self {
const head_size = checkpoint.embedding_size / checkpoint.n_attention_heads;
const key_cache = try allocator.alloc([]Vector, checkpoint.n_layers);

for (key_cache) |*layer| {
layer.* = try Vector.createMultipleLeaky(
allocator,
sequence_length,
checkpoint.n_attention_query_groups * head_size,
);
}

const scores = try allocator.alloc(f32, sequence_length);
const value_cache = try allocator.alloc([]Vector, checkpoint.n_layers);

errdefer allocator.free(scores);
for (value_cache) |*layer| {
layer.* = try Vector.createMultipleLeaky(
allocator,
sequence_length,
checkpoint.n_attention_query_groups * head_size,
);
}

return .{
.allocator = allocator,
.checkpoint = checkpoint,
.head_size = head_size,
.head_size_sqrt = std.math.sqrt(@as(f32, @floatFromInt(head_size))),
.input_buffer = input_buffer,
.output_buffer = output_buffer,
.query_buffer = query_buffer,
.input = try Vector.createLeaky(allocator, checkpoint.embedding_size),
.output = try Vector.createLeaky(allocator, checkpoint.embedding_size),
.multi_query = try Vector.createLeaky(allocator, checkpoint.embedding_size),
.key_cache = key_cache,
.value_cache = value_cache,
.scores = scores,
.scores = try allocator.alloc(f32, sequence_length),
};
}

pub fn deinit(self: Self) void {
self.input_buffer.deinit();
self.output_buffer.deinit();
self.query_buffer.deinit();
self.key_cache.deinit();
self.value_cache.deinit();
self.allocator.free(self.scores);
}

pub fn forward(self: Self, layer: usize, position: usize) void {
const weights = self.checkpoint.weights;
const query_matrix = weights.attention_query_matrices.slice(layer);
const key_matrix = weights.attention_key_matrices.slice(layer);
const value_matrix = weights.attention_value_matrices.slice(layer);
const output_matrix = weights.attention_output_matrices.slice(layer);
const key_buffer = self.key_cache.slice(layer).slice(position);
const value_buffer = self.value_cache.slice(layer).slice(position);
pub fn forward(self: Self, layer: usize, position: usize) !void {
const query_weight = self.checkpoint.attention_query_weights[layer];
const key_weight = self.checkpoint.attention_key_weights[layer];
const value_weight = self.checkpoint.attention_value_weights[layer];
const output_weight = self.checkpoint.attention_output_weights[layer];
const multi_key = self.key_cache[layer][position];
const multi_value = self.value_cache[layer][position];

query_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.query_buffer);
key_matrix.computeMatrixVectorMultiplication(self.input_buffer, key_buffer);
value_matrix.computeMatrixVectorMultiplication(self.input_buffer, value_buffer);
try query_weight.multiplyVector(self.input, self.multi_query);
try key_weight.multiplyVector(self.input, multi_key);
try value_weight.multiplyVector(self.input, multi_value);

self.computeRoPE(position, key_buffer);
self.computeRoPE(position, multi_key.values);

for (0..self.checkpoint.n_attention_heads) |head| {
self.computeGQA(layer, position, head);
try self.computeGQA(layer, position, head);
}

output_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.output_buffer);
try output_weight.multiplyVector(self.input, self.output);
}

// Rotary positional embeddings: https://arxiv.org/abs/2104.09864
fn computeRoPE(self: Self, position: usize, key_buffer: Tensor(2)) void {
fn computeRoPE(self: Self, position: usize, multi_key_values: []f32) void {
@setFloatMode(.Optimized);

std.debug.assert(self.query_buffer.values.len % key_buffer.values.len == 0);
const multi_query_values = self.multi_query.values;

std.debug.assert(multi_query_values.len % multi_key_values.len == 0);

var index: usize = 0;

while (index < self.query_buffer.values.len) : (index += 2) {
while (index < multi_query_values.len) : (index += 2) {
const head: f32 = @floatFromInt(index % self.head_size);

const frequency =
@@ -116,51 +96,54 @@ fn computeRoPE(self: Self, position: usize, key_buffer: Tensor(2)) void {
const real_rotation_value: f32 = std.math.cos(rotation_scaling_factor);
const imag_rotation_value: f32 = std.math.sin(rotation_scaling_factor);

const q_0 = self.query_buffer.values[index];
const q_1 = self.query_buffer.values[index + 1];
const q_0 = multi_query_values[index];
const q_1 = multi_query_values[index + 1];

self.query_buffer.values[index] = q_0 * real_rotation_value - q_1 * imag_rotation_value;
self.query_buffer.values[index + 1] = q_0 * imag_rotation_value + q_1 * real_rotation_value;
multi_query_values[index] = q_0 * real_rotation_value - q_1 * imag_rotation_value;
multi_query_values[index + 1] = q_0 * imag_rotation_value + q_1 * real_rotation_value;

if (index < key_buffer.values.len) {
const k_0 = key_buffer.values[index];
const k_1 = key_buffer.values[index + 1];
if (index < multi_key_values.len) {
const k_0 = multi_key_values[index];
const k_1 = multi_key_values[index + 1];

key_buffer.values[index] = k_0 * real_rotation_value - k_1 * imag_rotation_value;
key_buffer.values[index + 1] = k_0 * imag_rotation_value + k_1 * real_rotation_value;
multi_key_values[index] = k_0 * real_rotation_value - k_1 * imag_rotation_value;
multi_key_values[index + 1] = k_0 * imag_rotation_value + k_1 * real_rotation_value;
}
}
}
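For reference, the pairwise rotation applied above corresponds to the RoPE formulation from the linked paper. A sketch in LaTeX, assuming the usual base of 10000 for the frequency term (the exact `frequency` expression is cut off in this hunk):

```latex
% Rotation of one (even, odd) pair at position p, head dimension d:
\begin{pmatrix} x'_{2i} \\ x'_{2i+1} \end{pmatrix}
=
\begin{pmatrix}
\cos(p\,\theta_i) & -\sin(p\,\theta_i) \\
\sin(p\,\theta_i) & \cos(p\,\theta_i)
\end{pmatrix}
\begin{pmatrix} x_{2i} \\ x_{2i+1} \end{pmatrix},
\qquad
\theta_i = 10000^{-2i/d}
```

The same rotation is applied in place to the query values and, for the shared indices, to the cached key values, matching the two branches above.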

// Grouped-query attention: https://arxiv.org/abs/2305.13245v1
fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) void {
fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) !void {
@setFloatMode(.Optimized);

const query_vector = self.query_buffer.slice(head);
const query_values = self.multi_query.values[head * self.head_size ..][0..self.head_size];

const query_group =
head / (self.checkpoint.n_attention_heads / self.checkpoint.n_attention_query_groups);

const next_position = current_position + 1;

for (0..next_position) |position| {
const key_vector = self.key_cache.slice(layer).slice(position).slice(query_group);
const multi_key = self.key_cache[layer][position];
const key_values = multi_key.values[query_group * self.head_size ..][0..self.head_size];

self.scores[position] = query_vector.computeScalarProduct(key_vector) / self.head_size_sqrt;
self.scores[position] =
try simd.computeScalarProduct(query_values, key_values) / self.head_size_sqrt;
}

math.softmax(self.scores[0..next_position]);

const attention_buffer = self.input_buffer.slice(head);
const attention_values = self.input.values[head * self.head_size ..][0..self.head_size];

@memset(attention_buffer.values, 0);
@memset(attention_values, 0);

for (0..next_position) |position| {
const value_vector = self.value_cache.slice(layer).slice(position).slice(query_group);
const multi_value = self.value_cache[layer][position];
const value_values = multi_value.values[query_group * self.head_size ..][0..self.head_size];
const weight = self.scores[position];

for (0..self.head_size) |index| {
attention_buffer.values[index] += value_vector.values[index] * weight;
attention_values[index] += value_values[index] * weight;
}
}
}
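In equation form, the loop above is scaled dot-product attention in which query head h reads keys and values from its group g(h). A sketch, writing P for the current position and d_head for the head size:

```latex
g(h) = \left\lfloor \frac{h}{\,n_\text{heads} / n_\text{groups}\,} \right\rfloor,
\qquad
s_t = \frac{q_h \cdot k_{g(h),t}}{\sqrt{d_\text{head}}},
\qquad
a = \operatorname{softmax}(s_{0:P}),
\qquad
o_h = \sum_{t=0}^{P} a_t \, v_{g(h),t}
```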