Implemented Arena Allocator, dropped legacy file format, aligned vectors to cache line (no performance benefit observed)
clebert committed Oct 22, 2023
1 parent d5be75c commit af6c25c
Showing 21 changed files with 511 additions and 1,094 deletions.
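For context on the "Arena Allocator" and cache-line alignment mentioned in the commit message, the snippet below is a minimal sketch of the standard `std.heap.ArenaAllocator` pattern in Zig. It is not code from this commit; the buffer sizes and the 64-byte alignment value are assumptions chosen only for illustration.

```zig
const std = @import("std");

pub fn main() !void {
    // An arena owns every allocation made through it and frees them all in
    // a single deinit() call, so helpers that allocate through it can skip
    // individual frees and errdefer cleanup.
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();

    const allocator = arena.allocator();

    // Plain allocation through the arena (size is an assumption).
    const scores = try allocator.alloc(f32, 2048);

    // Allocation aligned to a 64-byte cache line (size is an assumption).
    const values = try allocator.alignedAlloc(f32, 64, 4096);

    std.debug.print("scores: {d} floats, values: {d} floats\n", .{ scores.len, values.len });
}
```

Because the arena releases everything at once, per-object `deinit` functions become unnecessary, which appears to be what the `createLeaky` naming introduced in this commit reflects.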
43 changes: 38 additions & 5 deletions README.md
@@ -8,7 +8,7 @@ This project is a port of Andrej Karpathy's [llama2.c](https://github.com/karpat

## Usage

Build and run the `llama2-generator` for text generation:
Build and run `llama2-generator`:

```sh
zig build -Doptimize=ReleaseFast
@@ -23,10 +23,10 @@ Lily wanted to play with the ball, but it was too high up in the sky. She tried
Lily found a stick and tried to hit the ball. But the stick was too short. She tried again and again, but she couldn't reach it. She felt sad.
Suddenly, a kind man came by and saw Lily. He asked her what was wrong. Lily told him about the ball. The man smiled and said, "I have a useful idea!" He took out a long stick and used it to knock the ball down. Lily was so happy! She thanked the man and they played together in the sunshine.
achieved: 726.974 tok/s
achieved: 719.870 tok/s
```

## Run Llama 2 from Hugging Face
## Run Llama 2 7B from Hugging Face

Install `git-lfs` and clone the [Llama 2 7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) model from Hugging Face:

@@ -43,7 +43,7 @@ pip3 install -r requirements.txt
python3 convert_hf_model.py /path/to/Llama-2-7b-hf models/llama2_7b_hf
```

Build and run the `llama2-generator` for text generation:
Build and run `llama2-generator`:

```sh
zig build -Doptimize=ReleaseFast
@@ -55,7 +55,40 @@ The output on an Apple M1 Pro with 32 GB of memory:
```
Once Upon a Time in Hollywood is a 2019 American comedy-drama film written and directed by Quentin Tarantino
achieved: 1.821 tok/s
achieved: 1.800 tok/s
```

## Run Llama 2 7B Chat from Hugging Face

Install `git-lfs` and clone the [Llama 2 7B Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) model from Hugging Face:

```sh
# Make sure you have git-lfs installed (https://git-lfs.com)
git lfs install
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
```

Install the necessary Python packages and convert the Hugging Face model:

```sh
pip3 install -r requirements.txt
python3 convert_hf_model.py /path/to/Llama-2-7b-chat-hf models/llama2_7b_chat_hf
```

Build and run `llama2-chat`:

```sh
zig build -Doptimize=ReleaseFast
./zig-out/bin/llama2-chat models/llama2_7b_chat_hf
```

The output on an Apple M1 Pro with 32 GB of memory:

```
Enter system prompt (optional):
User: Hello
Assistant: Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?
User: ...
```

## Help
36 changes: 10 additions & 26 deletions build.zig
@@ -4,77 +4,61 @@ pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});

const chat_exe = b.addExecutable(.{
.name = "llama2-chat",
.root_source_file = .{ .path = "src/chat_main.zig" },
.target = target,
.optimize = optimize,
});

const generator_exe = b.addExecutable(.{
.name = "llama2-generator",
.root_source_file = .{ .path = "src/generator_main.zig" },
.target = target,
.optimize = optimize,
});

const converter_exe = b.addExecutable(.{
.name = "llama2-converter",
.root_source_file = .{ .path = "src/converter_main.zig" },
const chat_exe = b.addExecutable(.{
.name = "llama2-chat",
.root_source_file = .{ .path = "src/chat_main.zig" },
.target = target,
.optimize = optimize,
});

const build_options = b.addOptions();

chat_exe.addOptions("build_options", build_options);
generator_exe.addOptions("build_options", build_options);
converter_exe.addOptions("build_options", build_options);
chat_exe.addOptions("build_options", build_options);

// This declares intent for the executable to be installed into the
// standard location when the user invokes the "install" step (the default
// step when running `zig build`).
b.installArtifact(chat_exe);
b.installArtifact(generator_exe);
b.installArtifact(converter_exe);
b.installArtifact(chat_exe);

// This *creates* a Run step in the build graph, to be executed when another
// step is evaluated that depends on it. The next line below will establish
// such a dependency.
const run_chat_cmd = b.addRunArtifact(chat_exe);
const run_generator_cmd = b.addRunArtifact(generator_exe);
const run_converter_cmd = b.addRunArtifact(converter_exe);
const run_chat_cmd = b.addRunArtifact(chat_exe);

// By making the run step depend on the install step, it will be run from the
// installation directory rather than directly from within the cache directory.
// This is not necessary, however, if the application depends on other installed
// files, this ensures they will be present and in the expected location.
run_chat_cmd.step.dependOn(b.getInstallStep());
run_generator_cmd.step.dependOn(b.getInstallStep());
run_converter_cmd.step.dependOn(b.getInstallStep());
run_chat_cmd.step.dependOn(b.getInstallStep());

// This allows the user to pass arguments to the application in the build
// command itself, like this: `zig build run -- arg1 arg2 etc`
if (b.args) |args| {
run_chat_cmd.addArgs(args);
run_generator_cmd.addArgs(args);
run_converter_cmd.addArgs(args);
run_chat_cmd.addArgs(args);
}

// This creates a build step. It will be visible in the `zig build --help` menu,
// and can be selected like this: `zig build run`
// This will evaluate the `run` step rather than the default, which is "install".
const run_chat_step = b.step("run-chat", "Run the chat");

run_chat_step.dependOn(&run_chat_cmd.step);

const run_generator_step = b.step("run-generator", "Run the generator");

run_generator_step.dependOn(&run_generator_cmd.step);

const run_converter_step = b.step("run-converter", "Run the converter");
const run_chat_step = b.step("run-chat", "Run the chat");

run_converter_step.dependOn(&run_converter_cmd.step);
run_chat_step.dependOn(&run_chat_cmd.step);

const test_step = b.step("test", "Run unit tests");

163 changes: 73 additions & 90 deletions src/attention.zig
@@ -2,111 +2,91 @@ const Self = @This();

const std = @import("std");
const Checkpoint = @import("checkpoint.zig");
const math = @import("./math.zig");
const Tensor = @import("./tensor.zig").Tensor;
const math = @import("math.zig");
const simd = @import("simd.zig");
const Vector = @import("vector.zig");

allocator: std.mem.Allocator,
checkpoint: Checkpoint,
head_size: usize,
head_size_sqrt: f32,
input_buffer: Tensor(2),
output_buffer: Tensor(1),
query_buffer: Tensor(2),
key_cache: Tensor(4),
value_cache: Tensor(4),
input: Vector,
output: Vector,
multi_query: Vector,
key_cache: []const []const Vector,
value_cache: []const []const Vector,
scores: []f32,

pub fn init(allocator: std.mem.Allocator, checkpoint: Checkpoint, sequence_length: usize) !Self {
const embedding_size = checkpoint.embedding_size;
const n_attention_heads = checkpoint.n_attention_heads;
const head_size: usize = embedding_size / n_attention_heads;
const input_buffer = try Tensor(2).init(allocator, [_]usize{ n_attention_heads, head_size });

errdefer input_buffer.deinit();

const output_buffer = try Tensor(1).init(allocator, [_]usize{embedding_size});

errdefer output_buffer.deinit();

const query_buffer = try Tensor(2).init(allocator, [_]usize{ n_attention_heads, head_size });

errdefer query_buffer.deinit();

const n_layers = checkpoint.n_layers;
const n_attention_query_groups = checkpoint.n_attention_query_groups;

const key_cache = try Tensor(4).init(
allocator,
[_]usize{ n_layers, sequence_length, n_attention_query_groups, head_size },
);

errdefer key_cache.deinit();

const value_cache = try Tensor(4).init(
allocator,
[_]usize{ n_layers, sequence_length, n_attention_query_groups, head_size },
);

errdefer value_cache.deinit();
pub fn createLeaky(
allocator: std.mem.Allocator,
checkpoint: Checkpoint,
sequence_length: usize,
) !Self {
const head_size = checkpoint.embedding_size / checkpoint.n_attention_heads;
const key_cache = try allocator.alloc([]Vector, checkpoint.n_layers);

for (key_cache) |*layer| {
layer.* = try Vector.createMultipleLeaky(
allocator,
sequence_length,
checkpoint.n_attention_query_groups * head_size,
);
}

const scores = try allocator.alloc(f32, sequence_length);
const value_cache = try allocator.alloc([]Vector, checkpoint.n_layers);

errdefer allocator.free(scores);
for (value_cache) |*layer| {
layer.* = try Vector.createMultipleLeaky(
allocator,
sequence_length,
checkpoint.n_attention_query_groups * head_size,
);
}

return .{
.allocator = allocator,
.checkpoint = checkpoint,
.head_size = head_size,
.head_size_sqrt = std.math.sqrt(@as(f32, @floatFromInt(head_size))),
.input_buffer = input_buffer,
.output_buffer = output_buffer,
.query_buffer = query_buffer,
.input = try Vector.createLeaky(allocator, checkpoint.embedding_size),
.output = try Vector.createLeaky(allocator, checkpoint.embedding_size),
.multi_query = try Vector.createLeaky(allocator, checkpoint.embedding_size),
.key_cache = key_cache,
.value_cache = value_cache,
.scores = scores,
.scores = try allocator.alloc(f32, sequence_length),
};
}

pub fn deinit(self: Self) void {
self.input_buffer.deinit();
self.output_buffer.deinit();
self.query_buffer.deinit();
self.key_cache.deinit();
self.value_cache.deinit();
self.allocator.free(self.scores);
}

pub fn forward(self: Self, layer: usize, position: usize) void {
const weights = self.checkpoint.weights;
const query_matrix = weights.attention_query_matrices.slice(layer);
const key_matrix = weights.attention_key_matrices.slice(layer);
const value_matrix = weights.attention_value_matrices.slice(layer);
const output_matrix = weights.attention_output_matrices.slice(layer);
const key_buffer = self.key_cache.slice(layer).slice(position);
const value_buffer = self.value_cache.slice(layer).slice(position);
pub fn forward(self: Self, layer: usize, position: usize) !void {
const query_weight = self.checkpoint.attention_query_weights[layer];
const key_weight = self.checkpoint.attention_key_weights[layer];
const value_weight = self.checkpoint.attention_value_weights[layer];
const output_weight = self.checkpoint.attention_output_weights[layer];
const multi_key = self.key_cache[layer][position];
const multi_value = self.value_cache[layer][position];

query_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.query_buffer);
key_matrix.computeMatrixVectorMultiplication(self.input_buffer, key_buffer);
value_matrix.computeMatrixVectorMultiplication(self.input_buffer, value_buffer);
try query_weight.multiplyVector(self.input, self.multi_query);
try key_weight.multiplyVector(self.input, multi_key);
try value_weight.multiplyVector(self.input, multi_value);

self.computeRoPE(position, key_buffer);
self.computeRoPE(position, multi_key.values);

for (0..self.checkpoint.n_attention_heads) |head| {
self.computeGQA(layer, position, head);
try self.computeGQA(layer, position, head);
}

output_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.output_buffer);
try output_weight.multiplyVector(self.input, self.output);
}

// Rotary positional embeddings: https://arxiv.org/abs/2104.09864
fn computeRoPE(self: Self, position: usize, key_buffer: Tensor(2)) void {
fn computeRoPE(self: Self, position: usize, multi_key_values: []f32) void {
@setFloatMode(.Optimized);

std.debug.assert(self.query_buffer.values.len % key_buffer.values.len == 0);
const multi_query_values = self.multi_query.values;

std.debug.assert(multi_query_values.len % multi_key_values.len == 0);

var index: usize = 0;

while (index < self.query_buffer.values.len) : (index += 2) {
while (index < multi_query_values.len) : (index += 2) {
const head: f32 = @floatFromInt(index % self.head_size);

const frequency =
@@ -116,51 +96,54 @@ fn computeRoPE(self: Self, position: usize, key_buffer: Tensor(2)) void {
const real_rotation_value: f32 = std.math.cos(rotation_scaling_factor);
const imag_rotation_value: f32 = std.math.sin(rotation_scaling_factor);

const q_0 = self.query_buffer.values[index];
const q_1 = self.query_buffer.values[index + 1];
const q_0 = multi_query_values[index];
const q_1 = multi_query_values[index + 1];

self.query_buffer.values[index] = q_0 * real_rotation_value - q_1 * imag_rotation_value;
self.query_buffer.values[index + 1] = q_0 * imag_rotation_value + q_1 * real_rotation_value;
multi_query_values[index] = q_0 * real_rotation_value - q_1 * imag_rotation_value;
multi_query_values[index + 1] = q_0 * imag_rotation_value + q_1 * real_rotation_value;

if (index < key_buffer.values.len) {
const k_0 = key_buffer.values[index];
const k_1 = key_buffer.values[index + 1];
if (index < multi_key_values.len) {
const k_0 = multi_key_values[index];
const k_1 = multi_key_values[index + 1];

key_buffer.values[index] = k_0 * real_rotation_value - k_1 * imag_rotation_value;
key_buffer.values[index + 1] = k_0 * imag_rotation_value + k_1 * real_rotation_value;
multi_key_values[index] = k_0 * real_rotation_value - k_1 * imag_rotation_value;
multi_key_values[index + 1] = k_0 * imag_rotation_value + k_1 * real_rotation_value;
}
}
}
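For reference, the pairwise rotation applied above corresponds to the RoPE formulation from the linked paper. A sketch in LaTeX, assuming the usual base of 10000 for the frequency term (the exact `frequency` expression is cut off in this hunk):

```latex
% Rotation of one (even, odd) pair at position p, head dimension d:
\begin{pmatrix} x'_{2i} \\ x'_{2i+1} \end{pmatrix}
=
\begin{pmatrix}
\cos(p\,\theta_i) & -\sin(p\,\theta_i) \\
\sin(p\,\theta_i) & \cos(p\,\theta_i)
\end{pmatrix}
\begin{pmatrix} x_{2i} \\ x_{2i+1} \end{pmatrix},
\qquad
\theta_i = 10000^{-2i/d}
```

The same rotation is applied in place to the query values and, for the shared indices, to the cached key values, matching the two branches above.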

// Grouped-query attention: https://arxiv.org/abs/2305.13245v1
fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) void {
fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) !void {
@setFloatMode(.Optimized);

const query_vector = self.query_buffer.slice(head);
const query_values = self.multi_query.values[head * self.head_size ..][0..self.head_size];

const query_group =
head / (self.checkpoint.n_attention_heads / self.checkpoint.n_attention_query_groups);

const next_position = current_position + 1;

for (0..next_position) |position| {
const key_vector = self.key_cache.slice(layer).slice(position).slice(query_group);
const multi_key = self.key_cache[layer][position];
const key_values = multi_key.values[query_group * self.head_size ..][0..self.head_size];

self.scores[position] = query_vector.computeScalarProduct(key_vector) / self.head_size_sqrt;
self.scores[position] =
try simd.computeScalarProduct(query_values, key_values) / self.head_size_sqrt;
}

math.softmax(self.scores[0..next_position]);

const attention_buffer = self.input_buffer.slice(head);
const attention_values = self.input.values[head * self.head_size ..][0..self.head_size];

@memset(attention_buffer.values, 0);
@memset(attention_values, 0);

for (0..next_position) |position| {
const value_vector = self.value_cache.slice(layer).slice(position).slice(query_group);
const multi_value = self.value_cache[layer][position];
const value_values = multi_value.values[query_group * self.head_size ..][0..self.head_size];
const weight = self.scores[position];

for (0..self.head_size) |index| {
attention_buffer.values[index] += value_vector.values[index] * weight;
attention_values[index] += value_values[index] * weight;
}
}
}
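In equation form, the loop above is scaled dot-product attention in which query head h reads keys and values from its group g(h). A sketch, writing P for the current position and d_head for the head size:

```latex
g(h) = \left\lfloor \frac{h}{\,n_\text{heads} / n_\text{groups}\,} \right\rfloor,
\qquad
s_t = \frac{q_h \cdot k_{g(h),t}}{\sqrt{d_\text{head}}},
\qquad
a = \operatorname{softmax}(s_{0:P}),
\qquad
o_h = \sum_{t=0}^{P} a_t \, v_{g(h),t}
```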