diff --git a/README.md b/README.md
index 77c8612..0c4664f 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This project is a port of Andrej Karpathy's [llama2.c](https://github.com/karpat
## Usage
-Build and run the `llama2-generator` for text generation:
+Build and run `llama2-generator`:
```sh
zig build -Doptimize=ReleaseFast
@@ -23,10 +23,10 @@ Lily wanted to play with the ball, but it was too high up in the sky. She tried
Lily found a stick and tried to hit the ball. But the stick was too short. She tried again and again, but she couldn't reach it. She felt sad.
Suddenly, a kind man came by and saw Lily. He asked her what was wrong. Lily told him about the ball. The man smiled and said, "I have a useful idea!" He took out a long stick and used it to knock the ball down. Lily was so happy! She thanked the man and they played together in the sunshine.
-achieved: 726.974 tok/s
+achieved: 719.870 tok/s
```
-## Run Llama 2 from Hugging Face
+## Run Llama 2 7B from Hugging Face
Install `git-lfs` and clone the [Llama 2 7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) model from Hugging Face:
@@ -43,7 +43,7 @@ pip3 install -r requirements.txt
python3 convert_hf_model.py /path/to/Llama-2-7b-hf models/llama2_7b_hf
```
-Build and run the `llama2-generator` for text generation:
+Build and run `llama2-generator`:
```sh
zig build -Doptimize=ReleaseFast
@@ -55,7 +55,40 @@ The output on an Apple M1 Pro with 32 GB of memory:
```
Once Upon a Time in Hollywood is a 2019 American comedy-drama film written and directed by Quentin Tarantino
-achieved: 1.821 tok/s
+achieved: 1.800 tok/s
+```
+
+## Run Llama 2 7B Chat from Hugging Face
+
+Install `git-lfs` and clone the [Llama 2 7B Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) model from Hugging Face:
+
+```sh
+# Make sure you have git-lfs installed (https://git-lfs.com)
+git lfs install
+git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+```
+
+Install the necessary Python packages and convert the Hugging Face model:
+
+```sh
+pip3 install -r requirements.txt
+python3 convert_hf_model.py /path/to/Llama-2-7b-chat-hf models/llama2_7b_chat_hf
+```
+
+Build and run `llama2-chat`:
+
+```sh
+zig build -Doptimize=ReleaseFast
+./zig-out/bin/llama2-chat models/llama2_7b_chat_hf
+```
+
+The output on an Apple M1 Pro with 32 GB of memory:
+
+```
+Enter system prompt (optional):
+User: Hello
+Assistant: Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?
+User: ...
```
## Help
diff --git a/build.zig b/build.zig
index c04fcbc..ad8280f 100644
--- a/build.zig
+++ b/build.zig
@@ -4,13 +4,6 @@ pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});
- const chat_exe = b.addExecutable(.{
- .name = "llama2-chat",
- .root_source_file = .{ .path = "src/chat_main.zig" },
- .target = target,
- .optimize = optimize,
- });
-
const generator_exe = b.addExecutable(.{
.name = "llama2-generator",
.root_source_file = .{ .path = "src/generator_main.zig" },
@@ -18,63 +11,54 @@ pub fn build(b: *std.Build) void {
.optimize = optimize,
});
- const converter_exe = b.addExecutable(.{
- .name = "llama2-converter",
- .root_source_file = .{ .path = "src/converter_main.zig" },
+ const chat_exe = b.addExecutable(.{
+ .name = "llama2-chat",
+ .root_source_file = .{ .path = "src/chat_main.zig" },
.target = target,
.optimize = optimize,
});
const build_options = b.addOptions();
- chat_exe.addOptions("build_options", build_options);
generator_exe.addOptions("build_options", build_options);
- converter_exe.addOptions("build_options", build_options);
+ chat_exe.addOptions("build_options", build_options);
// This declares intent for the executable to be installed into the
// standard location when the user invokes the "install" step (the default
// step when running `zig build`).
- b.installArtifact(chat_exe);
b.installArtifact(generator_exe);
- b.installArtifact(converter_exe);
+ b.installArtifact(chat_exe);
// This *creates* a Run step in the build graph, to be executed when another
// step is evaluated that depends on it. The next line below will establish
// such a dependency.
- const run_chat_cmd = b.addRunArtifact(chat_exe);
const run_generator_cmd = b.addRunArtifact(generator_exe);
- const run_converter_cmd = b.addRunArtifact(converter_exe);
+ const run_chat_cmd = b.addRunArtifact(chat_exe);
// By making the run step depend on the install step, it will be run from the
// installation directory rather than directly from within the cache directory.
// This is not necessary, however, if the application depends on other installed
// files, this ensures they will be present and in the expected location.
- run_chat_cmd.step.dependOn(b.getInstallStep());
run_generator_cmd.step.dependOn(b.getInstallStep());
- run_converter_cmd.step.dependOn(b.getInstallStep());
+ run_chat_cmd.step.dependOn(b.getInstallStep());
// This allows the user to pass arguments to the application in the build
// command itself, like this: `zig build run -- arg1 arg2 etc`
if (b.args) |args| {
- run_chat_cmd.addArgs(args);
run_generator_cmd.addArgs(args);
- run_converter_cmd.addArgs(args);
+ run_chat_cmd.addArgs(args);
}
// This creates a build step. It will be visible in the `zig build --help` menu,
// and can be selected like this: `zig build run`
// This will evaluate the `run` step rather than the default, which is "install".
- const run_chat_step = b.step("run-chat", "Run the chat");
-
- run_chat_step.dependOn(&run_chat_cmd.step);
-
const run_generator_step = b.step("run-generator", "Run the generator");
run_generator_step.dependOn(&run_generator_cmd.step);
- const run_converter_step = b.step("run-converter", "Run the converter");
+ const run_chat_step = b.step("run-chat", "Run the chat");
- run_converter_step.dependOn(&run_converter_cmd.step);
+ run_chat_step.dependOn(&run_chat_cmd.step);
const test_step = b.step("test", "Run unit tests");
diff --git a/src/attention.zig b/src/attention.zig
index 43c7df3..2065bbe 100644
--- a/src/attention.zig
+++ b/src/attention.zig
@@ -2,111 +2,91 @@ const Self = @This();
const std = @import("std");
const Checkpoint = @import("checkpoint.zig");
-const math = @import("./math.zig");
-const Tensor = @import("./tensor.zig").Tensor;
+const math = @import("math.zig");
+const simd = @import("simd.zig");
+const Vector = @import("vector.zig");
-allocator: std.mem.Allocator,
checkpoint: Checkpoint,
head_size: usize,
head_size_sqrt: f32,
-input_buffer: Tensor(2),
-output_buffer: Tensor(1),
-query_buffer: Tensor(2),
-key_cache: Tensor(4),
-value_cache: Tensor(4),
+input: Vector,
+output: Vector,
+multi_query: Vector,
+key_cache: []const []const Vector,
+value_cache: []const []const Vector,
scores: []f32,
-pub fn init(allocator: std.mem.Allocator, checkpoint: Checkpoint, sequence_length: usize) !Self {
- const embedding_size = checkpoint.embedding_size;
- const n_attention_heads = checkpoint.n_attention_heads;
- const head_size: usize = embedding_size / n_attention_heads;
- const input_buffer = try Tensor(2).init(allocator, [_]usize{ n_attention_heads, head_size });
-
- errdefer input_buffer.deinit();
-
- const output_buffer = try Tensor(1).init(allocator, [_]usize{embedding_size});
-
- errdefer output_buffer.deinit();
-
- const query_buffer = try Tensor(2).init(allocator, [_]usize{ n_attention_heads, head_size });
-
- errdefer query_buffer.deinit();
-
- const n_layers = checkpoint.n_layers;
- const n_attention_query_groups = checkpoint.n_attention_query_groups;
-
- const key_cache = try Tensor(4).init(
- allocator,
- [_]usize{ n_layers, sequence_length, n_attention_query_groups, head_size },
- );
-
- errdefer key_cache.deinit();
-
- const value_cache = try Tensor(4).init(
- allocator,
- [_]usize{ n_layers, sequence_length, n_attention_query_groups, head_size },
- );
-
- errdefer value_cache.deinit();
+pub fn createLeaky(
+ allocator: std.mem.Allocator,
+ checkpoint: Checkpoint,
+ sequence_length: usize,
+) !Self {
+ const head_size = checkpoint.embedding_size / checkpoint.n_attention_heads;
+ const key_cache = try allocator.alloc([]Vector, checkpoint.n_layers);
+
+ for (key_cache) |*layer| {
+ layer.* = try Vector.createMultipleLeaky(
+ allocator,
+ sequence_length,
+ checkpoint.n_attention_query_groups * head_size,
+ );
+ }
- const scores = try allocator.alloc(f32, sequence_length);
+ const value_cache = try allocator.alloc([]Vector, checkpoint.n_layers);
- errdefer allocator.free(scores);
+ for (value_cache) |*layer| {
+ layer.* = try Vector.createMultipleLeaky(
+ allocator,
+ sequence_length,
+ checkpoint.n_attention_query_groups * head_size,
+ );
+ }
return .{
- .allocator = allocator,
.checkpoint = checkpoint,
.head_size = head_size,
.head_size_sqrt = std.math.sqrt(@as(f32, @floatFromInt(head_size))),
- .input_buffer = input_buffer,
- .output_buffer = output_buffer,
- .query_buffer = query_buffer,
+ .input = try Vector.createLeaky(allocator, checkpoint.embedding_size),
+ .output = try Vector.createLeaky(allocator, checkpoint.embedding_size),
+ .multi_query = try Vector.createLeaky(allocator, checkpoint.embedding_size),
.key_cache = key_cache,
.value_cache = value_cache,
- .scores = scores,
+ .scores = try allocator.alloc(f32, sequence_length),
};
}
-pub fn deinit(self: Self) void {
- self.input_buffer.deinit();
- self.output_buffer.deinit();
- self.query_buffer.deinit();
- self.key_cache.deinit();
- self.value_cache.deinit();
- self.allocator.free(self.scores);
-}
-
-pub fn forward(self: Self, layer: usize, position: usize) void {
- const weights = self.checkpoint.weights;
- const query_matrix = weights.attention_query_matrices.slice(layer);
- const key_matrix = weights.attention_key_matrices.slice(layer);
- const value_matrix = weights.attention_value_matrices.slice(layer);
- const output_matrix = weights.attention_output_matrices.slice(layer);
- const key_buffer = self.key_cache.slice(layer).slice(position);
- const value_buffer = self.value_cache.slice(layer).slice(position);
+pub fn forward(self: Self, layer: usize, position: usize) !void {
+ const query_weight = self.checkpoint.attention_query_weights[layer];
+ const key_weight = self.checkpoint.attention_key_weights[layer];
+ const value_weight = self.checkpoint.attention_value_weights[layer];
+ const output_weight = self.checkpoint.attention_output_weights[layer];
+ const multi_key = self.key_cache[layer][position];
+ const multi_value = self.value_cache[layer][position];
- query_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.query_buffer);
- key_matrix.computeMatrixVectorMultiplication(self.input_buffer, key_buffer);
- value_matrix.computeMatrixVectorMultiplication(self.input_buffer, value_buffer);
+ try query_weight.multiplyVector(self.input, self.multi_query);
+ try key_weight.multiplyVector(self.input, multi_key);
+ try value_weight.multiplyVector(self.input, multi_value);
- self.computeRoPE(position, key_buffer);
+ self.computeRoPE(position, multi_key.values);
for (0..self.checkpoint.n_attention_heads) |head| {
- self.computeGQA(layer, position, head);
+ try self.computeGQA(layer, position, head);
}
- output_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.output_buffer);
+ try output_weight.multiplyVector(self.input, self.output);
}
// Rotary positional embeddings: https://arxiv.org/abs/2104.09864
-fn computeRoPE(self: Self, position: usize, key_buffer: Tensor(2)) void {
+fn computeRoPE(self: Self, position: usize, multi_key_values: []f32) void {
@setFloatMode(.Optimized);
- std.debug.assert(self.query_buffer.values.len % key_buffer.values.len == 0);
+ const multi_query_values = self.multi_query.values;
+
+ std.debug.assert(multi_query_values.len % multi_key_values.len == 0);
var index: usize = 0;
- while (index < self.query_buffer.values.len) : (index += 2) {
+ while (index < multi_query_values.len) : (index += 2) {
const head: f32 = @floatFromInt(index % self.head_size);
const frequency =
@@ -116,27 +96,27 @@ fn computeRoPE(self: Self, position: usize, key_buffer: Tensor(2)) void {
const real_rotation_value: f32 = std.math.cos(rotation_scaling_factor);
const imag_rotation_value: f32 = std.math.sin(rotation_scaling_factor);
- const q_0 = self.query_buffer.values[index];
- const q_1 = self.query_buffer.values[index + 1];
+ const q_0 = multi_query_values[index];
+ const q_1 = multi_query_values[index + 1];
- self.query_buffer.values[index] = q_0 * real_rotation_value - q_1 * imag_rotation_value;
- self.query_buffer.values[index + 1] = q_0 * imag_rotation_value + q_1 * real_rotation_value;
+ multi_query_values[index] = q_0 * real_rotation_value - q_1 * imag_rotation_value;
+ multi_query_values[index + 1] = q_0 * imag_rotation_value + q_1 * real_rotation_value;
- if (index < key_buffer.values.len) {
- const k_0 = key_buffer.values[index];
- const k_1 = key_buffer.values[index + 1];
+ if (index < multi_key_values.len) {
+ const k_0 = multi_key_values[index];
+ const k_1 = multi_key_values[index + 1];
- key_buffer.values[index] = k_0 * real_rotation_value - k_1 * imag_rotation_value;
- key_buffer.values[index + 1] = k_0 * imag_rotation_value + k_1 * real_rotation_value;
+ multi_key_values[index] = k_0 * real_rotation_value - k_1 * imag_rotation_value;
+ multi_key_values[index + 1] = k_0 * imag_rotation_value + k_1 * real_rotation_value;
}
}
}
// Grouped-query attention: https://arxiv.org/abs/2305.13245v1
-fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) void {
+fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) !void {
@setFloatMode(.Optimized);
- const query_vector = self.query_buffer.slice(head);
+ const query_values = self.multi_query.values[head * self.head_size ..][0..self.head_size];
const query_group =
head / (self.checkpoint.n_attention_heads / self.checkpoint.n_attention_query_groups);
@@ -144,23 +124,26 @@ fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) vo
const next_position = current_position + 1;
for (0..next_position) |position| {
- const key_vector = self.key_cache.slice(layer).slice(position).slice(query_group);
+ const multi_key = self.key_cache[layer][position];
+ const key_values = multi_key.values[query_group * self.head_size ..][0..self.head_size];
- self.scores[position] = query_vector.computeScalarProduct(key_vector) / self.head_size_sqrt;
+ self.scores[position] =
+ try simd.computeScalarProduct(query_values, key_values) / self.head_size_sqrt;
}
math.softmax(self.scores[0..next_position]);
- const attention_buffer = self.input_buffer.slice(head);
+ const attention_values = self.input.values[head * self.head_size ..][0..self.head_size];
- @memset(attention_buffer.values, 0);
+ @memset(attention_values, 0);
for (0..next_position) |position| {
- const value_vector = self.value_cache.slice(layer).slice(position).slice(query_group);
+ const multi_value = self.value_cache[layer][position];
+ const value_values = multi_value.values[query_group * self.head_size ..][0..self.head_size];
const weight = self.scores[position];
for (0..self.head_size) |index| {
- attention_buffer.values[index] += value_vector.values[index] * weight;
+ attention_values[index] += value_values[index] * weight;
}
}
}
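
The refactor replaces the generic `Tensor(n)` buffers with flat `Vector` slices backed by plain `[]f32`. Since `src/vector.zig` itself is outside this diff, the following is only a minimal sketch of the interface its call sites imply: the names `createLeaky`, `createMultipleLeaky`, `computeScalarProduct`, and the `values` field appear in the hunks above, while the bodies and exact signatures here are assumptions.

```zig
// Sketch of src/vector.zig as implied by its call sites in this diff.
// Bodies and signatures are assumptions, not the repository's actual code.
const Self = @This();

const std = @import("std");
const simd = @import("simd.zig");

values: []f32,

pub fn createLeaky(allocator: std.mem.Allocator, n_values: usize) !Self {
    // "Leaky" because nothing is freed here; callers pass an arena allocator
    // and release everything at once via arena.deinit().
    return .{ .values = try allocator.alloc(f32, n_values) };
}

pub fn createMultipleLeaky(
    allocator: std.mem.Allocator,
    n_vectors: usize,
    n_values: usize,
) ![]Self {
    const vectors = try allocator.alloc(Self, n_vectors);

    for (vectors) |*vector| {
        vector.* = try createLeaky(allocator, n_values);
    }

    return vectors;
}

pub fn computeScalarProduct(self: Self, other: Self) !f32 {
    return simd.computeScalarProduct(self.values, other.values);
}
```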
diff --git a/src/chat.zig b/src/chat.zig
index f7290e5..1953973 100644
--- a/src/chat.zig
+++ b/src/chat.zig
@@ -7,43 +7,25 @@ const Sampler = @import("sampler.zig");
const Tokenizer = @import("tokenizer.zig");
const Transformer = @import("transformer.zig");
-allocator: std.mem.Allocator,
transformer: Transformer,
tokenizer: Tokenizer,
sampler: Sampler,
system_prompt: []const u8,
user_prompt: []const u8,
-pub fn init(allocator: std.mem.Allocator, args: ChatArgs) !Self {
- const transformer = try Transformer.init(allocator, args.model_path, args.sequence_length);
-
- errdefer transformer.deinit();
-
+pub fn createLeaky(allocator: std.mem.Allocator, args: ChatArgs) !Self {
+ const transformer = try Transformer.createLeaky(allocator, args.model_path, args.sequence_length);
const vocab_size = transformer.checkpoint.vocab_size;
- const tokenizer = try Tokenizer.init(allocator, args.model_path, vocab_size);
-
- errdefer tokenizer.deinit();
-
- const sampler = try Sampler.init(allocator, args, vocab_size);
-
- errdefer sampler.deinit();
return .{
- .allocator = allocator,
.transformer = transformer,
- .tokenizer = tokenizer,
- .sampler = sampler,
+ .tokenizer = try Tokenizer.readLeaky(allocator, args.model_path, vocab_size),
+ .sampler = try Sampler.createLeaky(allocator, args, vocab_size),
.system_prompt = args.system_prompt,
.user_prompt = args.user_prompt,
};
}
-pub fn deinit(self: Self) void {
- self.transformer.deinit();
- self.tokenizer.deinit();
- self.sampler.deinit();
-}
-
const system_prompt_template_start = "<<SYS>>\n";
const system_prompt_template_close = "\n<</SYS>>\n\n";
const user_prompt_template_start = "[INST] ";
@@ -68,7 +50,7 @@ pub fn start(self: *Self, allocator: std.mem.Allocator) !void {
};
for (0..self.transformer.sequence_length) |position| {
- self.transformer.forward(token, position);
+ try self.transformer.forward(token, position);
if (token == bos_token and user_turn) {
var user_prompt = std.ArrayList(u8).init(allocator);
@@ -129,7 +111,7 @@ pub fn start(self: *Self, allocator: std.mem.Allocator) !void {
user_prompt_tokens_index += 1;
if (next_token == 0) {
- next_token = self.sampler.sample(self.transformer.output_buffer.values);
+ next_token = self.sampler.sample(self.transformer.output.values);
}
if (next_token == eos_token) {
diff --git a/src/chat_args.zig b/src/chat_args.zig
index 040bda4..b38e9c7 100644
--- a/src/chat_args.zig
+++ b/src/chat_args.zig
@@ -2,7 +2,6 @@ const Self = @This();
const std = @import("std");
-arg_iterator: std.process.ArgIterator,
model_path: []const u8,
temperature: f32,
top_p: f32,
@@ -20,11 +19,9 @@ const Option = enum {
user_prompt,
};
-pub fn init(allocator: std.mem.Allocator) !Self {
+pub fn createLeaky(allocator: std.mem.Allocator) !Self {
var arg_iterator = try std.process.argsWithAllocator(allocator);
- errdefer arg_iterator.deinit();
-
_ = arg_iterator.next().?;
const model_path = arg_iterator.next() orelse try help(1);
@@ -78,7 +75,6 @@ pub fn init(allocator: std.mem.Allocator) !Self {
}
return .{
- .arg_iterator = arg_iterator,
.model_path = model_path,
.temperature = @max(@min(temperature orelse 1, 1), 0),
.top_p = @max(@min(top_p orelse 0.9, 1), 0),
@@ -89,10 +85,6 @@ pub fn init(allocator: std.mem.Allocator) !Self {
};
}
-pub fn deinit(self: *Self) void {
- self.arg_iterator.deinit();
-}
-
fn help(exit_status: u8) !noreturn {
const console = if (exit_status == 0)
std.io.getStdOut().writer()
diff --git a/src/chat_main.zig b/src/chat_main.zig
index 4f35682..fffe5ed 100644
--- a/src/chat_main.zig
+++ b/src/chat_main.zig
@@ -3,15 +3,13 @@ const Chat = @import("chat.zig");
const ChatArgs = @import("chat_args.zig");
pub fn main() !void {
- const allocator = std.heap.page_allocator;
+ var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
- var args = try ChatArgs.init(allocator);
+ defer arena.deinit();
- defer args.deinit();
+ const args = try ChatArgs.createLeaky(arena.allocator());
- var chat = try Chat.init(allocator, args);
+ var chat = try Chat.createLeaky(arena.allocator(), args);
- defer chat.deinit();
-
- try chat.start(allocator);
+ try chat.start(arena.allocator());
}
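
The `*Leaky` naming convention together with the arena in `main` is what makes all the removed `deinit` methods unnecessary: every allocation is drawn from the arena and released in a single `arena.deinit()` call. A minimal standalone sketch of the pattern (illustrative code, not part of the repository):

```zig
const std = @import("std");

// Illustrative only: a "leaky" helper allocates without a matching free,
// because the arena that backs the allocator owns the memory.
fn createLeakyBuffer(allocator: std.mem.Allocator, len: usize) ![]f32 {
    return allocator.alloc(f32, len); // no free; the arena owns it
}

pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);

    defer arena.deinit(); // frees every "leaky" allocation at once

    const buffer = try createLeakyBuffer(arena.allocator(), 1024);

    @memset(buffer, 0);
}
```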
diff --git a/src/checkpoint.zig b/src/checkpoint.zig
index fe5da8a..b7e6993 100644
--- a/src/checkpoint.zig
+++ b/src/checkpoint.zig
@@ -1,9 +1,9 @@
const Self = @This();
const std = @import("std");
-const Tensor = @import("./tensor.zig").Tensor;
+const Matrix = @import("matrix.zig");
+const Vector = @import("vector.zig");
-allocator: std.mem.Allocator,
embedding_size: usize,
ffn_hidden_size: usize,
n_layers: usize,
@@ -11,55 +11,21 @@ n_attention_heads: usize,
n_attention_query_groups: usize,
vocab_size: usize,
max_sequence_length: usize,
-shared_output_matrix: bool,
-weights: struct {
- token_embedding_vectors: Tensor(2),
- attention_norm_vectors: Tensor(2),
- attention_query_matrices: Tensor(3),
- attention_key_matrices: Tensor(3),
- attention_value_matrices: Tensor(3),
- attention_output_matrices: Tensor(3),
- ffn_norm_vectors: Tensor(2),
- ffn_gate_matrices: Tensor(3),
- ffn_down_matrices: Tensor(3),
- ffn_up_matrices: Tensor(3),
- output_norm_vector: Tensor(1),
- output_matrix: Tensor(2),
-},
-
-pub fn init(allocator: std.mem.Allocator, model_path: []const u8) !Self {
- const v1_path = try std.fs.path.join(
- allocator,
- &[_][]const u8{ model_path, "checkpoint_v1.bin" },
- );
-
- defer allocator.free(v1_path);
-
- const v1_file = std.fs.cwd().openFile(v1_path, .{}) catch null;
-
- defer if (v1_file) |file| file.close();
-
- if (v1_file) |file| return try readV1(allocator, file);
-
- const legacy_path = try std.fs.path.join(
- allocator,
- &[_][]const u8{ model_path, "checkpoint_legacy.bin" },
- );
-
- defer allocator.free(legacy_path);
-
- const legacy_file = std.fs.cwd().openFile(legacy_path, .{}) catch null;
-
- defer if (legacy_file) |file| file.close();
-
- if (legacy_file) |file| return try readLegacy(allocator, file);
-
- return error.CheckpointFileNotFound;
-}
-
-// https://github.com/karpathy/llama2.c/blob/d9862069e7ef665fe6309e3c17398ded2f121bf5/export.py#L132
-pub fn writeV1(self: Self, allocator: std.mem.Allocator, model_path: []const u8) !void {
+token_embedding_weights: []const Vector,
+attention_norm_weights: []const Vector,
+attention_query_weights: []const Matrix,
+attention_key_weights: []const Matrix,
+attention_value_weights: []const Matrix,
+attention_output_weights: []const Matrix,
+ffn_norm_weights: []const Vector,
+ffn_gate_weights: []const Matrix,
+ffn_down_weights: []const Matrix,
+ffn_up_weights: []const Matrix,
+output_norm_weight: Vector,
+output_weight: Matrix,
+
+pub fn readLeaky(allocator: std.mem.Allocator, model_path: []const u8) !Self {
const path = try std.fs.path.join(
allocator,
&[_][]const u8{ model_path, "checkpoint_v1.bin" },
@@ -67,49 +33,15 @@ pub fn writeV1(self: Self, allocator: std.mem.Allocator, model_path: []const u8)
defer allocator.free(path);
- const file = try std.fs.cwd().createFile(path, .{ .truncate = true });
+ const file = try std.fs.cwd().openFile(path, .{});
defer file.close();
- try file.writer().writeIntLittle(u32, 0x616b3432);
- try file.writer().writeIntLittle(i32, 1);
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.embedding_size)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.ffn_hidden_size)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.n_layers)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.n_attention_heads)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.n_attention_query_groups)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.vocab_size)));
- try file.writer().writeIntLittle(i32, @as(i32, @intCast(self.max_sequence_length)));
- try file.writer().writeIntLittle(u8, @as(u8, @intFromBool(self.shared_output_matrix)));
- try file.writer().writeByteNTimes(0, 256 - try file.getPos());
- try self.weights.attention_norm_vectors.write(file);
- try self.weights.ffn_norm_vectors.write(file);
- try self.weights.output_norm_vector.write(file);
- try self.weights.token_embedding_vectors.write(file);
- try self.weights.attention_query_matrices.write(file);
- try self.weights.attention_key_matrices.write(file);
- try self.weights.attention_value_matrices.write(file);
- try self.weights.attention_output_matrices.write(file);
- try self.weights.ffn_gate_matrices.write(file);
- try self.weights.ffn_down_matrices.write(file);
- try self.weights.ffn_up_matrices.write(file);
-
- if (!self.shared_output_matrix) {
- try self.weights.output_matrix.write(file);
- }
-}
-
-// https://github.com/karpathy/llama2.c/blob/d9862069e7ef665fe6309e3c17398ded2f121bf5/export.py#L132
-fn readV1(allocator: std.mem.Allocator, file: std.fs.File) !Self {
- const magic = try file.reader().readIntLittle(u32);
-
- if (magic != 0x616b3432) {
+ if (try file.reader().readIntLittle(u32) != 0x616b3432) {
return error.InvalidMagic;
}
- const version = try file.reader().readIntLittle(i32);
-
- if (version != 1) {
+ if (try file.reader().readIntLittle(i32) != 1) {
return error.InvalidVersion;
}
@@ -124,111 +56,93 @@ fn readV1(allocator: std.mem.Allocator, file: std.fs.File) !Self {
try file.seekTo(256);
- const attention_norm_vectors = try Tensor(2).init(
+ const attention_norm_weights = try Vector.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, embedding_size },
+ file,
+ n_layers,
+ embedding_size,
);
- errdefer attention_norm_vectors.deinit();
- try attention_norm_vectors.read(file);
-
- const ffn_norm_vectors = try Tensor(2).init(
+ const ffn_norm_weights = try Vector.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, embedding_size },
+ file,
+ n_layers,
+ embedding_size,
);
- errdefer ffn_norm_vectors.deinit();
- try ffn_norm_vectors.read(file);
+ const output_norm_weight = try Vector.readLeaky(allocator, file, embedding_size);
- const output_norm_vector = try Tensor(1).init(
+ const token_embedding_weights = try Vector.readMultipleLeaky(
allocator,
- [_]usize{embedding_size},
+ file,
+ vocab_size,
+ embedding_size,
);
- errdefer output_norm_vector.deinit();
- try output_norm_vector.read(file);
-
- const token_embedding_vectors = try Tensor(2).init(
- allocator,
- [_]usize{ vocab_size, embedding_size },
- );
-
- errdefer token_embedding_vectors.deinit();
- try token_embedding_vectors.read(file);
-
- const attention_query_matrices = try Tensor(3).init(
+ const attention_query_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, embedding_size, embedding_size },
+ file,
+ n_layers,
+ embedding_size,
+ embedding_size,
);
- errdefer attention_query_matrices.deinit();
- try attention_query_matrices.read(file);
-
const attention_head_size: usize = embedding_size / n_attention_heads;
- const attention_key_matrices = try Tensor(3).init(
+ const attention_key_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size },
+ file,
+ n_layers,
+ n_attention_query_groups * attention_head_size,
+ embedding_size,
);
- errdefer attention_key_matrices.deinit();
- try attention_key_matrices.read(file);
-
- const attention_value_matrices = try Tensor(3).init(
+ const attention_value_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size },
+ file,
+ n_layers,
+ n_attention_query_groups * attention_head_size,
+ embedding_size,
);
- errdefer attention_value_matrices.deinit();
- try attention_value_matrices.read(file);
-
- const attention_output_matrices = try Tensor(3).init(
+ const attention_output_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, embedding_size, embedding_size },
+ file,
+ n_layers,
+ embedding_size,
+ embedding_size,
);
- errdefer attention_output_matrices.deinit();
- try attention_output_matrices.read(file);
-
- const ffn_gate_matrices = try Tensor(3).init(
+ const ffn_gate_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, ffn_hidden_size, embedding_size },
+ file,
+ n_layers,
+ ffn_hidden_size,
+ embedding_size,
);
- errdefer ffn_gate_matrices.deinit();
- try ffn_gate_matrices.read(file);
-
- const ffn_down_matrices = try Tensor(3).init(
+ const ffn_down_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, embedding_size, ffn_hidden_size },
+ file,
+ n_layers,
+ embedding_size,
+ ffn_hidden_size,
);
- errdefer ffn_down_matrices.deinit();
- try ffn_down_matrices.read(file);
-
- const ffn_up_matrices = try Tensor(3).init(
+ const ffn_up_weights = try Matrix.readMultipleLeaky(
allocator,
- [_]usize{ n_layers, ffn_hidden_size, embedding_size },
+ file,
+ n_layers,
+ ffn_hidden_size,
+ embedding_size,
);
- errdefer ffn_up_matrices.deinit();
- try ffn_up_matrices.read(file);
-
- const output_matrix = if (shared_output_matrix)
- token_embedding_vectors
+ const output_weight = if (shared_output_matrix)
+ Matrix{ .rows = token_embedding_weights }
else
- try Tensor(2).init(allocator, [_]usize{ vocab_size, embedding_size });
-
- errdefer if (!shared_output_matrix) {
- output_matrix.deinit();
- };
-
- if (!shared_output_matrix) {
- try output_matrix.read(file);
- }
+ try Matrix.readLeaky(allocator, file, vocab_size, embedding_size);
return .{
- .allocator = allocator,
.embedding_size = embedding_size,
.ffn_hidden_size = ffn_hidden_size,
.n_layers = n_layers,
@@ -236,187 +150,18 @@ fn readV1(allocator: std.mem.Allocator, file: std.fs.File) !Self {
.n_attention_query_groups = n_attention_query_groups,
.vocab_size = vocab_size,
.max_sequence_length = max_sequence_length,
- .shared_output_matrix = shared_output_matrix,
- .weights = .{
- .token_embedding_vectors = token_embedding_vectors,
- .attention_norm_vectors = attention_norm_vectors,
- .attention_query_matrices = attention_query_matrices,
- .attention_key_matrices = attention_key_matrices,
- .attention_value_matrices = attention_value_matrices,
- .attention_output_matrices = attention_output_matrices,
- .ffn_norm_vectors = ffn_norm_vectors,
- .ffn_gate_matrices = ffn_gate_matrices,
- .ffn_down_matrices = ffn_down_matrices,
- .ffn_up_matrices = ffn_up_matrices,
- .output_norm_vector = output_norm_vector,
- .output_matrix = output_matrix,
- },
+ .token_embedding_weights = token_embedding_weights,
+ .attention_norm_weights = attention_norm_weights,
+ .attention_query_weights = attention_query_weights,
+ .attention_key_weights = attention_key_weights,
+ .attention_value_weights = attention_value_weights,
+ .attention_output_weights = attention_output_weights,
+ .ffn_norm_weights = ffn_norm_weights,
+ .ffn_gate_weights = ffn_gate_weights,
+ .ffn_down_weights = ffn_down_weights,
+ .ffn_up_weights = ffn_up_weights,
+ .output_norm_weight = output_norm_weight,
+ .output_weight = output_weight,
};
}
-
-// https://github.com/karpathy/llama2.c/blob/d9862069e7ef665fe6309e3c17398ded2f121bf5/export.py#L75
-fn readLegacy(allocator: std.mem.Allocator, file: std.fs.File) !Self {
- const embedding_size: usize = @intCast(try file.reader().readIntLittle(i32));
- const ffn_hidden_size: usize = @intCast(try file.reader().readIntLittle(i32));
- const n_layers: usize = @intCast(try file.reader().readIntLittle(i32));
- const n_attention_heads: usize = @intCast(try file.reader().readIntLittle(i32));
- const n_attention_query_groups: usize = @intCast(try file.reader().readIntLittle(i32));
-
- // https://github.com/karpathy/llama2.c/blob/35deb5e0fa55f0a257040bcf1624ed8386e63dc7/run.c#L153
- const signed_vocab_size = try file.reader().readIntLittle(i32);
- const shared_output_matrix = signed_vocab_size > 0;
-
- const vocab_size: usize = @abs(signed_vocab_size);
- const max_sequence_length: usize = @intCast(try file.reader().readIntLittle(i32));
-
- const token_embedding_vectors = try Tensor(2).init(
- allocator,
- [_]usize{ vocab_size, embedding_size },
- );
-
- errdefer token_embedding_vectors.deinit();
- try token_embedding_vectors.read(file);
-
- const attention_norm_vectors = try Tensor(2).init(
- allocator,
- [_]usize{ n_layers, embedding_size },
- );
-
- errdefer attention_norm_vectors.deinit();
- try attention_norm_vectors.read(file);
-
- const attention_query_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, embedding_size, embedding_size },
- );
-
- errdefer attention_query_matrices.deinit();
- try attention_query_matrices.read(file);
-
- const attention_head_size: usize = embedding_size / n_attention_heads;
-
- const attention_key_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size },
- );
-
- errdefer attention_key_matrices.deinit();
- try attention_key_matrices.read(file);
-
- const attention_value_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, n_attention_query_groups * attention_head_size, embedding_size },
- );
-
- errdefer attention_value_matrices.deinit();
- try attention_value_matrices.read(file);
-
- const attention_output_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, embedding_size, embedding_size },
- );
-
- errdefer attention_output_matrices.deinit();
- try attention_output_matrices.read(file);
-
- const ffn_norm_vectors = try Tensor(2).init(
- allocator,
- [_]usize{ n_layers, embedding_size },
- );
-
- errdefer ffn_norm_vectors.deinit();
- try ffn_norm_vectors.read(file);
-
- const ffn_gate_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, ffn_hidden_size, embedding_size },
- );
-
- errdefer ffn_gate_matrices.deinit();
- try ffn_gate_matrices.read(file);
-
- const ffn_down_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, embedding_size, ffn_hidden_size },
- );
-
- errdefer ffn_down_matrices.deinit();
- try ffn_down_matrices.read(file);
-
- const ffn_up_matrices = try Tensor(3).init(
- allocator,
- [_]usize{ n_layers, ffn_hidden_size, embedding_size },
- );
-
- errdefer ffn_up_matrices.deinit();
- try ffn_up_matrices.read(file);
-
- const output_norm_vector = try Tensor(1).init(
- allocator,
- [_]usize{embedding_size},
- );
-
- errdefer output_norm_vector.deinit();
- try output_norm_vector.read(file);
-
- try file.seekBy(@intCast(max_sequence_length * attention_head_size * @sizeOf(f32)));
-
- const output_matrix = if (shared_output_matrix)
- token_embedding_vectors
- else
- try Tensor(2).init(allocator, [_]usize{ vocab_size, embedding_size });
-
- errdefer if (!shared_output_matrix) {
- output_matrix.deinit();
- };
-
- if (!shared_output_matrix) {
- try output_matrix.read(file);
- }
-
- return .{
- .allocator = allocator,
- .embedding_size = embedding_size,
- .ffn_hidden_size = ffn_hidden_size,
- .n_layers = n_layers,
- .n_attention_heads = n_attention_heads,
- .n_attention_query_groups = n_attention_query_groups,
- .vocab_size = vocab_size,
- .max_sequence_length = max_sequence_length,
- .shared_output_matrix = shared_output_matrix,
-
- .weights = .{
- .token_embedding_vectors = token_embedding_vectors,
- .attention_norm_vectors = attention_norm_vectors,
- .attention_query_matrices = attention_query_matrices,
- .attention_key_matrices = attention_key_matrices,
- .attention_value_matrices = attention_value_matrices,
- .attention_output_matrices = attention_output_matrices,
- .ffn_norm_vectors = ffn_norm_vectors,
- .ffn_gate_matrices = ffn_gate_matrices,
- .ffn_down_matrices = ffn_down_matrices,
- .ffn_up_matrices = ffn_up_matrices,
- .output_norm_vector = output_norm_vector,
- .output_matrix = output_matrix,
- },
- };
-}
-
-pub fn deinit(self: Self) void {
- self.weights.token_embedding_vectors.deinit();
- self.weights.attention_norm_vectors.deinit();
- self.weights.attention_query_matrices.deinit();
- self.weights.attention_key_matrices.deinit();
- self.weights.attention_value_matrices.deinit();
- self.weights.attention_output_matrices.deinit();
- self.weights.ffn_norm_vectors.deinit();
- self.weights.ffn_gate_matrices.deinit();
- self.weights.ffn_down_matrices.deinit();
- self.weights.ffn_up_matrices.deinit();
- self.weights.output_norm_vector.deinit();
-
- if (!self.shared_output_matrix) {
- self.weights.output_matrix.deinit();
- }
-}
diff --git a/src/converter_args.zig b/src/converter_args.zig
deleted file mode 100644
index c086b7c..0000000
--- a/src/converter_args.zig
+++ /dev/null
@@ -1,40 +0,0 @@
-const Self = @This();
-
-const std = @import("std");
-
-arg_iterator: std.process.ArgIterator,
-model_path: []const u8,
-
-pub fn init(allocator: std.mem.Allocator) !Self {
- var arg_iterator = try std.process.argsWithAllocator(allocator);
-
- errdefer arg_iterator.deinit();
-
- _ = arg_iterator.next().?;
-
- const model_path = arg_iterator.next() orelse try help(1);
-
- while (arg_iterator.next()) |arg| {
- try help(if (std.mem.eql(u8, arg, "--help")) 0 else 1);
- }
-
- return .{ .arg_iterator = arg_iterator, .model_path = model_path };
-}
-
-pub fn deinit(self: *Self) void {
- self.arg_iterator.deinit();
-}
-
-fn help(exit_status: u8) !noreturn {
- const console = if (exit_status == 0)
- std.io.getStdOut().writer()
- else
- std.io.getStdErr().writer();
-
- try console.print("Usage: llama2-converter [options]\n\n", .{});
-
- try console.print("Options:\n", .{});
- try console.print(" --help\n", .{});
-
- std.process.exit(exit_status);
-}
diff --git a/src/converter_main.zig b/src/converter_main.zig
deleted file mode 100644
index eeba13f..0000000
--- a/src/converter_main.zig
+++ /dev/null
@@ -1,17 +0,0 @@
-const std = @import("std");
-const Checkpoint = @import("checkpoint.zig");
-const ConverterArgs = @import("converter_args.zig");
-
-pub fn main() !void {
- const allocator = std.heap.page_allocator;
-
- var args = try ConverterArgs.init(allocator);
-
- defer args.deinit();
-
- const checkpoint = try Checkpoint.init(allocator, args.model_path);
-
- defer checkpoint.deinit();
-
- try checkpoint.writeV1(allocator, args.model_path);
-}
diff --git a/src/ffn.zig b/src/ffn.zig
index 936c137..ba943a8 100644
--- a/src/ffn.zig
+++ b/src/ffn.zig
@@ -2,66 +2,40 @@ const Self = @This();
const std = @import("std");
const Checkpoint = @import("checkpoint.zig");
-const Tensor = @import("./tensor.zig").Tensor;
+const Vector = @import("vector.zig");
-allocator: std.mem.Allocator,
checkpoint: Checkpoint,
-input_buffer: Tensor(1),
-gate_buffer: Tensor(1),
-hidden_buffer: Tensor(1),
-output_buffer: Tensor(1),
-
-pub fn init(allocator: std.mem.Allocator, checkpoint: Checkpoint) !Self {
- const input_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.embedding_size});
-
- errdefer input_buffer.deinit();
-
- const gate_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.ffn_hidden_size});
-
- errdefer gate_buffer.deinit();
-
- const hidden_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.ffn_hidden_size});
-
- errdefer hidden_buffer.deinit();
-
- const output_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.embedding_size});
-
- errdefer output_buffer.deinit();
+input: Vector,
+gate: Vector,
+hidden: Vector,
+output: Vector,
+pub fn createLeaky(allocator: std.mem.Allocator, checkpoint: Checkpoint) !Self {
return .{
- .allocator = allocator,
.checkpoint = checkpoint,
- .input_buffer = input_buffer,
- .gate_buffer = gate_buffer,
- .hidden_buffer = hidden_buffer,
- .output_buffer = output_buffer,
+ .input = try Vector.createLeaky(allocator, checkpoint.embedding_size),
+ .gate = try Vector.createLeaky(allocator, checkpoint.ffn_hidden_size),
+ .hidden = try Vector.createLeaky(allocator, checkpoint.ffn_hidden_size),
+ .output = try Vector.createLeaky(allocator, checkpoint.embedding_size),
};
}
-pub fn deinit(self: Self) void {
- self.input_buffer.deinit();
- self.gate_buffer.deinit();
- self.hidden_buffer.deinit();
- self.output_buffer.deinit();
-}
-
// SwiGLU activation function: https://arxiv.org/abs/2002.05202
-pub fn forward(self: Self, layer: usize) void {
+pub fn forward(self: Self, layer: usize) !void {
@setFloatMode(.Optimized);
- const weights = self.checkpoint.weights;
- const gate_matrix = weights.ffn_gate_matrices.slice(layer);
- const up_matrix = weights.ffn_up_matrices.slice(layer);
- const down_matrix = weights.ffn_down_matrices.slice(layer);
+ const gate_weight = self.checkpoint.ffn_gate_weights[layer];
+ const up_weight = self.checkpoint.ffn_up_weights[layer];
+ const down_weight = self.checkpoint.ffn_down_weights[layer];
- gate_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.gate_buffer);
- up_matrix.computeMatrixVectorMultiplication(self.input_buffer, self.hidden_buffer);
+ try gate_weight.multiplyVector(self.input, self.gate);
+ try up_weight.multiplyVector(self.input, self.hidden);
for (0..self.checkpoint.ffn_hidden_size) |index| {
- self.hidden_buffer.values[index] *= swish(self.gate_buffer.values[index]);
+ self.hidden.values[index] *= swish(self.gate.values[index]);
}
- down_matrix.computeMatrixVectorMultiplication(self.hidden_buffer, self.output_buffer);
+ try down_weight.multiplyVector(self.hidden, self.output);
}
// Swish activation function: https://arxiv.org/abs/1710.05941
diff --git a/src/generator.zig b/src/generator.zig
index 1dc62fd..66f30aa 100644
--- a/src/generator.zig
+++ b/src/generator.zig
@@ -7,46 +7,31 @@ const Sampler = @import("sampler.zig");
const Tokenizer = @import("tokenizer.zig");
const Transformer = @import("transformer.zig");
-allocator: std.mem.Allocator,
transformer: Transformer,
tokenizer: Tokenizer,
sampler: Sampler,
prompt_tokens: []usize,
verbose: bool,
-pub fn init(allocator: std.mem.Allocator, args: GeneratorArgs) !Self {
- const transformer = try Transformer.init(allocator, args.model_path, args.sequence_length);
-
- errdefer transformer.deinit();
+pub fn createLeaky(allocator: std.mem.Allocator, args: GeneratorArgs) !Self {
+ const transformer = try Transformer.createLeaky(
+ allocator,
+ args.model_path,
+ args.sequence_length,
+ );
const vocab_size = transformer.checkpoint.vocab_size;
- const tokenizer = try Tokenizer.init(allocator, args.model_path, vocab_size);
-
- errdefer tokenizer.deinit();
-
- const sampler = try Sampler.init(allocator, args, vocab_size);
-
- errdefer sampler.deinit();
-
- const prompt_tokens = try tokenizer.encode(allocator, args.prompt);
+ const tokenizer = try Tokenizer.readLeaky(allocator, args.model_path, vocab_size);
return .{
- .allocator = allocator,
.transformer = transformer,
.tokenizer = tokenizer,
- .sampler = sampler,
- .prompt_tokens = prompt_tokens,
+ .sampler = try Sampler.createLeaky(allocator, args, vocab_size),
+ .prompt_tokens = try tokenizer.encode(allocator, args.prompt),
.verbose = args.verbose,
};
}
-pub fn deinit(self: Self) void {
- self.transformer.deinit();
- self.tokenizer.deinit();
- self.sampler.deinit();
- self.allocator.free(self.prompt_tokens);
-}
-
const bos_token = 1; // beginning of sequence
const eos_token = 2; // end of sequence
@@ -64,7 +49,7 @@ pub fn generate(self: *Self, writer: anytype) !void {
start_time = std.time.milliTimestamp();
}
- self.transformer.forward(token, position);
+ try self.transformer.forward(token, position);
if (start_time > 0) {
total_time += std.time.milliTimestamp() - start_time;
@@ -74,7 +59,7 @@ pub fn generate(self: *Self, writer: anytype) !void {
next_token = self.prompt_tokens[prompt_tokens_index];
prompt_tokens_index += 1;
} else {
- next_token = self.sampler.sample(self.transformer.output_buffer.values);
+ next_token = self.sampler.sample(self.transformer.output.values);
}
if (next_token == bos_token or next_token == eos_token) {
@@ -99,16 +84,13 @@ pub fn generate(self: *Self, writer: anytype) !void {
}
test "generate tiny story" {
- var output = std.ArrayList(u8).init(std.testing.allocator);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer output.deinit();
+ defer arena.deinit();
- var arg_iterator = try std.process.argsWithAllocator(std.testing.allocator);
-
- defer arg_iterator.deinit();
+ var output = std.ArrayList(u8).init(arena.allocator());
const args = GeneratorArgs{
- .arg_iterator = arg_iterator,
.model_path = "models/tinystories_260k",
.temperature = 1,
.top_p = 0.9,
@@ -118,9 +100,7 @@ test "generate tiny story" {
.verbose = false,
};
- var generator = try Self.init(std.testing.allocator, args);
-
- defer generator.deinit();
+ var generator = try Self.createLeaky(arena.allocator(), args);
try generator.generate(output.writer());
diff --git a/src/generator_args.zig b/src/generator_args.zig
index b95fe05..57c93d0 100644
--- a/src/generator_args.zig
+++ b/src/generator_args.zig
@@ -2,7 +2,6 @@ const Self = @This();
const std = @import("std");
-arg_iterator: std.process.ArgIterator,
model_path: []const u8,
temperature: f32,
top_p: f32,
@@ -13,11 +12,9 @@ verbose: bool,
const Option = enum { temperature, top_p, random_seed, sequence_length, prompt };
-pub fn init(allocator: std.mem.Allocator) !Self {
+pub fn createLeaky(allocator: std.mem.Allocator) !Self {
var arg_iterator = try std.process.argsWithAllocator(allocator);
- errdefer arg_iterator.deinit();
-
_ = arg_iterator.next().?;
const model_path = arg_iterator.next() orelse try help(1);
@@ -69,7 +66,6 @@ pub fn init(allocator: std.mem.Allocator) !Self {
}
return .{
- .arg_iterator = arg_iterator,
.model_path = model_path,
.temperature = @max(@min(temperature orelse 1, 1), 0),
.top_p = @max(@min(top_p orelse 0.9, 1), 0),
@@ -80,10 +76,6 @@ pub fn init(allocator: std.mem.Allocator) !Self {
};
}
-pub fn deinit(self: *Self) void {
- self.arg_iterator.deinit();
-}
-
fn help(exit_status: u8) !noreturn {
const console = if (exit_status == 0)
std.io.getStdOut().writer()
diff --git a/src/generator_main.zig b/src/generator_main.zig
index 6b420ad..2bf605d 100644
--- a/src/generator_main.zig
+++ b/src/generator_main.zig
@@ -3,15 +3,13 @@ const Generator = @import("generator.zig");
const GeneratorArgs = @import("generator_args.zig");
pub fn main() !void {
- const allocator = std.heap.page_allocator;
+ var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
- var args = try GeneratorArgs.init(allocator);
+ defer arena.deinit();
- defer args.deinit();
+ const args = try GeneratorArgs.createLeaky(arena.allocator());
- var generator = try Generator.init(allocator, args);
-
- defer generator.deinit();
+ var generator = try Generator.createLeaky(arena.allocator(), args);
try generator.generate(std.io.getStdOut().writer());
}
diff --git a/src/matrix.zig b/src/matrix.zig
new file mode 100644
index 0000000..ad372b5
--- /dev/null
+++ b/src/matrix.zig
@@ -0,0 +1,39 @@
+const Self = @This();
+
+const std = @import("std");
+const Vector = @import("vector.zig");
+
+rows: []const Vector,
+
+pub fn readLeaky(
+ allocator: std.mem.Allocator,
+ file: std.fs.File,
+ m_rows: usize,
+ n_cols: usize,
+) !Self {
+ return .{ .rows = try Vector.readMultipleLeaky(allocator, file, m_rows, n_cols) };
+}
+
+pub fn readMultipleLeaky(
+ allocator: std.mem.Allocator,
+ file: std.fs.File,
+ n_matrices: usize,
+ m_rows: usize,
+ n_cols: usize,
+) ![]Self {
+ const matrices = try allocator.alloc(Self, n_matrices);
+
+ for (matrices) |*matrix| {
+ matrix.* = try readLeaky(allocator, file, m_rows, n_cols);
+ }
+
+ return matrices;
+}
+
+pub fn multiplyVector(self: Self, input: Vector, output: Vector) !void {
+ std.debug.assert(self.rows.len == output.values.len);
+
+ for (output.values, 0..) |*value, index| {
+ value.* = try self.rows[index].computeScalarProduct(input);
+ }
+}
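
`multiplyVector` computes each output element as the dot product of one weight row with the input vector. A hypothetical usage sketch follows; it relies on the `Vector.createLeaky`/`createMultipleLeaky` signatures inferred earlier, and the literal values are made up for illustration.

```zig
// Hypothetical usage of the new Matrix/Vector modules (not part of the diff).
const std = @import("std");
const Matrix = @import("matrix.zig");
const Vector = @import("vector.zig");

test "matrix-vector multiplication sketch" {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);

    defer arena.deinit();

    const allocator = arena.allocator();

    const input = try Vector.createLeaky(allocator, 4);
    const output = try Vector.createLeaky(allocator, 2);
    const rows = try Vector.createMultipleLeaky(allocator, 2, 4);

    @memcpy(input.values, &[_]f32{ 1, 2, 3, 4 });
    @memcpy(rows[0].values, &[_]f32{ 1, 0, 0, 0 });
    @memcpy(rows[1].values, &[_]f32{ 0, 1, 0, 0 });

    const matrix = Matrix{ .rows = rows };

    try matrix.multiplyVector(input, output);

    // Each output value is row · input, so output.values is { 1, 2 }.
    try std.testing.expectApproxEqAbs(@as(f32, 1), output.values[0], 1e-6);
    try std.testing.expectApproxEqAbs(@as(f32, 2), output.values[1], 1e-6);
}
```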
diff --git a/src/quantized_tensor.zig b/src/quantized_tensor.zig
deleted file mode 100644
index b41bd64..0000000
--- a/src/quantized_tensor.zig
+++ /dev/null
@@ -1,117 +0,0 @@
-const std = @import("std");
-
-pub fn QuantizedTensor(comptime n_dims: comptime_int) type {
- comptime if (n_dims < 1) @compileError("n_dims < 1");
-
- return struct {
- const Self = @This();
-
- allocator: ?std.mem.Allocator,
- sub_dims: [n_dims - 1]usize,
- group_size: usize,
- values: []i8,
- scaling_factors: []f32,
-
- pub fn init(allocator: std.mem.Allocator, dims: [n_dims]usize, group_size: usize) !Self {
- const n_values = @reduce(.Mul, @as(@Vector(n_dims, usize), dims));
-
- if (n_values % group_size != 0) {
- return error.InvalidGroupSize;
- }
-
- const n_groups = n_values / group_size;
-
- return .{
- .allocator = allocator,
- .sub_dims = dims[1..].*,
- .group_size = group_size,
- .values = try allocator.alloc(i8, n_values),
- .scaling_factors = try allocator.alloc(f32, n_groups),
- };
- }
-
- pub fn deinit(self: Self) void {
- if (self.allocator) |allocator| {
- allocator.free(self.values);
- allocator.free(self.scaling_factors);
- }
- }
-
- pub fn slice(self: Self, index: usize) !QuantizedTensor(n_dims - 1) {
- comptime if (n_dims < 2) @compileError("n_dims < 2");
-
- const n_sub_values = @reduce(.Mul, @as(@Vector(n_dims - 1, usize), self.sub_dims));
-
- if (n_sub_values % self.group_size != 0) {
- return error.InvalidGroupSize;
- }
-
- const n_sub_groups = n_sub_values / self.group_size;
-
- return .{
- .allocator = null,
- .sub_dims = self.sub_dims[1..].*,
- .group_size = self.group_size,
- .values = self.values[index * n_sub_values ..][0..n_sub_values],
- .scaling_factors = self.scaling_factors[index * n_sub_groups ..][0..n_sub_groups],
- };
- }
-
- pub fn computeMatrixVectorMultiplication(
- self: Self,
- input: anytype,
- output: anytype,
- ) !void {
- for (output.values, 0..) |*value, index| {
- value.* = try (try self.slice(index)).computeScalarProduct(&input);
- }
- }
-
- fn computeScalarProduct(self: Self, other: anytype) !f32 {
- // https://github.com/karpathy/llama2.c/pull/312#issuecomment-1684140683
- if (self.group_size == 32) {
- return _computeScalarProduct(32, self, other);
- }
-
- if (self.group_size == 16) {
- return _computeScalarProduct(16, self, other);
- }
-
- if (self.group_size == 8) {
- return _computeScalarProduct(8, self, other);
- }
-
- if (self.group_size == 4) {
- return _computeScalarProduct(4, self, other);
- }
-
- return error.UnsupportedGroupSize;
- }
- };
-}
-
-fn _computeScalarProduct(
- comptime vector_size: comptime_int,
- input_1: anytype,
- input_2: anytype,
-) f32 {
- @setFloatMode(.Optimized);
-
- std.debug.assert(input_1.values.len == input_2.values.len);
- std.debug.assert(input_1.scaling_factors.len == input_2.scaling_factors.len);
-
- var output_value: f32 = 0;
- var index: usize = 0;
-
- while (index < input_1.values.len) : (index += vector_size) {
- const values: @Vector(vector_size, i32) =
- @as(@Vector(vector_size, i8), input_1.values[index..][0..vector_size].*) *
- @as(@Vector(vector_size, i8), input_2.values[index..][0..vector_size].*);
-
- output_value += @as(f32, @floatFromInt(@reduce(.Add, values))) *
- input_1.scaling_factors[index / vector_size] *
- input_2.scaling_factors[index / vector_size];
- }
-
- return output_value;
-}
diff --git a/src/sampler.zig b/src/sampler.zig
index 6001921..c9e74fa 100644
--- a/src/sampler.zig
+++ b/src/sampler.zig
@@ -4,29 +4,20 @@ const builtin = @import("builtin");
const std = @import("std");
const math = @import("math.zig");
-allocator: std.mem.Allocator,
-probability_index_pairs_buffer: []ProbabilityIndexPair,
+probability_index_pairs: []ProbabilityIndexPair,
+rng_state: u64,
temperature: f32,
top_p: f32,
-rng_state: u64,
-
-pub fn init(allocator: std.mem.Allocator, args: anytype, vocab_size: usize) !Self {
- const probability_index_pairs_buffer =
- try allocator.alloc(ProbabilityIndexPair, vocab_size);
+pub fn createLeaky(allocator: std.mem.Allocator, args: anytype, vocab_size: usize) !Self {
return .{
- .allocator = allocator,
- .probability_index_pairs_buffer = probability_index_pairs_buffer,
+ .probability_index_pairs = try allocator.alloc(ProbabilityIndexPair, vocab_size),
+ .rng_state = args.random_seed,
.temperature = args.temperature,
.top_p = args.top_p,
- .rng_state = args.random_seed,
};
}
-pub fn deinit(self: Self) void {
- self.allocator.free(self.probability_index_pairs_buffer);
-}
-
pub fn sample(self: *Self, probability_distribution: []f32) usize {
if (self.temperature == 0) {
return math.argmax(probability_distribution);
@@ -42,11 +33,7 @@ pub fn sample(self: *Self, probability_distribution: []f32) usize {
return self.sampleMultinomial(probability_distribution);
}
- return self.sampleNucleus(
- probability_distribution,
- self.top_p,
- self.probability_index_pairs_buffer,
- );
+ return self.sampleNucleus(probability_distribution);
}
const tolerance: comptime_float = std.math.sqrt(std.math.floatEps(f32));
@@ -82,31 +69,26 @@ fn sampleMultinomial(self: *Self, probability_distribution: []const f32) usize {
const ProbabilityIndexPair = struct { probability: f32, index: usize };
// Nucleus sampling: https://arxiv.org/abs/1904.09751
-fn sampleNucleus(
- self: *Self,
- probability_distribution: []const f32,
- top_p: f32,
- probability_index_pairs_buffer: []ProbabilityIndexPair,
-) usize {
+fn sampleNucleus(self: *Self, probability_distribution: []const f32) usize {
@setFloatMode(.Optimized);
std.debug.assert(probability_distribution.len > 0);
// https://github.com/karpathy/llama2.c/commit/d421a95b2bfe593b2d9e5c147f3efc8d128afe0e
var probability_threshold: f32 =
- (1 - top_p) / @as(f32, @floatFromInt(probability_distribution.len - 1));
+ (1 - self.top_p) / @as(f32, @floatFromInt(probability_distribution.len - 1));
var n_probability_index_pairs: usize = 0;
for (probability_distribution, 0..) |probability, index| {
- if (probability_threshold < probability) {
- probability_index_pairs_buffer[n_probability_index_pairs].probability = probability;
- probability_index_pairs_buffer[n_probability_index_pairs].index = index;
+ if (probability >= probability_threshold) {
+ self.probability_index_pairs[n_probability_index_pairs].probability = probability;
+ self.probability_index_pairs[n_probability_index_pairs].index = index;
n_probability_index_pairs += 1;
}
}
- var probability_index_pairs = probability_index_pairs_buffer[0..n_probability_index_pairs];
+ var probability_index_pairs = self.probability_index_pairs[0..n_probability_index_pairs];
std.sort.block(ProbabilityIndexPair, probability_index_pairs, {}, lessThan);
@@ -115,7 +97,7 @@ fn sampleNucleus(
for (probability_index_pairs, 0..) |probability_index_pair, index| {
cumulative_probability += probability_index_pair.probability;
- if (cumulative_probability > top_p) {
+ if (cumulative_probability > self.top_p) {
probability_index_pairs = probability_index_pairs[0 .. index + 1];
break;
diff --git a/src/simd.zig b/src/simd.zig
index 37c637f..539e4ff 100644
--- a/src/simd.zig
+++ b/src/simd.zig
@@ -2,96 +2,104 @@ const std = @import("std");
// Pre-normalization using RMSNorm: https://arxiv.org/abs/1910.07467
pub fn computeRMSNorm(
- comptime TValue: type,
- comptime vector_size: comptime_int,
- input_values: []const TValue,
- weight_values: []const TValue,
- output_values: []TValue,
-) void {
+ input_values: []const f32,
+ weight_values: []const f32,
+ output_values: []f32,
+) !void {
@setFloatMode(.Optimized);
- var rms_scaling_factor = computeScalarProduct(TValue, vector_size, input_values, input_values);
+ var scaling_factor = try computeScalarProduct(input_values, input_values);
- rms_scaling_factor /= @floatFromInt(input_values.len);
- rms_scaling_factor += 1e-5;
- rms_scaling_factor = 1 / std.math.sqrt(rms_scaling_factor);
+ scaling_factor /= @floatFromInt(input_values.len);
+ scaling_factor += 1e-5;
+ scaling_factor = 1 / std.math.sqrt(scaling_factor);
- computeVectorMultiplication(
- TValue,
- vector_size,
- rms_scaling_factor,
- input_values,
- weight_values,
- output_values,
- );
+ try computeVectorMultiplication(scaling_factor, input_values, weight_values, output_values);
}
-pub fn computeScalarProduct(
- comptime TValue: type,
- comptime vector_size: comptime_int,
- values_1: []const TValue,
- values_2: []const TValue,
-) f32 {
+pub fn computeScalarProduct(input_values_1: []const f32, input_values_2: []const f32) !f32 {
@setFloatMode(.Optimized);
- std.debug.assert(values_1.len == values_2.len);
- std.debug.assert(values_1.len % vector_size == 0);
+ std.debug.assert(input_values_1.len == input_values_2.len);
+
+ comptime var vector_len = std.atomic.cache_line / @sizeOf(f32);
+
+ inline while (vector_len >= 4) : (vector_len /= 2) {
+ if (input_values_1.len % vector_len == 0) {
+ var output_values: @Vector(vector_len, f32) = @splat(0);
+ var index: usize = 0;
- var output_values: @Vector(vector_size, f32) = @splat(0.0);
- var index: usize = 0;
+ while (index < input_values_1.len) : (index += vector_len) {
+ output_values +=
+ @as(@Vector(vector_len, f32), input_values_1[index..][0..vector_len].*) *
+ @as(@Vector(vector_len, f32), input_values_2[index..][0..vector_len].*);
+ }
- while (index < values_1.len) : (index += vector_size) {
- output_values +=
- @as(@Vector(vector_size, f32), values_1[index..][0..vector_size].*) *
- @as(@Vector(vector_size, f32), values_2[index..][0..vector_size].*);
+ return @reduce(.Add, output_values);
+ }
}
- return @reduce(.Add, output_values);
+ return error.UnsupportedVectorSize;
}
pub fn computeVectorAddition(
- comptime TValue: type,
- comptime vector_size: comptime_int,
- input_values_1: []const TValue,
- input_values_2: []const TValue,
- output_values: []TValue,
-) void {
+ input_values_1: []const f32,
+ input_values_2: []const f32,
+ output_values: []f32,
+) !void {
@setFloatMode(.Optimized);
std.debug.assert(input_values_1.len == input_values_2.len);
- std.debug.assert(input_values_1.len % vector_size == 0);
+ std.debug.assert(input_values_1.len == output_values.len);
+
+ comptime var vector_len = std.atomic.cache_line / @sizeOf(f32);
- var index: usize = 0;
+ inline while (vector_len >= 4) : (vector_len /= 2) {
+ if (input_values_1.len % vector_len == 0) {
+ var index: usize = 0;
- while (index < input_values_1.len) : (index += vector_size) {
- output_values[index..][0..vector_size].* =
- @as(@Vector(vector_size, TValue), input_values_1[index..][0..vector_size].*) +
- @as(@Vector(vector_size, TValue), input_values_2[index..][0..vector_size].*);
+ while (index < input_values_1.len) : (index += vector_len) {
+ output_values[index..][0..vector_len].* =
+ @as(@Vector(vector_len, f32), input_values_1[index..][0..vector_len].*) +
+ @as(@Vector(vector_len, f32), input_values_2[index..][0..vector_len].*);
+ }
+
+ return;
+ }
}
+
+ return error.UnsupportedVectorSize;
}
pub fn computeVectorMultiplication(
- comptime TValue: type,
- comptime vector_size: comptime_int,
scaling_factor: f32,
- input_values_1: []const TValue,
- input_values_2: []const TValue,
- output_values: []TValue,
-) void {
+ input_values_1: []const f32,
+ input_values_2: []const f32,
+ output_values: []f32,
+) !void {
@setFloatMode(.Optimized);
std.debug.assert(input_values_1.len == input_values_2.len);
std.debug.assert(input_values_1.len == output_values.len);
- std.debug.assert(input_values_1.len % vector_size == 0);
- const scaling_factors: @Vector(vector_size, f32) = @splat(scaling_factor);
+ comptime var vector_len = std.atomic.cache_line / @sizeOf(f32);
+
+ inline while (vector_len >= 4) : (vector_len /= 2) {
+ if (input_values_1.len % vector_len == 0) {
+ const scaling_factors: @Vector(vector_len, f32) = @splat(scaling_factor);
- var index: usize = 0;
+ var index: usize = 0;
- while (index < input_values_1.len) : (index += vector_size) {
- output_values[index..][0..vector_size].* =
- scaling_factors *
- @as(@Vector(vector_size, TValue), input_values_1[index..][0..vector_size].*) *
- @as(@Vector(vector_size, TValue), input_values_2[index..][0..vector_size].*);
+ while (index < input_values_1.len) : (index += vector_len) {
+ output_values[index..][0..vector_len].* =
+ scaling_factors *
+ @as(@Vector(vector_len, f32), input_values_1[index..][0..vector_len].*) *
+ @as(@Vector(vector_len, f32), input_values_2[index..][0..vector_len].*);
+ }
+
+ return;
+ }
}
+
+ return error.UnsupportedVectorSize;
}
diff --git a/src/tensor.zig b/src/tensor.zig
deleted file mode 100644
index 2874bec..0000000
--- a/src/tensor.zig
+++ /dev/null
@@ -1,93 +0,0 @@
-const std = @import("std");
-const simd = @import("simd.zig");
-
-pub fn Tensor(comptime n_dims: comptime_int) type {
- comptime if (n_dims < 1) @compileError("n_dims < 1");
-
- return struct {
- const Self = @This();
-
- allocator: ?std.mem.Allocator,
- sub_dims: [n_dims - 1]usize,
- values: []f32,
-
- pub fn init(allocator: std.mem.Allocator, dims: [n_dims]usize) !Self {
- const n_values = @reduce(.Mul, @as(@Vector(n_dims, usize), dims));
-
- return .{
- .allocator = allocator,
- .sub_dims = dims[1..].*,
- .values = try allocator.alloc(f32, n_values),
- };
- }
-
- pub fn deinit(self: Self) void {
- if (self.allocator) |allocator| {
- allocator.free(self.values);
- }
- }
-
- pub fn read(self: Self, file: std.fs.File) !void {
- const values: [*]u8 = @ptrCast(self.values);
-
- try file.reader().readNoEof(values[0 .. self.values.len * @sizeOf(f32)]);
- }
-
- pub fn write(self: Self, file: std.fs.File) !void {
- const values: [*]u8 = @ptrCast(self.values);
-
- try file.writer().writeAll(values[0 .. self.values.len * @sizeOf(f32)]);
- }
-
- pub fn slice(self: Self, index: usize) Tensor(n_dims - 1) {
- comptime if (n_dims < 2) @compileError("n_dims < 2");
-
- const n_sub_values = @reduce(.Mul, @as(@Vector(n_dims - 1, usize), self.sub_dims));
-
- return .{
- .allocator = null,
- .sub_dims = self.sub_dims[1..].*,
- .values = self.values[index * n_sub_values ..][0..n_sub_values],
- };
- }
-
- pub fn computeMatrixVectorMultiplication(self: Self, input: anytype, output: anytype) void {
- for (output.values, 0..) |*value, index| {
- value.* = self.slice(index).computeScalarProduct(input);
- }
- }
-
- pub fn computeRMSNorm(self: Self, weight: anytype, output: anytype) void {
- if (self.values.len % 32 == 0)
- simd.computeRMSNorm(f32, 32, self.values, weight.values, output.values)
- else if (self.values.len % 16 == 0)
- simd.computeRMSNorm(f32, 16, self.values, weight.values, output.values)
- else if (self.values.len % 8 == 0)
- simd.computeRMSNorm(f32, 8, self.values, weight.values, output.values)
- else
- simd.computeRMSNorm(f32, 4, self.values, weight.values, output.values);
- }
-
- pub fn computeScalarProduct(self: Self, other: anytype) f32 {
- return if (self.values.len % 32 == 0)
- simd.computeScalarProduct(f32, 32, self.values, other.values)
- else if (self.values.len % 16 == 0)
- simd.computeScalarProduct(f32, 16, self.values, other.values)
- else if (self.values.len % 8 == 0)
- simd.computeScalarProduct(f32, 8, self.values, other.values)
- else
- simd.computeScalarProduct(f32, 4, self.values, other.values);
- }
-
- pub fn computeVectorAddition(self: Self, other: anytype) void {
- if (self.values.len % 32 == 0)
- simd.computeVectorAddition(f32, 32, self.values, other.values, self.values)
- else if (self.values.len % 16 == 0)
- simd.computeVectorAddition(f32, 16, self.values, other.values, self.values)
- else if (self.values.len % 8 == 0)
- simd.computeVectorAddition(f32, 8, self.values, other.values, self.values)
- else
- simd.computeVectorAddition(f32, 4, self.values, other.values, self.values);
- }
- };
-}
diff --git a/src/tokenizer.zig b/src/tokenizer.zig
index 055161a..735f483 100644
--- a/src/tokenizer.zig
+++ b/src/tokenizer.zig
@@ -2,25 +2,12 @@ const Self = @This();
const std = @import("std");
-allocator: std.mem.Allocator,
max_word_length: usize,
vocab: []const []const u8,
word_scores: []const f32,
sorted_vocab: []const VocabEntry,
-pub fn init(allocator: std.mem.Allocator, model_path: []const u8, vocab_size: usize) !Self {
- var vocab = try allocator.alloc([]u8, vocab_size);
-
- errdefer for (vocab) |word| {
- allocator.free(word);
- };
-
- errdefer allocator.free(vocab);
-
- var word_scores = try allocator.alloc(f32, vocab_size);
-
- errdefer allocator.free(word_scores);
-
+pub fn readLeaky(allocator: std.mem.Allocator, model_path: []const u8, vocab_size: usize) !Self {
const path = try std.fs.path.join(allocator, &[_][]const u8{ model_path, "tokenizer.bin" });
defer allocator.free(path);
@@ -29,41 +16,30 @@ pub fn init(allocator: std.mem.Allocator, model_path: []const u8, vocab_size: us
defer file.close();
- const reader = file.reader();
- const max_word_length = try reader.readIntLittle(u32);
+ const max_word_length = try file.reader().readIntLittle(u32);
+
+ var vocab = try allocator.alloc([]u8, vocab_size);
+ var word_scores = try allocator.alloc(f32, vocab_size);
- for (word_scores, 0..) |*word_score, word_index| {
- word_score.* = @bitCast(try reader.readIntLittle(u32));
+ for (word_scores, 0..) |*word_score, index| {
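+ // Each score is stored as the raw bits of an f32; read a u32 and bit-cast it.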
+ word_score.* = @bitCast(try file.reader().readIntLittle(u32));
- const word_length = try reader.readIntLittle(u32);
+ const word_length = try file.reader().readIntLittle(u32);
const word = try allocator.alloc(u8, word_length);
- try reader.readNoEof(word);
+ try file.reader().readNoEof(word);
- vocab[word_index] = word;
+ vocab[index] = word;
}
- const sorted_vocab = try sortVocab(allocator, vocab);
-
return .{
- .allocator = allocator,
.max_word_length = max_word_length,
.vocab = vocab,
.word_scores = word_scores,
- .sorted_vocab = sorted_vocab,
+ .sorted_vocab = try sortVocab(allocator, vocab),
};
}
-pub fn deinit(self: Self) void {
- for (self.vocab) |word| {
- self.allocator.free(word);
- }
-
- self.allocator.free(self.vocab);
- self.allocator.free(self.word_scores);
- self.allocator.free(self.sorted_vocab);
-}
-
pub fn encode(self: Self, allocator: std.mem.Allocator, text: []const u8) ![]usize {
var double_word_buffer = try allocator.alloc(u8, self.max_word_length * 2);
@@ -100,10 +76,10 @@ fn encodeCodepoints(self: Self, allocator: std.mem.Allocator, text: []const u8)
var text_view = try std.unicode.Utf8View.init(text);
var text_iterator = text_view.iterator();
- var token_index: usize = 0;
+ var index: usize = 0;
- while (text_iterator.nextCodepointSlice()) |codepoints| : (token_index += 1) {
- if (token_index == 0) {
+ while (text_iterator.nextCodepointSlice()) |codepoints| : (index += 1) {
+ if (index == 0) {
// https://github.com/karpathy/llama2.c/blob/7ac65cb2c2b169050747be92011b7bebdd1b4544/run.c#L483
try tokens.append(self.lookupToken(" ") orelse return error.BadVocab);
}
@@ -127,12 +103,12 @@ fn mergeBestWordPair(self: Self, tokens: []usize, double_word_buffer: []u8) bool
}
var best_token: ?usize = null;
- var best_token_index: ?usize = null;
+ var best_index: ?usize = null;
var best_word_score = -std.math.floatMax(f32);
- for (0..tokens.len - 1) |token_index| {
- const word1 = self.vocab[tokens[token_index]];
- const word2 = self.vocab[tokens[token_index + 1]];
+ for (0..tokens.len - 1) |index| {
+ const word1 = self.vocab[tokens[index]];
+ const word2 = self.vocab[tokens[index + 1]];
@memcpy(double_word_buffer[0..word1.len], word1);
@memcpy(double_word_buffer[word1.len .. word1.len + word2.len], word2);
@@ -144,19 +120,19 @@ fn mergeBestWordPair(self: Self, tokens: []usize, double_word_buffer: []u8) bool
if (word_score > best_word_score) {
best_token = token;
- best_token_index = token_index;
+ best_index = index;
best_word_score = word_score;
}
}
- if (best_token_index) |token_index| {
+ if (best_index) |index| {
std.mem.copyForwards(
usize,
- tokens[token_index + 1 .. tokens.len - 1],
- tokens[token_index + 2 ..],
+ tokens[index + 1 .. tokens.len - 1],
+ tokens[index + 2 ..],
);
- tokens[token_index] = best_token.?;
+ tokens[index] = best_token.?;
return true;
}
@@ -217,118 +193,110 @@ const tinystories_260k_path = "models/tinystories_260k";
// https://github.com/karpathy/llama2.c/pull/226
// https://github.com/karpathy/llama2.c/pull/297
test "encode utf-8" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 365, 1691, 1018, 3963, 669, 29871, 31409, 30607, 30437, 30564 };
- const actual = try tokenizer.encode(std.testing.allocator, "Lets try ö & 株式会社");
-
- defer std.testing.allocator.free(actual);
+ const actual = try tokenizer.encode(arena.allocator(), "Lets try ö & 株式会社");
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "encode empty string" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{};
- const actual = try tokenizer.encode(std.testing.allocator, "");
-
- defer std.testing.allocator.free(actual);
+ const actual = try tokenizer.encode(arena.allocator(), "");
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "encode unknown codepoint" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 29871, 243, 149, 145, 154, 243, 150, 147, 144 };
- const actual = try tokenizer.encode(std.testing.allocator, "𒎗𓐍");
-
- defer std.testing.allocator.free(actual);
+ const actual = try tokenizer.encode(arena.allocator(), "𒎗𓐍");
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "encode single chars" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_260k_path, 512);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_260k_path, 512);
const expected = [_]usize{ 261, 430, 429, 418, 411, 431, 428, 415 };
- const actual = try tokenizer.encode(std.testing.allocator, "abcdefgh");
-
- defer std.testing.allocator.free(actual);
+ const actual = try tokenizer.encode(arena.allocator(), "abcdefgh");
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
// https://github.com/facebookresearch/llama/blob/ea9f33d6d3ea8ed7d560d270986407fd6c2e52b7/example_text_completion.py
test "meta encoding example 1" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 306, 4658, 278, 6593, 310, 2834, 338 };
- const actual = try tokenizer.encode(std.testing.allocator, "I believe the meaning of life is");
-
- defer std.testing.allocator.free(actual);
+ const actual = try tokenizer.encode(arena.allocator(), "I believe the meaning of life is");
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "meta encoding example 2" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 3439, 17632, 1925, 29892, 278, 6368, 310, 14215, 537, 5922, 393, 29871 };
const actual = try tokenizer.encode(
- std.testing.allocator,
+ arena.allocator(),
"Simply put, the theory of relativity states that ",
);
- defer std.testing.allocator.free(actual);
-
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "meta encoding example 3" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 319, 11473, 2643, 378, 629, 271, 18099, 278, 3815, 373, 278, 6826, 29901, 13, 13, 4706, 6324, 14332, 29892, 13, 13, 4706, 306, 925, 29871 };
const actual = try tokenizer.encode(
- std.testing.allocator,
+ arena.allocator(),
"A brief message congratulating the team on the launch:\n\n Hi everyone,\n\n I just ",
);
- defer std.testing.allocator.free(actual);
-
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
test "meta encoding example 4" {
- const tokenizer = try Self.init(std.testing.allocator, tinystories_15m_path, 32000);
+ var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
- defer tokenizer.deinit();
+ defer arena.deinit();
+ const tokenizer = try Self.readLeaky(arena.allocator(), tinystories_15m_path, 32000);
const expected = [_]usize{ 4103, 9632, 4223, 304, 5176, 29901, 13, 13, 4706, 7205, 4932, 357, 1149, 301, 449, 276, 316, 2778, 13, 4706, 1236, 407, 837, 524, 1149, 6042, 354, 772, 440, 29878, 1318, 13, 4706, 715, 1878, 330, 3055, 1725, 1149, 330, 3055, 1725, 4639, 28754, 13, 4706, 923, 968, 1149 };
const actual = try tokenizer.encode(
- std.testing.allocator,
+ arena.allocator(),
"Translate English to French:\n\n sea otter => loutre de mer\n peppermint => menthe poivrée\n plush girafe => girafe peluche\n cheese =>",
);
- defer std.testing.allocator.free(actual);
-
try std.testing.expectEqualSlices(usize, expected[0..], actual);
}
diff --git a/src/transformer.zig b/src/transformer.zig
index e9d7efb..2d2d64b 100644
--- a/src/transformer.zig
+++ b/src/transformer.zig
@@ -4,88 +4,54 @@ const std = @import("std");
const Attention = @import("attention.zig");
const Checkpoint = @import("checkpoint.zig");
const FFN = @import("ffn.zig");
-const Tensor = @import("./tensor.zig").Tensor;
+const Vector = @import("vector.zig");
-allocator: std.mem.Allocator,
checkpoint: Checkpoint,
sequence_length: usize,
attention: Attention,
ffn: FFN,
-hidden_buffer: Tensor(1),
-output_buffer: Tensor(1),
+hidden: Vector,
+output: Vector,
-pub fn init(
+pub fn createLeaky(
allocator: std.mem.Allocator,
model_path: []const u8,
custom_sequence_length: usize,
) !Self {
- const checkpoint = try Checkpoint.init(allocator, model_path);
-
- errdefer checkpoint.deinit();
+ const checkpoint = try Checkpoint.readLeaky(allocator, model_path);
const sequence_length = if (custom_sequence_length == 0)
checkpoint.max_sequence_length
else
- custom_sequence_length;
-
- const attention = try Attention.init(allocator, checkpoint, sequence_length);
-
- errdefer attention.deinit();
-
- const ffn = try FFN.init(allocator, checkpoint);
-
- errdefer ffn.deinit();
-
- const hidden_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.embedding_size});
-
- errdefer hidden_buffer.deinit();
-
- const output_buffer = try Tensor(1).init(allocator, [_]usize{checkpoint.vocab_size});
-
- errdefer output_buffer.deinit();
+ @min(custom_sequence_length, checkpoint.max_sequence_length);
return .{
- .allocator = allocator,
.checkpoint = checkpoint,
.sequence_length = sequence_length,
- .attention = attention,
- .ffn = ffn,
- .hidden_buffer = hidden_buffer,
- .output_buffer = output_buffer,
+ .attention = try Attention.createLeaky(allocator, checkpoint, sequence_length),
+ .ffn = try FFN.createLeaky(allocator, checkpoint),
+ .hidden = try Vector.createLeaky(allocator, checkpoint.embedding_size),
+ .output = try Vector.createLeaky(allocator, checkpoint.vocab_size),
};
}
-pub fn deinit(self: Self) void {
- self.checkpoint.deinit();
- self.attention.deinit();
- self.ffn.deinit();
- self.hidden_buffer.deinit();
- self.output_buffer.deinit();
-}
-
-pub fn forward(self: Self, token: usize, position: usize) void {
- const weights = self.checkpoint.weights;
+pub fn forward(self: Self, token: usize, position: usize) !void {
+ const token_embedding_weight = self.checkpoint.token_embedding_weights[token];
- @memcpy(self.hidden_buffer.values, weights.token_embedding_vectors.slice(token).values);
+ @memcpy(self.hidden.values, token_embedding_weight.values);
for (0..self.checkpoint.n_layers) |layer| {
- self.hidden_buffer.computeRMSNorm(
- weights.attention_norm_vectors.slice(layer),
- self.attention.input_buffer,
- );
-
- self.attention.forward(layer, position);
- self.hidden_buffer.computeVectorAddition(self.attention.output_buffer);
-
- self.hidden_buffer.computeRMSNorm(
- weights.ffn_norm_vectors.slice(layer),
- self.ffn.input_buffer,
- );
-
- self.ffn.forward(layer);
- self.hidden_buffer.computeVectorAddition(self.ffn.output_buffer);
+ const attention_norm_weight = self.checkpoint.attention_norm_weights[layer];
+ const ffn_norm_weight = self.checkpoint.ffn_norm_weights[layer];
+
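+ // Pre-norm block: RMSNorm -> attention -> residual add, then RMSNorm -> FFN -> residual add.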
+ try self.hidden.computeRMSNorm(attention_norm_weight, self.attention.input);
+ try self.attention.forward(layer, position);
+ try self.hidden.addVector(self.attention.output);
+ try self.hidden.computeRMSNorm(ffn_norm_weight, self.ffn.input);
+ try self.ffn.forward(layer);
+ try self.hidden.addVector(self.ffn.output);
}
- self.hidden_buffer.computeRMSNorm(weights.output_norm_vector, self.hidden_buffer);
- weights.output_matrix.computeMatrixVectorMultiplication(self.hidden_buffer, self.output_buffer);
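+ // Final RMSNorm of the hidden state, then project it to vocabulary logits.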
+ try self.hidden.computeRMSNorm(self.checkpoint.output_norm_weight, self.hidden);
+ try self.checkpoint.output_weight.multiplyVector(self.hidden, self.output);
}
diff --git a/src/vector.zig b/src/vector.zig
new file mode 100644
index 0000000..3ce2a7d
--- /dev/null
+++ b/src/vector.zig
@@ -0,0 +1,60 @@
+const Self = @This();
+
+const std = @import("std");
+const simd = @import("simd.zig");
+
+values: []f32,
+
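+// "Leaky" allocations are never freed here; callers (e.g. the tests) pair them with an arena allocator.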
+pub fn createLeaky(allocator: std.mem.Allocator, n_values: usize) !Self {
+ return .{ .values = try allocator.alignedAlloc(f32, std.atomic.cache_line, n_values) };
+}
+
+pub fn createMultipleLeaky(
+ allocator: std.mem.Allocator,
+ n_vectors: usize,
+ n_values: usize,
+) ![]Self {
+ const vectors = try allocator.alloc(Self, n_vectors);
+
+ for (vectors) |*vector| {
+ vector.* = try createLeaky(allocator, n_values);
+ }
+
+ return vectors;
+}
+
+pub fn readLeaky(allocator: std.mem.Allocator, file: std.fs.File, n_values: usize) !Self {
+ const vector = try createLeaky(allocator, n_values);
+ const bytes: [*]u8 = @ptrCast(vector.values);
+
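+ // Read the raw f32 bytes directly into the aligned buffer.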
+ try file.reader().readNoEof(bytes[0 .. vector.values.len * @sizeOf(f32)]);
+
+ return vector;
+}
+
+pub fn readMultipleLeaky(
+ allocator: std.mem.Allocator,
+ file: std.fs.File,
+ n_vectors: usize,
+ n_values: usize,
+) ![]Self {
+ const vectors = try allocator.alloc(Self, n_vectors);
+
+ for (vectors) |*vector| {
+ vector.* = try readLeaky(allocator, file, n_values);
+ }
+
+ return vectors;
+}
+
+pub fn addVector(self: Self, other: Self) !void {
+ try simd.computeVectorAddition(self.values, other.values, self.values);
+}
+
+pub fn computeRMSNorm(self: Self, weight: Self, output: Self) !void {
+ try simd.computeRMSNorm(self.values, weight.values, output.values);
+}
+
+pub fn computeScalarProduct(self: Self, other: Self) !f32 {
+ return simd.computeScalarProduct(self.values, other.values);
+}