diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fa13dee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.venv
+results.txt
+*.egg-info
+__pycache__
diff --git a/README.md b/README.md
index de6da80..c252bc2 100644
--- a/README.md
+++ b/README.md
@@ -16,23 +16,6 @@ Python package versions for each framework.
 
 ## Permission setups
 
-### HuggingFace setup
-
-On the [HuggingFace Gemma model page](https://huggingface.co/google/gemma-7b),
-make sure you have accepted the license near the top of the page.
-
-```shell
-pip install --upgrade huggingface_hub
-```
-
-```shell
-huggingface-cli login
-```
-
-It may require you to input a token.
-[More information about tokens.](https://huggingface.co/docs/hub/en/security-tokens)
-
-
 ### Kaggle setup
 
 On the [Kaggle Gemma model page](https://www.kaggle.com/models/keras/gemma),
@@ -82,7 +65,7 @@ run `shell/cleanup.sh`.
   structured as a Python package. I needs `pip install -e .` before using.
   Most of the settings are in `benchmark/__init__.py`.
   You can run a single benchmark by calling each script, for example,
-  `python benchmark/gemma/keras/predict.py results.txt`
+  `python benchmark/gemma/predict.py results.txt`
 * `shell` contains all the shell scripts for benchmarking.
 * `requirements` contains the version requirements for the PyPI packages in
   the dependencies.
diff --git a/benchmark/__init__.py b/benchmark/__init__.py
index f6ef342..15a5b94 100644
--- a/benchmark/__init__.py
+++ b/benchmark/__init__.py
@@ -40,7 +40,7 @@ def append_to_file(file_path, content):
 
 def benchmark(run):
     if len(sys.argv) not in (2, 3):
-        print("Usage: python bert/keras/fit.py [batch_size]")
+        print("Usage: python bert/fit.py [batch_size]")
     else:
         if len(sys.argv) == 3:
             batch_size = int(sys.argv[2])
diff --git a/benchmark/bert/keras/fit.py b/benchmark/bert/fit.py
similarity index 78%
rename from benchmark/bert/keras/fit.py
rename to benchmark/bert/fit.py
index 79756a5..eec9a73 100644
--- a/benchmark/bert/keras/fit.py
+++ b/benchmark/bert/fit.py
@@ -2,14 +2,14 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def run(batch_size=benchmark.BERT_FIT_BATCH_SIZE):
     preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
         "bert_base_en", sequence_length=benchmark.BERT_SEQ_LENGTH
     )
-    dataset = keras_utils.get_train_dataset_for_text_classification(
+    dataset = utils.get_train_dataset_for_text_classification(
         preprocessor=preprocessor,
         batch_size=batch_size,
         seq_len=benchmark.BERT_SEQ_LENGTH,
@@ -22,10 +22,10 @@ def run(batch_size=benchmark.BERT_FIT_BATCH_SIZE):
     model.compile(
         loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
         optimizer=keras.optimizers.AdamW(),
-        jit_compile=keras_utils.use_jit(),
+        jit_compile=utils.use_jit(),
     )
 
-    return keras_utils.fit(model, dataset)
+    return utils.fit(model, dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/bert/keras/__init__.py b/benchmark/bert/keras/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/bert/keras/predict.py b/benchmark/bert/predict.py
similarity index 74%
rename from benchmark/bert/keras/predict.py
rename to benchmark/bert/predict.py
index 13fdbbe..a47d59d 100644
--- a/benchmark/bert/keras/predict.py
+++ b/benchmark/bert/predict.py
@@ -1,14 +1,14 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def run(batch_size=benchmark.BERT_BATCH_SIZE):
     preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
         "bert_base_en", sequence_length=benchmark.BERT_SEQ_LENGTH
     )
-    dataset = keras_utils.get_train_dataset_for_text_classification(
+    dataset = utils.get_train_dataset_for_text_classification(
         preprocessor=preprocessor,
         batch_size=batch_size,
         seq_len=benchmark.BERT_SEQ_LENGTH,
@@ -19,10 +19,10 @@ def run(batch_size=benchmark.BERT_BATCH_SIZE):
         preprocessor=None,
     )
     model.compile(
-        jit_compile=keras_utils.use_jit(),
+        jit_compile=utils.use_jit(),
    )
 
-    return keras_utils.predict(model, dataset)
+    return utils.predict(model, dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/bert/torch/__init__.py b/benchmark/bert/torch/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/bert/torch/fit.py b/benchmark/bert/torch/fit.py
deleted file mode 100644
index 9f78601..0000000
--- a/benchmark/bert/torch/fit.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from transformers import AutoModelForSequenceClassification
-from transformers import AutoTokenizer
-from transformers import Trainer
-from transformers import TrainingArguments
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.BERT_FIT_BATCH_SIZE):
-    dataset = torch_utils.get_train_dataset_for_text_classification(
-        AutoTokenizer.from_pretrained("bert-base-cased"),
-        batch_size=batch_size,
-        seq_len=benchmark.BERT_SEQ_LENGTH,
-    )
-    model = AutoModelForSequenceClassification.from_pretrained(
-        "bert-base-cased",
-        num_labels=2,
-    )
-
-    training_args = TrainingArguments(
-        output_dir="test_trainer",
-        per_device_train_batch_size=batch_size,
-        num_train_epochs=1.0,
-        max_steps=benchmark.NUM_STEPS + 2,
-        torch_compile=torch_utils.use_compile(),
-        torch_compile_mode=(
-            torch_utils.COMPILE_MODE if torch_utils.use_compile() else None
-        ),
-    )
-
-    timing_callback = torch_utils.TimingCallback()
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=dataset,
-        callbacks=[timing_callback],
-    )
-
-    trainer.train()
-
-    # Calculate overall training time
-    overall_training_time = (
-        timing_callback.end_time - timing_callback.start_time
-    )
-    training_per_step = overall_training_time / benchmark.NUM_STEPS * 1000
-
-    return training_per_step
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/bert/torch/predict.py b/benchmark/bert/torch/predict.py
deleted file mode 100644
index ea735a4..0000000
--- a/benchmark/bert/torch/predict.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import time
-
-from transformers import AutoModelForSequenceClassification
-from transformers import AutoTokenizer
-from transformers import Trainer
-from transformers import TrainingArguments
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.BERT_BATCH_SIZE):
-    dataset = torch_utils.get_train_dataset_for_text_classification(
-        AutoTokenizer.from_pretrained("bert-base-cased"),
-        batch_size=batch_size,
-        seq_len=benchmark.BERT_SEQ_LENGTH,
-    )
-    model = AutoModelForSequenceClassification.from_pretrained(
-        "bert-base-cased",
-        num_labels=2,
-    )
-
-    training_args = TrainingArguments(
-        output_dir="test_trainer",
-        per_device_eval_batch_size=batch_size,
-        torch_compile=torch_utils.use_compile(),
-        torch_compile_mode=(
-            torch_utils.COMPILE_MODE if torch_utils.use_compile() else None
-        ),
-    )
-
-    trainer = Trainer(model=model, args=training_args)
-
-    # Predict twice to build the model.
-    trainer.predict(dataset.select(list(range(batch_size))))
-    trainer.predict(dataset.select(list(range(batch_size))))
-
-    start_time = time.time()
-    trainer.predict(
-        dataset.select(list(range((benchmark.NUM_STEPS + 1) * batch_size)))
-    )
-    end_time = time.time()
-    total_time = end_time - start_time
-
-    start_time = time.time()
-    trainer.predict(dataset.select(list(range(batch_size))))
-    end_time = time.time()
-    total_time -= end_time - start_time
-
-    inferencing_per_step = total_time / benchmark.NUM_STEPS * 1000
-    return inferencing_per_step
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/gemma/keras/fit.py b/benchmark/gemma/fit.py
similarity index 83%
rename from benchmark/gemma/keras/fit.py
rename to benchmark/gemma/fit.py
index a262bfb..f6947e0 100644
--- a/benchmark/gemma/keras/fit.py
+++ b/benchmark/gemma/fit.py
@@ -2,7 +2,7 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def get_model():
@@ -22,7 +22,7 @@ def run(batch_size=benchmark.GEMMA_FIT_BATCH_SIZE):
     preprocessor = keras_nlp.models.GemmaCausalLMPreprocessor.from_preset(
         "gemma_7b_en", sequence_length=benchmark.GEMMA_SEQ_LENGTH
     )
-    dataset = keras_utils.get_train_dataset_for_text_gen(
+    dataset = utils.get_train_dataset_for_text_gen(
         preprocessor, batch_size, seq_len=benchmark.GEMMA_SEQ_LENGTH
     )
     model = get_model()
@@ -30,9 +30,9 @@ def run(batch_size=benchmark.GEMMA_FIT_BATCH_SIZE):
     model.compile(
         loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
         optimizer=keras.optimizers.AdamW(),
-        jit_compile=keras_utils.use_jit(),
+        jit_compile=utils.use_jit(),
     )
-    return keras_utils.fit(model, dataset)
+    return utils.fit(model, dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/gemma/keras/__init__.py b/benchmark/gemma/keras/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/gemma/keras/predict.py b/benchmark/gemma/predict.py
similarity index 88%
rename from benchmark/gemma/keras/predict.py
rename to benchmark/gemma/predict.py
index e7392c1..8940394 100644
--- a/benchmark/gemma/keras/predict.py
+++ b/benchmark/gemma/predict.py
@@ -2,7 +2,7 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def run(batch_size=benchmark.GEMMA_BATCH_SIZE):
@@ -12,7 +12,7 @@ def run(batch_size=benchmark.GEMMA_BATCH_SIZE):
     keras.mixed_precision.set_global_policy(benchmark.FLOAT_A100)
     model = keras_nlp.models.GemmaCausalLM.from_preset("gemma_7b_en")
     model.compile(sampler="greedy")
-    return keras_utils.generate(
+    return utils.generate(
         model=model,
         batch_size=batch_size,
         max_length=benchmark.GEMMA_MAX_LENGTH,
diff --git a/benchmark/gemma/torch/__init__.py b/benchmark/gemma/torch/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/gemma/torch/fit.py b/benchmark/gemma/torch/fit.py
deleted file mode 100644
index bdeddb2..0000000
--- a/benchmark/gemma/torch/fit.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from peft import LoraConfig
-from peft import get_peft_model
-from transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-from transformers import Trainer
-from transformers import TrainingArguments
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.GEMMA_FIT_BATCH_SIZE):
-    preset = "google/gemma-7b"
-    tokenizer = AutoTokenizer.from_pretrained(preset)
-    tokenizer.pad_token = tokenizer.eos_token
-    dataset = torch_utils.get_train_dataset_for_text_gen(
-        tokenizer, batch_size, seq_len=benchmark.GEMMA_SEQ_LENGTH
-    )
-    model = AutoModelForCausalLM.from_pretrained(
-        preset, torch_dtype=torch_utils.get_torch_dtype(benchmark.FLOAT_A100)
-    ).cuda()
-    config = LoraConfig(r=4)
-    model = get_peft_model(model, config)
-
-    training_args = TrainingArguments(
-        output_dir="test_trainer",
-        per_device_train_batch_size=batch_size,
-        num_train_epochs=1.0,
-        torch_compile=torch_utils.use_compile(),
-        torch_compile_mode=(
-            torch_utils.COMPILE_MODE if torch_utils.use_compile() else None
-        ),
-        max_steps=benchmark.NUM_STEPS + 2,
-    )
-
-    timing_callback = torch_utils.TimingCallback()
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=dataset,
-        callbacks=[timing_callback],
-    )
-
-    trainer.train()
-
-    # Calculate overall training time
-    overall_training_time = (
-        timing_callback.end_time - timing_callback.start_time
-    )
-    training_per_step = overall_training_time / benchmark.NUM_STEPS * 1000
-
-    return training_per_step
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/gemma/torch/predict.py b/benchmark/gemma/torch/predict.py
deleted file mode 100644
index 3263a69..0000000
--- a/benchmark/gemma/torch/predict.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import torch
-from transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.GEMMA_BATCH_SIZE):
-    preset = "google/gemma-7b"
-    model = AutoModelForCausalLM.from_pretrained(
-        preset, torch_dtype=torch_utils.get_torch_dtype(benchmark.FLOAT_A100)
-    ).cuda()
-    if torch_utils.use_compile():
-        model = torch.compile(model, mode=torch_utils.COMPILE_MODE)
-    tokenizer = AutoTokenizer.from_pretrained(preset)
-    tokenizer.pad_token = tokenizer.eos_token
-
-    return torch_utils.generate(
-        model=model,
-        tokenizer=tokenizer,
-        batch_size=batch_size,
-        max_length=benchmark.GEMMA_MAX_LENGTH,
-    )
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/mistral/keras/fit.py b/benchmark/mistral/fit.py
similarity index 83%
rename from benchmark/mistral/keras/fit.py
rename to benchmark/mistral/fit.py
index 08d0529..07d781c 100644
--- a/benchmark/mistral/keras/fit.py
+++ b/benchmark/mistral/fit.py
@@ -2,7 +2,7 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def get_model():
@@ -23,7 +23,7 @@ def run(batch_size=benchmark.MISTRAL_FIT_BATCH_SIZE):
         "mistral_7b_en",
         sequence_length=benchmark.MISTRAL_SEQ_LENGTH,
     )
-    dataset = keras_utils.get_train_dataset_for_text_gen(
+    dataset = utils.get_train_dataset_for_text_gen(
         preprocessor, batch_size, seq_len=benchmark.MISTRAL_SEQ_LENGTH
     )
     model = get_model()
@@ -31,9 +31,9 @@ def run(batch_size=benchmark.MISTRAL_FIT_BATCH_SIZE):
     model.compile(
         loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
         optimizer=keras.optimizers.AdamW(),
-        jit_compile=keras_utils.use_jit(),
+        jit_compile=utils.use_jit(),
     )
-    return keras_utils.fit(model, dataset)
+    return utils.fit(model, dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/mistral/keras/__init__.py b/benchmark/mistral/keras/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/mistral/keras/predict.py b/benchmark/mistral/predict.py
similarity index 89%
rename from benchmark/mistral/keras/predict.py
rename to benchmark/mistral/predict.py
index 717746c..ea9d51b 100644
--- a/benchmark/mistral/keras/predict.py
+++ b/benchmark/mistral/predict.py
@@ -2,7 +2,7 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def run(batch_size=benchmark.MISTRAL_BATCH_SIZE):
@@ -12,7 +12,7 @@ def run(batch_size=benchmark.MISTRAL_BATCH_SIZE):
     keras.mixed_precision.set_global_policy(benchmark.FLOAT_A100)
     model = keras_nlp.models.MistralCausalLM.from_preset("mistral_7b_en")
     model.compile(sampler="greedy")
-    return keras_utils.generate(
+    return utils.generate(
         model=model,
         batch_size=batch_size,
         max_length=benchmark.MISTRAL_MAX_LENGTH,
diff --git a/benchmark/mistral/torch/__init__.py b/benchmark/mistral/torch/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/mistral/torch/fit.py b/benchmark/mistral/torch/fit.py
deleted file mode 100644
index d8b1904..0000000
--- a/benchmark/mistral/torch/fit.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from peft import LoraConfig
-from peft import get_peft_model
-from transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-from transformers import Trainer
-from transformers import TrainingArguments
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.MISTRAL_FIT_BATCH_SIZE):
-    preset = "mistralai/Mistral-7B-v0.1"
-    tokenizer = AutoTokenizer.from_pretrained(preset)
-    tokenizer.pad_token = tokenizer.eos_token
-    dataset = torch_utils.get_train_dataset_for_text_gen(
-        tokenizer, batch_size, seq_len=benchmark.MISTRAL_SEQ_LENGTH
-    )
-    model = AutoModelForCausalLM.from_pretrained(
-        preset, torch_dtype=torch_utils.get_torch_dtype(benchmark.FLOAT_A100)
-    ).cuda()
-    config = LoraConfig(r=4)
-    model = get_peft_model(model, config)
-
-    training_args = TrainingArguments(
-        output_dir="test_trainer",
-        per_device_train_batch_size=batch_size,
-        num_train_epochs=1.0,
-        torch_compile=torch_utils.use_compile(),
-        torch_compile_mode=(
-            torch_utils.COMPILE_MODE if torch_utils.use_compile() else None
-        ),
-        max_steps=benchmark.NUM_STEPS + 2,
-    )
-
-    timing_callback = torch_utils.TimingCallback()
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=dataset,
-        callbacks=[timing_callback],
-    )
-
-    trainer.train()
-
-    # Calculate overall training time
-    overall_training_time = (
-        timing_callback.end_time - timing_callback.start_time
-    )
-    training_per_step = overall_training_time / benchmark.NUM_STEPS * 1000
-
-    return training_per_step
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/mistral/torch/predict.py b/benchmark/mistral/torch/predict.py
deleted file mode 100644
index 9ccc875..0000000
--- a/benchmark/mistral/torch/predict.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import torch
-from transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.MISTRAL_BATCH_SIZE):
-    preset = "mistralai/Mistral-7B-v0.1"
-    model = AutoModelForCausalLM.from_pretrained(
-        preset, torch_dtype=torch_utils.get_torch_dtype(benchmark.FLOAT_A100)
-    ).cuda()
-    if torch_utils.use_compile():
-        model = torch.compile(model, mode=torch_utils.COMPILE_MODE)
-    tokenizer = AutoTokenizer.from_pretrained(preset)
-    tokenizer.pad_token = tokenizer.eos_token
-
-    return torch_utils.generate(
-        model=model,
-        tokenizer=tokenizer,
-        batch_size=batch_size,
-        max_length=benchmark.MISTRAL_MAX_LENGTH,
-    )
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/sam/keras/fit.py b/benchmark/sam/fit.py
similarity index 84%
rename from benchmark/sam/keras/fit.py
rename to benchmark/sam/fit.py
index 185dd04..70d45ff 100644
--- a/benchmark/sam/keras/fit.py
+++ b/benchmark/sam/fit.py
@@ -4,7 +4,7 @@
 import tensorflow as tf
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def get_train_dataset(batch_size):
@@ -27,9 +27,9 @@ def run(batch_size=benchmark.SAM_FIT_BATCH_SIZE):
     model = keras_cv.models.SegmentAnythingModel.from_preset("sam_huge_sa1b")
     backbone = model.backbone
     backbone.compile(
-        loss="mse", optimizer="adam", jit_compile=keras_utils.use_jit()
+        loss="mse", optimizer="adam", jit_compile=utils.use_jit()
     )
-    return keras_utils.fit(backbone, train_dataset)
+    return utils.fit(backbone, train_dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/sam/keras/__init__.py b/benchmark/sam/keras/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/sam/keras/predict.py b/benchmark/sam/predict.py
similarity index 85%
rename from benchmark/sam/keras/predict.py
rename to benchmark/sam/predict.py
index 1135a38..704ab99 100644
--- a/benchmark/sam/keras/predict.py
+++ b/benchmark/sam/predict.py
@@ -4,7 +4,7 @@
 import tensorflow as tf
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def get_dataset(batch_size):
@@ -29,8 +29,8 @@ def run(batch_size=benchmark.SAM_BATCH_SIZE):
     dataset = get_dataset(batch_size)
     model = keras_cv.models.SegmentAnythingModel.from_preset("sam_huge_sa1b")
     backbone = model.backbone
-    backbone.compile(jit_compile=keras_utils.use_jit())
-    return keras_utils.predict(model, dataset)
+    backbone.compile(jit_compile=utils.use_jit())
+    return utils.predict(model, dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/sam/torch/__init__.py b/benchmark/sam/torch/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/sam/torch/fit.py b/benchmark/sam/torch/fit.py
deleted file mode 100644
index 95c9bf6..0000000
--- a/benchmark/sam/torch/fit.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import time
-
-import segment_anything
-import torch
-
-import benchmark
-from benchmark import torch_utils
-
-HUGE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
-)
-HUGE_BUILD = segment_anything.build_sam_vit_h
-HUGE_LOCAL = "/tmp/sam_h.pth"
-LARGE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth"
-)
-LARGE_BUILD = segment_anything.build_sam_vit_l
-LARGE_LOCAL = "/tmp/sam_l.pth"
-BASE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
-)
-BASE_BUILD = segment_anything.build_sam_vit_b
-BASE_LOCAL = "/tmp/sam_b.pth"
-
-URL = HUGE_URL
-LOCAL = HUGE_LOCAL
-build_sam = HUGE_BUILD
-
-
-def get_dataset(batch_size):
-    input_image = torch.Tensor(batch_size, 3, 1024, 1024).cuda()
-    y_true = torch.Tensor(batch_size, 256, 64, 64).cuda()
-    return input_image, y_true
-
-
-def train(model, input_image, y_true):
-    optimizer = torch.optim.Adam(model.parameters())
-
-    def train_fn(model, input_image, y_true):
-        optimizer.zero_grad()
-        y_pred = model(input_image)
-        loss = torch.nn.MSELoss()(y_pred, y_true)
-        loss.backward()
-        optimizer.step()
-
-    if torch_utils.use_compile():
-        train_fn = torch.compile(train_fn, mode=torch_utils.COMPILE_MODE)
-
-    train_fn(model, input_image, y_true)
-    train_fn(model, input_image, y_true)
-
-    start_time = time.time()
-    for _ in range(benchmark.NUM_STEPS):
-        train_fn(model, input_image, y_true)
-    end_time = time.time()
-
-    return (end_time - start_time) / benchmark.NUM_STEPS * 1000
-
-
-def run(batch_size=benchmark.SAM_FIT_BATCH_SIZE):
-    benchmark.download_file(URL, LOCAL)
-    model = build_sam(checkpoint=LOCAL).cuda()
-    input_image, y_true = get_dataset(batch_size)
-
-    return train(model.image_encoder, input_image, y_true)
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/sam/torch/predict.py b/benchmark/sam/torch/predict.py
deleted file mode 100644
index 85d2c8e..0000000
--- a/benchmark/sam/torch/predict.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import time
-
-import segment_anything
-import torch
-
-import benchmark
-from benchmark import torch_utils
-
-HUGE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
-)
-HUGE_BUILD = segment_anything.build_sam_vit_h
-HUGE_LOCAL = "/tmp/sam_h.pth"
-LARGE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth"
-)
-LARGE_BUILD = segment_anything.build_sam_vit_l
-LARGE_LOCAL = "/tmp/sam_l.pth"
-BASE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
-)
-BASE_BUILD = segment_anything.build_sam_vit_b
-BASE_LOCAL = "/tmp/sam_b.pth"
-
-URL = HUGE_URL
-LOCAL = HUGE_LOCAL
-build_sam = HUGE_BUILD
-
-
-def get_dataset(batch_size):
-    input_image = torch.Tensor(batch_size, 3, 1024, 1024).cuda()
-    input_point = torch.Tensor([[[500, 375], [250, 375]]]).cuda()
-    input_label = torch.Tensor([[1, 2]]).cuda()
-    return input_image, input_point, input_label
-
-
-@torch.no_grad
-def inference(model, input_image, input_point, input_label):
-    features = model.image_encoder(input_image)
-    sparse_embeddings, dense_embeddings = model.prompt_encoder(
-        points=(input_point, input_label), boxes=None, masks=None
-    )
-    return model.mask_decoder(
-        image_embeddings=features,
-        image_pe=model.prompt_encoder.get_dense_pe(),
-        sparse_prompt_embeddings=sparse_embeddings,
-        dense_prompt_embeddings=dense_embeddings,
-        multimask_output=True,
-    )
-
-
-def run(batch_size=benchmark.SAM_BATCH_SIZE):
-    benchmark.download_file(URL, LOCAL)
-    model = build_sam(checkpoint=LOCAL).cuda()
-    input_image, input_point, input_label = get_dataset(batch_size)
-    inference_fn = inference
-    if torch_utils.use_compile():
-        inference_fn = torch.compile(
-            inference_fn, mode=torch_utils.COMPILE_MODE
-        )
-
-    # Inference twice to build the model
-    inference_fn(model, input_image, input_point, input_label)
-    inference_fn(model, input_image, input_point, input_label)
-
-    start_time = time.time()
-    for i in range(benchmark.NUM_STEPS + 1):
-        inference_fn(model, input_image, input_point, input_label)
-    end_time = time.time()
-    total_time = end_time - start_time
-
-    start_time = time.time()
-    inference_fn(model, input_image, input_point, input_label)
-    end_time = time.time()
-    total_time -= end_time - start_time
-
-    inference_time = total_time / benchmark.NUM_STEPS * 1000
-    return inference_time
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/stable_diffusion/keras/fit.py b/benchmark/stable_diffusion/fit.py
similarity index 84%
rename from benchmark/stable_diffusion/keras/fit.py
rename to benchmark/stable_diffusion/fit.py
index 5decc19..110fd23 100644
--- a/benchmark/stable_diffusion/keras/fit.py
+++ b/benchmark/stable_diffusion/fit.py
@@ -4,7 +4,7 @@
 import tensorflow as tf
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def get_train_dataset(batch_size):
@@ -24,12 +24,12 @@ def get_train_dataset(batch_size):
 
 def run(batch_size=benchmark.SD_FIT_BATCH_SIZE):
     train_dataset = get_train_dataset(batch_size=batch_size)
-    model = keras_cv.models.StableDiffusion(jit_compile=keras_utils.use_jit())
+    model = keras_cv.models.StableDiffusion(jit_compile=utils.use_jit())
     backbone = keras.Model(
         model.image_encoder.inputs, model.image_encoder.layers[-3].output
     )
     backbone.compile(loss="mse", optimizer="adam")
-    return keras_utils.fit(backbone, train_dataset)
+    return utils.fit(backbone, train_dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/stable_diffusion/keras/__init__.py b/benchmark/stable_diffusion/keras/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/stable_diffusion/keras/predict.py b/benchmark/stable_diffusion/predict.py
similarity index 88%
rename from benchmark/stable_diffusion/keras/predict.py
rename to benchmark/stable_diffusion/predict.py
index e1db18c..6f82f62 100644
--- a/benchmark/stable_diffusion/keras/predict.py
+++ b/benchmark/stable_diffusion/predict.py
@@ -3,11 +3,11 @@
 import keras_cv
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def run(batch_size=benchmark.SD_BATCH_SIZE):
-    model = keras_cv.models.StableDiffusion(jit_compile=keras_utils.use_jit())
+    model = keras_cv.models.StableDiffusion(jit_compile=utils.use_jit())
     prompts = "a photograph of an astronaut riding a horse"
 
     # Build the model by running.
diff --git a/benchmark/stable_diffusion/torch/__init__.py b/benchmark/stable_diffusion/torch/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/stable_diffusion/torch/fit.py b/benchmark/stable_diffusion/torch/fit.py
deleted file mode 100644
index 7b74d75..0000000
--- a/benchmark/stable_diffusion/torch/fit.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import time
-
-import torch
-from diffusers import StableDiffusionPipeline
-
-import benchmark
-from benchmark import torch_utils
-
-
-def train(model, input_image, y_true):
-    optimizer = torch.optim.Adam(model.parameters())
-
-    def train_fn(model, input_image, y_true):
-        optimizer.zero_grad()
-        y_pred = model(input_image)
-        loss = torch.nn.MSELoss()(y_pred, y_true)
-        loss.backward()
-        optimizer.step()
-
-    if torch_utils.use_compile():
-        train_fn = torch.compile(train_fn, mode=torch_utils.COMPILE_MODE)
-
-    train_fn(model, input_image, y_true)
-    train_fn(model, input_image, y_true)
-
-    start_time = time.time()
-    for _ in range(benchmark.NUM_STEPS):
-        train_fn(model, input_image, y_true)
-    end_time = time.time()
-
-    return (end_time - start_time) / benchmark.NUM_STEPS * 1000
-
-
-def run(batch_size=benchmark.SD_FIT_BATCH_SIZE):
-    model = StableDiffusionPipeline.from_pretrained(
-        "CompVis/stable-diffusion-v1-4"
-    ).to("cuda")
-    return train(
-        model.vae.encoder,
-        torch.rand(batch_size, 3, 512, 512).to("cuda"),
-        torch.rand(batch_size, 8, 64, 64).to("cuda"),
-    )
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/stable_diffusion/torch/predict.py b/benchmark/stable_diffusion/torch/predict.py
deleted file mode 100644
index 0efa295..0000000
--- a/benchmark/stable_diffusion/torch/predict.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import time
-
-import torch
-from diffusers import StableDiffusionPipeline
-
-import benchmark
-from benchmark import torch_utils
-
-
-@torch.no_grad
-def inference(model, batch_size):
-    prompts = ["a photograph of an astronaut riding a horse"] * batch_size
-
-    # Generate once to build the model.
-    model(prompts, height=512, width=512, num_inference_steps=1)
-    model(prompts, height=512, width=512, num_inference_steps=1)
-
-    start_time = time.time()
-    model(
-        prompts,
-        height=512,
-        width=512,
-        num_inference_steps=benchmark.NUM_STEPS + 1,
-    )
-    end_time = time.time()
-    total_time = end_time - start_time
-
-    start_time = time.time()
-    model(prompts, height=512, width=512, num_inference_steps=1)
-    end_time = time.time()
-    total_time -= end_time - start_time
-
-    return total_time / benchmark.NUM_STEPS * 1000
-
-
-def run(batch_size=benchmark.SD_BATCH_SIZE):
-    model = StableDiffusionPipeline.from_pretrained(
-        "CompVis/stable-diffusion-v1-4"
-    ).to("cuda")
-    if torch_utils.use_compile():
-        model = torch.compile(model, mode=torch_utils.COMPILE_MODE)
-    return inference(model, batch_size=batch_size)
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/torch_utils.py b/benchmark/torch_utils.py
deleted file mode 100644
index d2bd72c..0000000
--- a/benchmark/torch_utils.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import os
-import random
-import time
-
-import torch
-from datasets import Dataset
-from transformers import TrainerCallback
-
-import benchmark
-
-TORCH_DTYPES = {
-    "bfloat16": torch.bfloat16,
-    "float16": torch.float16,
-    "float32": torch.float32,
-}
-
-COMPILE_MODE = "reduce-overhead"
-
-
-class TimingCallback(TrainerCallback):
-    def __init__(self):
-        super().__init__()
-        self.start_time = None
-        self.end_time = None
-
-    def on_step_begin(self, args, state, control, **kwargs):
-        # Record start time only once at the beginning of the second step
-        # Steps are [0, 101].
-        if state.global_step == 2 and self.start_time is None:
-            self.start_time = time.time()
-        super().on_step_begin(args, state, control, **kwargs)
-
-    def on_step_end(self, args, state, control, **kwargs):
-        super().on_step_end(args, state, control, **kwargs)
-        # Record end time at the end of the last step
-        # Steps are [0, 101].
-        if state.global_step == benchmark.NUM_STEPS + 1:
-            self.end_time = time.time()
-
-
-def generate(
-    model,
-    tokenizer,
-    batch_size,
-    max_length,
-):
-    inputs = benchmark.get_prompts(batch_size, benchmark.NUM_WORDS)
-    num_input_tokens = benchmark.NUM_WORDS
-
-    def generate_once():
-        tokenized_inputs = tokenizer(
-            inputs,
-            padding=True,
-            return_tensors="pt",
-        ).to("cuda")
-        outputs = model.generate(
-            **tokenized_inputs,
-            max_new_tokens=max_length - num_input_tokens,
-            pad_token_id=tokenizer.eos_token_id
-        )
-        tokenizer.decode(outputs[0])
-
-    # Generate twice to build the model.
-    generate_once()
-    generate_once()
-
-    start_time = time.time()
-    for _ in range(benchmark.NUM_STEPS + 1):
-        generate_once()
-    end_time = time.time()
-    total_time = end_time - start_time
-
-    start_time = time.time()
-    generate_once()
-    end_time = time.time()
-    total_time -= end_time - start_time
-
-    return total_time / benchmark.NUM_STEPS * 1000
-
-
-def get_torch_dtype(dtype):
-    return TORCH_DTYPES[dtype]
-
-
-def _get_text_and_label(num_prompts, num_words):
-    def gen():
-        for prompt in benchmark.get_prompts(
-            num_prompts=num_prompts,
-            num_words=num_words,
-        ):
-            yield {"text": prompt, "label": random.randint(0, 1)}
-
-    return Dataset.from_generator(gen)
-
-
-def get_train_dataset_for_text_classification(tokenizer, batch_size, seq_len):
-    dataset = _get_text_and_label(
-        num_prompts=batch_size * (benchmark.NUM_STEPS + 1),
-        num_words=seq_len,
-    )
-
-    tokenized_datasets = dataset.map(
-        lambda examples: tokenizer(
-            examples["text"],
-            padding="max_length",
-            max_length=seq_len,
-            truncation=True,
-        ),
-        batched=True,
-    )
-
-    return tokenized_datasets
-
-
-def get_train_dataset_for_text_gen(tokenizer, batch_size, seq_len):
-    dataset = _get_text_and_label(
-        num_prompts=batch_size * (benchmark.NUM_STEPS + 1),
-        num_words=seq_len,
-    )
-
-    # Tokenize the dataset
-    def tokenize_batch(batch):
-        batch = tokenizer(
-            batch["text"],
-            padding="max_length",
-            max_length=seq_len,
-            truncation=True,
-        )
-        batch["labels"] = batch["input_ids"].copy()
-        return batch
-
-    tokenized_dataset = dataset.map(tokenize_batch, batched=True)
-    tokenized_dataset.set_format(
-        "torch", columns=["input_ids", "attention_mask", "labels"]
-    )
-
-    return tokenized_dataset
-
-
-def use_compile():
-    return os.environ.get("TORCH_COMPILE", "0") == "1"
diff --git a/benchmark/keras_utils.py b/benchmark/utils.py
similarity index 100%
rename from benchmark/keras_utils.py
rename to benchmark/utils.py
diff --git a/requirements/torch.txt b/requirements/torch.txt
deleted file mode 100644
index 9034d4d..0000000
--- a/requirements/torch.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-torch==2.2.1
-torchvision==0.17.1
-transformers[torch]==4.38.2
-datasets==2.15.0
-diffusers==0.25.0
-peft==0.9.0
-
-# Using a stable commit that hasn't been updated for 8 months
-# because there is no available snapshot or release
-git+https://github.com/facebookresearch/segment-anything.git@6fdee8f2727f4506cfbbe553e23b895e27956588
-
diff --git a/shell/install.sh b/shell/install.sh
index 1d807ef..0fff046 100644
--- a/shell/install.sh
+++ b/shell/install.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 
 venvs=(
-    "torch"
     "tensorflow"
     "keras-tensorflow"
    "keras-jax"
diff --git a/shell/run.sh b/shell/run.sh
index 2d2177e..bf74589 100644
--- a/shell/run.sh
+++ b/shell/run.sh
@@ -2,7 +2,6 @@
 venv_path=~/.venv
 
 venvs=(
-    "torch"
     "tensorflow"
     "keras-tensorflow"
     "keras-jax"
@@ -15,6 +14,7 @@ if [ -e "$output_file" ]; then
 fi
 
 export LD_LIBRARY_PATH=
+export NVIDIA_TF32_OVERRIDE=0
 
 models=(
     "bert"
@@ -27,11 +27,6 @@ models=(
 for venv_name in "${venvs[@]}"; do
     printf "# Benchmarking $venv_name\n\n" | tee -a $output_file
     source $venv_path/$venv_name/bin/activate
-    if [[ $venv_name == torch ]]; then
-        file_name=torch
-    else
-        file_name=keras
-    fi
 
     if [[ $venv_name == tensorflow ]]; then
         export KERAS_HOME=configs/tensorflow
@@ -41,27 +36,12 @@ for venv_name in "${venvs[@]}"; do
         export KERAS_HOME=configs/${venv_name#keras-}
     fi
 
-    printf "compiled\n\n"
-    if [[ $venv_name == torch ]]; then
-        export TORCH_COMPILE="1"
-        for model_name in "${models[@]}"; do
-            printf "$model_name:\n" | tee -a $output_file
-            printf "fit:\n" | tee -a $output_file
-            python benchmark/$model_name/$file_name/fit.py $output_file
-            printf "predict:\n" | tee -a $output_file
-            python benchmark/$model_name/$file_name/predict.py $output_file
-            printf "\n\n" | tee -a $output_file
-        done
-        export TORCH_COMPILE="0"
-        printf "not compiled\n\n"
-    fi
-
     for model_name in "${models[@]}"; do
         printf "$model_name:\n" | tee -a $output_file
         printf "fit:\n" | tee -a $output_file
-        python benchmark/$model_name/$file_name/fit.py $output_file
+        python benchmark/$model_name/fit.py $output_file
         printf "predict:\n" | tee -a $output_file
-        python benchmark/$model_name/$file_name/predict.py $output_file
+        python benchmark/$model_name/predict.py $output_file
         printf "\n\n" | tee -a $output_file
     done