diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fa13dee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.venv
+results.txt
+*.egg-info
+__pycache__
diff --git a/README.md b/README.md
index de6da80..c252bc2 100644
--- a/README.md
+++ b/README.md
@@ -16,23 +16,6 @@ Python package versions for each framework.
 
 ## Permission setups
 
-### HuggingFace setup
-
-On the [HuggingFace Gemma model page](https://huggingface.co/google/gemma-7b),
-make sure you have accepted the license near the top of the page.
-
-```shell
-pip install --upgrade huggingface_hub
-```
-
-```shell
-huggingface-cli login
-```
-
-It may require you to input a token.
-[More information about tokens.](https://huggingface.co/docs/hub/en/security-tokens)
-
-
 ### Kaggle setup
 
 On the [Kaggle Gemma model page](https://www.kaggle.com/models/keras/gemma),
@@ -82,7 +65,7 @@ run `shell/cleanup.sh`.
   structured as a Python package. I needs `pip install -e .` before using.
   Most of the settings are in `benchmark/__init__.py`.
   You can run a single benchmark by calling each script, for example,
-  `python benchmark/gemma/keras/predict.py results.txt`
+  `python benchmark/gemma/predict.py results.txt`
 * `shell` contains all the shell scripts for benchmarking.
 * `requirements` contains the version requirements for the PyPI packages in
   the dependencies.
diff --git a/benchmark/__init__.py b/benchmark/__init__.py
index f6ef342..15a5b94 100644
--- a/benchmark/__init__.py
+++ b/benchmark/__init__.py
@@ -40,7 +40,7 @@ def append_to_file(file_path, content):
 
 def benchmark(run):
     if len(sys.argv) not in (2, 3):
-        print("Usage: python bert/keras/fit.py [batch_size]")
+        print("Usage: python bert/fit.py [batch_size]")
     else:
         if len(sys.argv) == 3:
             batch_size = int(sys.argv[2])
diff --git a/benchmark/bert/keras/fit.py b/benchmark/bert/fit.py
similarity index 78%
rename from benchmark/bert/keras/fit.py
rename to benchmark/bert/fit.py
index 79756a5..eec9a73 100644
--- a/benchmark/bert/keras/fit.py
+++ b/benchmark/bert/fit.py
@@ -2,14 +2,14 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def run(batch_size=benchmark.BERT_FIT_BATCH_SIZE):
     preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
         "bert_base_en", sequence_length=benchmark.BERT_SEQ_LENGTH
     )
-    dataset = keras_utils.get_train_dataset_for_text_classification(
+    dataset = utils.get_train_dataset_for_text_classification(
         preprocessor=preprocessor,
         batch_size=batch_size,
         seq_len=benchmark.BERT_SEQ_LENGTH,
@@ -22,10 +22,10 @@ def run(batch_size=benchmark.BERT_FIT_BATCH_SIZE):
     model.compile(
         loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
         optimizer=keras.optimizers.AdamW(),
-        jit_compile=keras_utils.use_jit(),
+        jit_compile=utils.use_jit(),
     )
 
-    return keras_utils.fit(model, dataset)
+    return utils.fit(model, dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/bert/keras/__init__.py b/benchmark/bert/keras/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/bert/keras/predict.py b/benchmark/bert/predict.py
similarity index 74%
rename from benchmark/bert/keras/predict.py
rename to benchmark/bert/predict.py
index 13fdbbe..a47d59d 100644
--- a/benchmark/bert/keras/predict.py
+++ b/benchmark/bert/predict.py
@@ -1,14 +1,14 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def run(batch_size=benchmark.BERT_BATCH_SIZE):
     preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
         "bert_base_en", sequence_length=benchmark.BERT_SEQ_LENGTH
     )
-    dataset = keras_utils.get_train_dataset_for_text_classification(
+    dataset = utils.get_train_dataset_for_text_classification(
         preprocessor=preprocessor,
         batch_size=batch_size,
         seq_len=benchmark.BERT_SEQ_LENGTH,
@@ -19,10 +19,10 @@ def run(batch_size=benchmark.BERT_BATCH_SIZE):
         preprocessor=None,
     )
     model.compile(
-        jit_compile=keras_utils.use_jit(),
+        jit_compile=utils.use_jit(),
    )
 
-    return keras_utils.predict(model, dataset)
+    return utils.predict(model, dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/bert/torch/__init__.py b/benchmark/bert/torch/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/bert/torch/fit.py b/benchmark/bert/torch/fit.py
deleted file mode 100644
index 9f78601..0000000
--- a/benchmark/bert/torch/fit.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from transformers import AutoModelForSequenceClassification
-from transformers import AutoTokenizer
-from transformers import Trainer
-from transformers import TrainingArguments
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.BERT_FIT_BATCH_SIZE):
-    dataset = torch_utils.get_train_dataset_for_text_classification(
-        AutoTokenizer.from_pretrained("bert-base-cased"),
-        batch_size=batch_size,
-        seq_len=benchmark.BERT_SEQ_LENGTH,
-    )
-    model = AutoModelForSequenceClassification.from_pretrained(
-        "bert-base-cased",
-        num_labels=2,
-    )
-
-    training_args = TrainingArguments(
-        output_dir="test_trainer",
-        per_device_train_batch_size=batch_size,
-        num_train_epochs=1.0,
-        max_steps=benchmark.NUM_STEPS + 2,
-        torch_compile=torch_utils.use_compile(),
-        torch_compile_mode=(
-            torch_utils.COMPILE_MODE if torch_utils.use_compile() else None
-        ),
-    )
-
-    timing_callback = torch_utils.TimingCallback()
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=dataset,
-        callbacks=[timing_callback],
-    )
-
-    trainer.train()
-
-    # Calculate overall training time
-    overall_training_time = (
-        timing_callback.end_time - timing_callback.start_time
-    )
-    training_per_step = overall_training_time / benchmark.NUM_STEPS * 1000
-
-    return training_per_step
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/bert/torch/predict.py b/benchmark/bert/torch/predict.py
deleted file mode 100644
index ea735a4..0000000
--- a/benchmark/bert/torch/predict.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import time
-
-from transformers import AutoModelForSequenceClassification
-from transformers import AutoTokenizer
-from transformers import Trainer
-from transformers import TrainingArguments
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.BERT_BATCH_SIZE):
-    dataset = torch_utils.get_train_dataset_for_text_classification(
-        AutoTokenizer.from_pretrained("bert-base-cased"),
-        batch_size=batch_size,
-        seq_len=benchmark.BERT_SEQ_LENGTH,
-    )
-    model = AutoModelForSequenceClassification.from_pretrained(
-        "bert-base-cased",
-        num_labels=2,
-    )
-
-    training_args = TrainingArguments(
-        output_dir="test_trainer",
-        per_device_eval_batch_size=batch_size,
-        torch_compile=torch_utils.use_compile(),
-        torch_compile_mode=(
-            torch_utils.COMPILE_MODE if torch_utils.use_compile() else None
-        ),
-    )
-
-    trainer = Trainer(model=model, args=training_args)
-
-    # Predict twice to build the model.
-    trainer.predict(dataset.select(list(range(batch_size))))
-    trainer.predict(dataset.select(list(range(batch_size))))
-
-    start_time = time.time()
-    trainer.predict(
-        dataset.select(list(range((benchmark.NUM_STEPS + 1) * batch_size)))
-    )
-    end_time = time.time()
-    total_time = end_time - start_time
-
-    start_time = time.time()
-    trainer.predict(dataset.select(list(range(batch_size))))
-    end_time = time.time()
-    total_time -= end_time - start_time
-
-    inferencing_per_step = total_time / benchmark.NUM_STEPS * 1000
-    return inferencing_per_step
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/gemma/keras/fit.py b/benchmark/gemma/fit.py
similarity index 83%
rename from benchmark/gemma/keras/fit.py
rename to benchmark/gemma/fit.py
index a262bfb..f6947e0 100644
--- a/benchmark/gemma/keras/fit.py
+++ b/benchmark/gemma/fit.py
@@ -2,7 +2,7 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def get_model():
@@ -22,7 +22,7 @@ def run(batch_size=benchmark.GEMMA_FIT_BATCH_SIZE):
     preprocessor = keras_nlp.models.GemmaCausalLMPreprocessor.from_preset(
         "gemma_7b_en", sequence_length=benchmark.GEMMA_SEQ_LENGTH
     )
-    dataset = keras_utils.get_train_dataset_for_text_gen(
+    dataset = utils.get_train_dataset_for_text_gen(
         preprocessor, batch_size, seq_len=benchmark.GEMMA_SEQ_LENGTH
     )
     model = get_model()
@@ -30,9 +30,9 @@ def run(batch_size=benchmark.GEMMA_FIT_BATCH_SIZE):
     model.compile(
         loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
         optimizer=keras.optimizers.AdamW(),
-        jit_compile=keras_utils.use_jit(),
+        jit_compile=utils.use_jit(),
     )
-    return keras_utils.fit(model, dataset)
+    return utils.fit(model, dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/gemma/keras/__init__.py b/benchmark/gemma/keras/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/gemma/keras/predict.py b/benchmark/gemma/predict.py
similarity index 88%
rename from benchmark/gemma/keras/predict.py
rename to benchmark/gemma/predict.py
index e7392c1..8940394 100644
--- a/benchmark/gemma/keras/predict.py
+++ b/benchmark/gemma/predict.py
@@ -2,7 +2,7 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def run(batch_size=benchmark.GEMMA_BATCH_SIZE):
@@ -12,7 +12,7 @@ def run(batch_size=benchmark.GEMMA_BATCH_SIZE):
     keras.mixed_precision.set_global_policy(benchmark.FLOAT_A100)
     model = keras_nlp.models.GemmaCausalLM.from_preset("gemma_7b_en")
     model.compile(sampler="greedy")
-    return keras_utils.generate(
+    return utils.generate(
         model=model,
         batch_size=batch_size,
         max_length=benchmark.GEMMA_MAX_LENGTH,
diff --git a/benchmark/gemma/torch/__init__.py b/benchmark/gemma/torch/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/gemma/torch/fit.py b/benchmark/gemma/torch/fit.py
deleted file mode 100644
index bdeddb2..0000000
--- a/benchmark/gemma/torch/fit.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from peft import LoraConfig
-from peft import get_peft_model
-from transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-from transformers import Trainer
-from transformers import TrainingArguments
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.GEMMA_FIT_BATCH_SIZE):
-    preset = "google/gemma-7b"
-    tokenizer = AutoTokenizer.from_pretrained(preset)
-    tokenizer.pad_token = tokenizer.eos_token
-    dataset = torch_utils.get_train_dataset_for_text_gen(
-        tokenizer, batch_size, seq_len=benchmark.GEMMA_SEQ_LENGTH
-    )
-    model = AutoModelForCausalLM.from_pretrained(
-        preset, torch_dtype=torch_utils.get_torch_dtype(benchmark.FLOAT_A100)
-    ).cuda()
-    config = LoraConfig(r=4)
-    model = get_peft_model(model, config)
-
-    training_args = TrainingArguments(
-        output_dir="test_trainer",
-        per_device_train_batch_size=batch_size,
-        num_train_epochs=1.0,
-        torch_compile=torch_utils.use_compile(),
-        torch_compile_mode=(
-            torch_utils.COMPILE_MODE if torch_utils.use_compile() else None
-        ),
-        max_steps=benchmark.NUM_STEPS + 2,
-    )
-
-    timing_callback = torch_utils.TimingCallback()
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=dataset,
-        callbacks=[timing_callback],
-    )
-
-    trainer.train()
-
-    # Calculate overall training time
-    overall_training_time = (
-        timing_callback.end_time - timing_callback.start_time
-    )
-    training_per_step = overall_training_time / benchmark.NUM_STEPS * 1000
-
-    return training_per_step
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/gemma/torch/predict.py b/benchmark/gemma/torch/predict.py
deleted file mode 100644
index 3263a69..0000000
--- a/benchmark/gemma/torch/predict.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import torch
-from transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.GEMMA_BATCH_SIZE):
-    preset = "google/gemma-7b"
-    model = AutoModelForCausalLM.from_pretrained(
-        preset, torch_dtype=torch_utils.get_torch_dtype(benchmark.FLOAT_A100)
-    ).cuda()
-    if torch_utils.use_compile():
-        model = torch.compile(model, mode=torch_utils.COMPILE_MODE)
-    tokenizer = AutoTokenizer.from_pretrained(preset)
-    tokenizer.pad_token = tokenizer.eos_token
-
-    return torch_utils.generate(
-        model=model,
-        tokenizer=tokenizer,
-        batch_size=batch_size,
-        max_length=benchmark.GEMMA_MAX_LENGTH,
-    )
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/mistral/keras/fit.py b/benchmark/mistral/fit.py
similarity index 83%
rename from benchmark/mistral/keras/fit.py
rename to benchmark/mistral/fit.py
index 08d0529..07d781c 100644
--- a/benchmark/mistral/keras/fit.py
+++ b/benchmark/mistral/fit.py
@@ -2,7 +2,7 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def get_model():
@@ -23,7 +23,7 @@ def run(batch_size=benchmark.MISTRAL_FIT_BATCH_SIZE):
         "mistral_7b_en",
         sequence_length=benchmark.MISTRAL_SEQ_LENGTH,
     )
-    dataset = keras_utils.get_train_dataset_for_text_gen(
+    dataset = utils.get_train_dataset_for_text_gen(
         preprocessor, batch_size, seq_len=benchmark.MISTRAL_SEQ_LENGTH
     )
     model = get_model()
@@ -31,9 +31,9 @@ def run(batch_size=benchmark.MISTRAL_FIT_BATCH_SIZE):
     model.compile(
         loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
         optimizer=keras.optimizers.AdamW(),
-        jit_compile=keras_utils.use_jit(),
+        jit_compile=utils.use_jit(),
     )
-    return keras_utils.fit(model, dataset)
+    return utils.fit(model, dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/mistral/keras/__init__.py b/benchmark/mistral/keras/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/mistral/keras/predict.py b/benchmark/mistral/predict.py
similarity index 89%
rename from benchmark/mistral/keras/predict.py
rename to benchmark/mistral/predict.py
index 717746c..ea9d51b 100644
--- a/benchmark/mistral/keras/predict.py
+++ b/benchmark/mistral/predict.py
@@ -2,7 +2,7 @@
 import keras_nlp
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def run(batch_size=benchmark.MISTRAL_BATCH_SIZE):
@@ -12,7 +12,7 @@ def run(batch_size=benchmark.MISTRAL_BATCH_SIZE):
     keras.mixed_precision.set_global_policy(benchmark.FLOAT_A100)
     model = keras_nlp.models.MistralCausalLM.from_preset("mistral_7b_en")
     model.compile(sampler="greedy")
-    return keras_utils.generate(
+    return utils.generate(
         model=model,
         batch_size=batch_size,
         max_length=benchmark.MISTRAL_MAX_LENGTH,
diff --git a/benchmark/mistral/torch/__init__.py b/benchmark/mistral/torch/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/mistral/torch/fit.py b/benchmark/mistral/torch/fit.py
deleted file mode 100644
index d8b1904..0000000
--- a/benchmark/mistral/torch/fit.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from peft import LoraConfig
-from peft import get_peft_model
-from transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-from transformers import Trainer
-from transformers import TrainingArguments
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.MISTRAL_FIT_BATCH_SIZE):
-    preset = "mistralai/Mistral-7B-v0.1"
-    tokenizer = AutoTokenizer.from_pretrained(preset)
-    tokenizer.pad_token = tokenizer.eos_token
-    dataset = torch_utils.get_train_dataset_for_text_gen(
-        tokenizer, batch_size, seq_len=benchmark.MISTRAL_SEQ_LENGTH
-    )
-    model = AutoModelForCausalLM.from_pretrained(
-        preset, torch_dtype=torch_utils.get_torch_dtype(benchmark.FLOAT_A100)
-    ).cuda()
-    config = LoraConfig(r=4)
-    model = get_peft_model(model, config)
-
-    training_args = TrainingArguments(
-        output_dir="test_trainer",
-        per_device_train_batch_size=batch_size,
-        num_train_epochs=1.0,
-        torch_compile=torch_utils.use_compile(),
-        torch_compile_mode=(
-            torch_utils.COMPILE_MODE if torch_utils.use_compile() else None
-        ),
-        max_steps=benchmark.NUM_STEPS + 2,
-    )
-
-    timing_callback = torch_utils.TimingCallback()
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=dataset,
-        callbacks=[timing_callback],
-    )
-
-    trainer.train()
-
-    # Calculate overall training time
-    overall_training_time = (
-        timing_callback.end_time - timing_callback.start_time
-    )
-    training_per_step = overall_training_time / benchmark.NUM_STEPS * 1000
-
-    return training_per_step
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/mistral/torch/predict.py b/benchmark/mistral/torch/predict.py
deleted file mode 100644
index 9ccc875..0000000
--- a/benchmark/mistral/torch/predict.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import torch
-from transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-
-import benchmark
-from benchmark import torch_utils
-
-
-def run(batch_size=benchmark.MISTRAL_BATCH_SIZE):
-    preset = "mistralai/Mistral-7B-v0.1"
-    model = AutoModelForCausalLM.from_pretrained(
-        preset, torch_dtype=torch_utils.get_torch_dtype(benchmark.FLOAT_A100)
-    ).cuda()
-    if torch_utils.use_compile():
-        model = torch.compile(model, mode=torch_utils.COMPILE_MODE)
-    tokenizer = AutoTokenizer.from_pretrained(preset)
-    tokenizer.pad_token = tokenizer.eos_token
-
-    return torch_utils.generate(
-        model=model,
-        tokenizer=tokenizer,
-        batch_size=batch_size,
-        max_length=benchmark.MISTRAL_MAX_LENGTH,
-    )
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/sam/keras/fit.py b/benchmark/sam/fit.py
similarity index 84%
rename from benchmark/sam/keras/fit.py
rename to benchmark/sam/fit.py
index 185dd04..70d45ff 100644
--- a/benchmark/sam/keras/fit.py
+++ b/benchmark/sam/fit.py
@@ -4,7 +4,7 @@
 import tensorflow as tf
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def get_train_dataset(batch_size):
@@ -27,9 +27,9 @@ def run(batch_size=benchmark.SAM_FIT_BATCH_SIZE):
     model = keras_cv.models.SegmentAnythingModel.from_preset("sam_huge_sa1b")
     backbone = model.backbone
     backbone.compile(
-        loss="mse", optimizer="adam", jit_compile=keras_utils.use_jit()
+        loss="mse", optimizer="adam", jit_compile=utils.use_jit()
     )
-    return keras_utils.fit(backbone, train_dataset)
+    return utils.fit(backbone, train_dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/sam/keras/__init__.py b/benchmark/sam/keras/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/sam/keras/predict.py b/benchmark/sam/predict.py
similarity index 85%
rename from benchmark/sam/keras/predict.py
rename to benchmark/sam/predict.py
index 1135a38..704ab99 100644
--- a/benchmark/sam/keras/predict.py
+++ b/benchmark/sam/predict.py
@@ -4,7 +4,7 @@
 import tensorflow as tf
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def get_dataset(batch_size):
@@ -29,8 +29,8 @@ def run(batch_size=benchmark.SAM_BATCH_SIZE):
     dataset = get_dataset(batch_size)
     model = keras_cv.models.SegmentAnythingModel.from_preset("sam_huge_sa1b")
     backbone = model.backbone
-    backbone.compile(jit_compile=keras_utils.use_jit())
-    return keras_utils.predict(model, dataset)
+    backbone.compile(jit_compile=utils.use_jit())
+    return utils.predict(model, dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/sam/torch/__init__.py b/benchmark/sam/torch/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/sam/torch/fit.py b/benchmark/sam/torch/fit.py
deleted file mode 100644
index 95c9bf6..0000000
--- a/benchmark/sam/torch/fit.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import time
-
-import segment_anything
-import torch
-
-import benchmark
-from benchmark import torch_utils
-
-HUGE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
-)
-HUGE_BUILD = segment_anything.build_sam_vit_h
-HUGE_LOCAL = "/tmp/sam_h.pth"
-LARGE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth"
-)
-LARGE_BUILD = segment_anything.build_sam_vit_l
-LARGE_LOCAL = "/tmp/sam_l.pth"
-BASE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
-)
-BASE_BUILD = segment_anything.build_sam_vit_b
-BASE_LOCAL = "/tmp/sam_b.pth"
-
-URL = HUGE_URL
-LOCAL = HUGE_LOCAL
-build_sam = HUGE_BUILD
-
-
-def get_dataset(batch_size):
-    input_image = torch.Tensor(batch_size, 3, 1024, 1024).cuda()
-    y_true = torch.Tensor(batch_size, 256, 64, 64).cuda()
-    return input_image, y_true
-
-
-def train(model, input_image, y_true):
-    optimizer = torch.optim.Adam(model.parameters())
-
-    def train_fn(model, input_image, y_true):
-        optimizer.zero_grad()
-        y_pred = model(input_image)
-        loss = torch.nn.MSELoss()(y_pred, y_true)
-        loss.backward()
-        optimizer.step()
-
-    if torch_utils.use_compile():
-        train_fn = torch.compile(train_fn, mode=torch_utils.COMPILE_MODE)
-
-    train_fn(model, input_image, y_true)
-    train_fn(model, input_image, y_true)
-
-    start_time = time.time()
-    for _ in range(benchmark.NUM_STEPS):
-        train_fn(model, input_image, y_true)
-    end_time = time.time()
-
-    return (end_time - start_time) / benchmark.NUM_STEPS * 1000
-
-
-def run(batch_size=benchmark.SAM_FIT_BATCH_SIZE):
-    benchmark.download_file(URL, LOCAL)
-    model = build_sam(checkpoint=LOCAL).cuda()
-    input_image, y_true = get_dataset(batch_size)
-
-    return train(model.image_encoder, input_image, y_true)
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/sam/torch/predict.py b/benchmark/sam/torch/predict.py
deleted file mode 100644
index 85d2c8e..0000000
--- a/benchmark/sam/torch/predict.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import time
-
-import segment_anything
-import torch
-
-import benchmark
-from benchmark import torch_utils
-
-HUGE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
-)
-HUGE_BUILD = segment_anything.build_sam_vit_h
-HUGE_LOCAL = "/tmp/sam_h.pth"
-LARGE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth"
-)
-LARGE_BUILD = segment_anything.build_sam_vit_l
-LARGE_LOCAL = "/tmp/sam_l.pth"
-BASE_URL = (
-    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
-)
-BASE_BUILD = segment_anything.build_sam_vit_b
-BASE_LOCAL = "/tmp/sam_b.pth"
-
-URL = HUGE_URL
-LOCAL = HUGE_LOCAL
-build_sam = HUGE_BUILD
-
-
-def get_dataset(batch_size):
-    input_image = torch.Tensor(batch_size, 3, 1024, 1024).cuda()
-    input_point = torch.Tensor([[[500, 375], [250, 375]]]).cuda()
-    input_label = torch.Tensor([[1, 2]]).cuda()
-    return input_image, input_point, input_label
-
-
-@torch.no_grad
-def inference(model, input_image, input_point, input_label):
-    features = model.image_encoder(input_image)
-    sparse_embeddings, dense_embeddings = model.prompt_encoder(
-        points=(input_point, input_label), boxes=None, masks=None
-    )
-    return model.mask_decoder(
-        image_embeddings=features,
-        image_pe=model.prompt_encoder.get_dense_pe(),
-        sparse_prompt_embeddings=sparse_embeddings,
-        dense_prompt_embeddings=dense_embeddings,
-        multimask_output=True,
-    )
-
-
-def run(batch_size=benchmark.SAM_BATCH_SIZE):
-    benchmark.download_file(URL, LOCAL)
-    model = build_sam(checkpoint=LOCAL).cuda()
-    input_image, input_point, input_label = get_dataset(batch_size)
-    inference_fn = inference
-    if torch_utils.use_compile():
-        inference_fn = torch.compile(
-            inference_fn, mode=torch_utils.COMPILE_MODE
-        )
-
-    # Inference twice to build the model
-    inference_fn(model, input_image, input_point, input_label)
-    inference_fn(model, input_image, input_point, input_label)
-
-    start_time = time.time()
-    for i in range(benchmark.NUM_STEPS + 1):
-        inference_fn(model, input_image, input_point, input_label)
-    end_time = time.time()
-    total_time = end_time - start_time
-
-    start_time = time.time()
-    inference_fn(model, input_image, input_point, input_label)
-    end_time = time.time()
-    total_time -= end_time - start_time
-
-    inference_time = total_time / benchmark.NUM_STEPS * 1000
-    return inference_time
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/stable_diffusion/keras/fit.py b/benchmark/stable_diffusion/fit.py
similarity index 84%
rename from benchmark/stable_diffusion/keras/fit.py
rename to benchmark/stable_diffusion/fit.py
index 5decc19..110fd23 100644
--- a/benchmark/stable_diffusion/keras/fit.py
+++ b/benchmark/stable_diffusion/fit.py
@@ -4,7 +4,7 @@
 import tensorflow as tf
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def get_train_dataset(batch_size):
@@ -24,12 +24,12 @@ def get_train_dataset(batch_size):
 
 def run(batch_size=benchmark.SD_FIT_BATCH_SIZE):
     train_dataset = get_train_dataset(batch_size=batch_size)
-    model = keras_cv.models.StableDiffusion(jit_compile=keras_utils.use_jit())
+    model = keras_cv.models.StableDiffusion(jit_compile=utils.use_jit())
     backbone = keras.Model(
         model.image_encoder.inputs, model.image_encoder.layers[-3].output
     )
     backbone.compile(loss="mse", optimizer="adam")
-    return keras_utils.fit(backbone, train_dataset)
+    return utils.fit(backbone, train_dataset)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/stable_diffusion/keras/__init__.py b/benchmark/stable_diffusion/keras/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/stable_diffusion/keras/predict.py b/benchmark/stable_diffusion/predict.py
similarity index 88%
rename from benchmark/stable_diffusion/keras/predict.py
rename to benchmark/stable_diffusion/predict.py
index e1db18c..6f82f62 100644
--- a/benchmark/stable_diffusion/keras/predict.py
+++ b/benchmark/stable_diffusion/predict.py
@@ -3,11 +3,11 @@
 import keras_cv
 
 import benchmark
-from benchmark import keras_utils
+from benchmark import utils
 
 
 def run(batch_size=benchmark.SD_BATCH_SIZE):
-    model = keras_cv.models.StableDiffusion(jit_compile=keras_utils.use_jit())
+    model = keras_cv.models.StableDiffusion(jit_compile=utils.use_jit())
     prompts = "a photograph of an astronaut riding a horse"
 
     # Build the model by running.
diff --git a/benchmark/stable_diffusion/torch/__init__.py b/benchmark/stable_diffusion/torch/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/stable_diffusion/torch/fit.py b/benchmark/stable_diffusion/torch/fit.py
deleted file mode 100644
index 7b74d75..0000000
--- a/benchmark/stable_diffusion/torch/fit.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import time
-
-import torch
-from diffusers import StableDiffusionPipeline
-
-import benchmark
-from benchmark import torch_utils
-
-
-def train(model, input_image, y_true):
-    optimizer = torch.optim.Adam(model.parameters())
-
-    def train_fn(model, input_image, y_true):
-        optimizer.zero_grad()
-        y_pred = model(input_image)
-        loss = torch.nn.MSELoss()(y_pred, y_true)
-        loss.backward()
-        optimizer.step()
-
-    if torch_utils.use_compile():
-        train_fn = torch.compile(train_fn, mode=torch_utils.COMPILE_MODE)
-
-    train_fn(model, input_image, y_true)
-    train_fn(model, input_image, y_true)
-
-    start_time = time.time()
-    for _ in range(benchmark.NUM_STEPS):
-        train_fn(model, input_image, y_true)
-    end_time = time.time()
-
-    return (end_time - start_time) / benchmark.NUM_STEPS * 1000
-
-
-def run(batch_size=benchmark.SD_FIT_BATCH_SIZE):
-    model = StableDiffusionPipeline.from_pretrained(
-        "CompVis/stable-diffusion-v1-4"
-    ).to("cuda")
-    return train(
-        model.vae.encoder,
-        torch.rand(batch_size, 3, 512, 512).to("cuda"),
-        torch.rand(batch_size, 8, 64, 64).to("cuda"),
-    )
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/stable_diffusion/torch/predict.py b/benchmark/stable_diffusion/torch/predict.py
deleted file mode 100644
index 0efa295..0000000
--- a/benchmark/stable_diffusion/torch/predict.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import time
-
-import torch
-from diffusers import StableDiffusionPipeline
-
-import benchmark
-from benchmark import torch_utils
-
-
-@torch.no_grad
-def inference(model, batch_size):
-    prompts = ["a photograph of an astronaut riding a horse"] * batch_size
-
-    # Generate once to build the model.
-    model(prompts, height=512, width=512, num_inference_steps=1)
-    model(prompts, height=512, width=512, num_inference_steps=1)
-
-    start_time = time.time()
-    model(
-        prompts,
-        height=512,
-        width=512,
-        num_inference_steps=benchmark.NUM_STEPS + 1,
-    )
-    end_time = time.time()
-    total_time = end_time - start_time
-
-    start_time = time.time()
-    model(prompts, height=512, width=512, num_inference_steps=1)
-    end_time = time.time()
-    total_time -= end_time - start_time
-
-    return total_time / benchmark.NUM_STEPS * 1000
-
-
-def run(batch_size=benchmark.SD_BATCH_SIZE):
-    model = StableDiffusionPipeline.from_pretrained(
-        "CompVis/stable-diffusion-v1-4"
-    ).to("cuda")
-    if torch_utils.use_compile():
-        model = torch.compile(model, mode=torch_utils.COMPILE_MODE)
-    return inference(model, batch_size=batch_size)
-
-
-if __name__ == "__main__":
-    benchmark.benchmark(run)
diff --git a/benchmark/torch_utils.py b/benchmark/torch_utils.py
deleted file mode 100644
index d2bd72c..0000000
--- a/benchmark/torch_utils.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import os
-import random
-import time
-
-import torch
-from datasets import Dataset
-from transformers import TrainerCallback
-
-import benchmark
-
-TORCH_DTYPES = {
-    "bfloat16": torch.bfloat16,
-    "float16": torch.float16,
-    "float32": torch.float32,
-}
-
-COMPILE_MODE = "reduce-overhead"
-
-
-class TimingCallback(TrainerCallback):
-    def __init__(self):
-        super().__init__()
-        self.start_time = None
-        self.end_time = None
-
-    def on_step_begin(self, args, state, control, **kwargs):
-        # Record start time only once at the beginning of the second step
-        # Steps are [0, 101].
-        if state.global_step == 2 and self.start_time is None:
-            self.start_time = time.time()
-        super().on_step_begin(args, state, control, **kwargs)
-
-    def on_step_end(self, args, state, control, **kwargs):
-        super().on_step_end(args, state, control, **kwargs)
-        # Record end time at the end of the last step
-        # Steps are [0, 101].
-        if state.global_step == benchmark.NUM_STEPS + 1:
-            self.end_time = time.time()
-
-
-def generate(
-    model,
-    tokenizer,
-    batch_size,
-    max_length,
-):
-    inputs = benchmark.get_prompts(batch_size, benchmark.NUM_WORDS)
-    num_input_tokens = benchmark.NUM_WORDS
-
-    def generate_once():
-        tokenized_inputs = tokenizer(
-            inputs,
-            padding=True,
-            return_tensors="pt",
-        ).to("cuda")
-        outputs = model.generate(
-            **tokenized_inputs,
-            max_new_tokens=max_length - num_input_tokens,
-            pad_token_id=tokenizer.eos_token_id
-        )
-        tokenizer.decode(outputs[0])
-
-    # Generate twice to build the model.
-    generate_once()
-    generate_once()
-
-    start_time = time.time()
-    for _ in range(benchmark.NUM_STEPS + 1):
-        generate_once()
-    end_time = time.time()
-    total_time = end_time - start_time
-
-    start_time = time.time()
-    generate_once()
-    end_time = time.time()
-    total_time -= end_time - start_time
-
-    return total_time / benchmark.NUM_STEPS * 1000
-
-
-def get_torch_dtype(dtype):
-    return TORCH_DTYPES[dtype]
-
-
-def _get_text_and_label(num_prompts, num_words):
-    def gen():
-        for prompt in benchmark.get_prompts(
-            num_prompts=num_prompts,
-            num_words=num_words,
-        ):
-            yield {"text": prompt, "label": random.randint(0, 1)}
-
-    return Dataset.from_generator(gen)
-
-
-def get_train_dataset_for_text_classification(tokenizer, batch_size, seq_len):
-    dataset = _get_text_and_label(
-        num_prompts=batch_size * (benchmark.NUM_STEPS + 1),
-        num_words=seq_len,
-    )
-
-    tokenized_datasets = dataset.map(
-        lambda examples: tokenizer(
-            examples["text"],
-            padding="max_length",
-            max_length=seq_len,
-            truncation=True,
-        ),
-        batched=True,
-    )
-
-    return tokenized_datasets
-
-
-def get_train_dataset_for_text_gen(tokenizer, batch_size, seq_len):
-    dataset = _get_text_and_label(
-        num_prompts=batch_size * (benchmark.NUM_STEPS + 1),
-        num_words=seq_len,
-    )
-
-    # Tokenize the dataset
-    def tokenize_batch(batch):
-        batch = tokenizer(
-            batch["text"],
-            padding="max_length",
-            max_length=seq_len,
-            truncation=True,
-        )
-        batch["labels"] = batch["input_ids"].copy()
-        return batch
-
-    tokenized_dataset = dataset.map(tokenize_batch, batched=True)
-    tokenized_dataset.set_format(
-        "torch", columns=["input_ids", "attention_mask", "labels"]
-    )
-
-    return tokenized_dataset
-
-
-def use_compile():
-    return os.environ.get("TORCH_COMPILE", "0") == "1"
diff --git a/benchmark/keras_utils.py b/benchmark/utils.py
similarity index 100%
rename from benchmark/keras_utils.py
rename to benchmark/utils.py
diff --git a/requirements/torch.txt b/requirements/torch.txt
deleted file mode 100644
index 9034d4d..0000000
--- a/requirements/torch.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-torch==2.2.1
-torchvision==0.17.1
-transformers[torch]==4.38.2
-datasets==2.15.0
-diffusers==0.25.0
-peft==0.9.0
-
-# Using a stable commit that hasn't been updated for 8 months
-# because there is no available snapshot or release
-git+https://github.com/facebookresearch/segment-anything.git@6fdee8f2727f4506cfbbe553e23b895e27956588
-
diff --git a/shell/install.sh b/shell/install.sh
index 1d807ef..0fff046 100644
--- a/shell/install.sh
+++ b/shell/install.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 
 venvs=(
-    "torch"
     "tensorflow"
     "keras-tensorflow"
    "keras-jax"
diff --git a/shell/run.sh b/shell/run.sh
index 2d2177e..bf74589 100644
--- a/shell/run.sh
+++ b/shell/run.sh
@@ -2,7 +2,6 @@
 venv_path=~/.venv
 
 venvs=(
-    "torch"
     "tensorflow"
     "keras-tensorflow"
     "keras-jax"
@@ -15,6 +14,7 @@ if [ -e "$output_file" ]; then
 fi
 
 export LD_LIBRARY_PATH=
+export NVIDIA_TF32_OVERRIDE=0
 
 models=(
     "bert"
@@ -27,11 +27,6 @@ models=(
 for venv_name in "${venvs[@]}"; do
     printf "# Benchmarking $venv_name\n\n" | tee -a $output_file
     source $venv_path/$venv_name/bin/activate
-    if [[ $venv_name == torch ]]; then
-        file_name=torch
-    else
-        file_name=keras
-    fi
 
     if [[ $venv_name == tensorflow ]]; then
         export KERAS_HOME=configs/tensorflow
@@ -41,27 +36,12 @@ for venv_name in "${venvs[@]}"; do
         export KERAS_HOME=configs/${venv_name#keras-}
     fi
 
-    printf "compiled\n\n"
-    if [[ $venv_name == torch ]]; then
-        export TORCH_COMPILE="1"
-        for model_name in "${models[@]}"; do
-            printf "$model_name:\n" | tee -a $output_file
-            printf "fit:\n" | tee -a $output_file
-            python benchmark/$model_name/$file_name/fit.py $output_file
-            printf "predict:\n" | tee -a $output_file
-            python benchmark/$model_name/$file_name/predict.py $output_file
-            printf "\n\n" | tee -a $output_file
-        done
-        export TORCH_COMPILE="0"
-        printf "not compiled\n\n"
-    fi
-
     for model_name in "${models[@]}"; do
         printf "$model_name:\n" | tee -a $output_file
         printf "fit:\n" | tee -a $output_file
-        python benchmark/$model_name/$file_name/fit.py $output_file
+        python benchmark/$model_name/fit.py $output_file
         printf "predict:\n" | tee -a $output_file
-        python benchmark/$model_name/$file_name/predict.py $output_file
+        python benchmark/$model_name/predict.py $output_file
         printf "\n\n" | tee -a $output_file
     done