From 89f44976ca1b873b1323b64f824aa26315573f5a Mon Sep 17 00:00:00 2001
From: MaanavD
Date: Fri, 14 Jul 2023 12:59:08 -0700
Subject: [PATCH 1/8] Added huggingface Whisper.

---
 torchbenchmark/models/hf_Whisper/__init__.py  | 19 +++++++++++++++++++
 torchbenchmark/models/hf_Whisper/install.py   | 13 +++++++++++++
 .../models/hf_Whisper/metadata.yaml           | 10 ++++++++++
 .../models/hf_Whisper/requirements.txt        |  2 ++
 .../framework/huggingface/model_factory.py    |  3 ++-
 .../util/framework/huggingface/patch_hf.py    |  2 +-
 6 files changed, 47 insertions(+), 2 deletions(-)
 create mode 100644 torchbenchmark/models/hf_Whisper/__init__.py
 create mode 100644 torchbenchmark/models/hf_Whisper/install.py
 create mode 100644 torchbenchmark/models/hf_Whisper/metadata.yaml
 create mode 100644 torchbenchmark/models/hf_Whisper/requirements.txt

diff --git a/torchbenchmark/models/hf_Whisper/__init__.py b/torchbenchmark/models/hf_Whisper/__init__.py
new file mode 100644
index 0000000000..1f17863c9b
--- /dev/null
+++ b/torchbenchmark/models/hf_Whisper/__init__.py
@@ -0,0 +1,19 @@
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceModel
+from torchbenchmark.tasks import SPEECH
+import torch
+
+class Model(HuggingFaceModel):
+    task = SPEECH.RECOGNITION
+    # https://cdn.openai.com/papers/whisper.pdf Says for large-v2 they trained on 1024 batch sizes.
+    DEFAULT_TRAIN_BSIZE = 8
+    DEFAULT_EVAL_BSIZE = 8
+
+    def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]):
+        super().__init__(name="hf_Whisper", test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)
+        self.feature_size = 80
+        self.sequence_length = 3000
+        input_features = torch.randn(size=(self.batch_size, self.feature_size, self.sequence_length),device=self.device)
+        self.example_inputs = {"input_features": input_features.to(self.device)}
+
+    def eval(self):
+        super().eval()
\ No newline at end of file
diff --git a/torchbenchmark/models/hf_Whisper/install.py b/torchbenchmark/models/hf_Whisper/install.py
new file mode 100644
index 0000000000..1a49905932
--- /dev/null
+++ b/torchbenchmark/models/hf_Whisper/install.py
@@ -0,0 +1,13 @@
+import subprocess
+import sys
+import os
+from torchbenchmark.util.framework.huggingface.patch_hf import patch_transformers, cache_model
+
+def pip_install_requirements():
+    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt'])
+
+if __name__ == '__main__':
+    pip_install_requirements()
+    patch_transformers()
+    model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+    cache_model(model_name)
\ No newline at end of file
diff --git a/torchbenchmark/models/hf_Whisper/metadata.yaml b/torchbenchmark/models/hf_Whisper/metadata.yaml
new file mode 100644
index 0000000000..df5ea16643
--- /dev/null
+++ b/torchbenchmark/models/hf_Whisper/metadata.yaml
@@ -0,0 +1,10 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 8
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+not_implemented:
+- jit: true
+train_benchmark: false
+train_deterministic: false
\ No newline at end of file
diff --git a/torchbenchmark/models/hf_Whisper/requirements.txt b/torchbenchmark/models/hf_Whisper/requirements.txt
new file mode 100644
index 0000000000..20d34b196a
--- /dev/null
+++ b/torchbenchmark/models/hf_Whisper/requirements.txt
@@ -0,0 +1,2 @@
+sentencepiece
+datasets
\ No newline at end of file
diff --git a/torchbenchmark/util/framework/huggingface/model_factory.py b/torchbenchmark/util/framework/huggingface/model_factory.py
index 137f5dcd23..dd39a3a185 100644
--- a/torchbenchmark/util/framework/huggingface/model_factory.py
+++ b/torchbenchmark/util/framework/huggingface/model_factory.py
@@ -8,7 +8,7 @@
 from torchbenchmark.util.model import BenchmarkModel
 from torchbenchmark.tasks import NLP
 import transformers
-from transformers import AutoConfig, ReformerConfig, BertConfig, GenerationConfig
+from transformers import AutoConfig, ReformerConfig, BertConfig, GenerationConfig, WhisperConfig
 from typing import Tuple
 
 class_models = {
@@ -27,6 +27,7 @@
     'hf_Bert': (512, 512, 'BertConfig()', 'AutoModelForMaskedLM'),
     # see https://huggingface.co/bert-large-cased
     'hf_Bert_large': (512, 512, 'BertConfig(hidden_size=1024, num_hidden_layers=24, num_attention_heads=16)', 'AutoModelForMaskedLM'),
+    'hf_Whisper': (1024, 1024, 'WhisperConfig()', 'AutoModelForAudioClassification'),
 }
 
 cpu_input_slice = {
diff --git a/torchbenchmark/util/framework/huggingface/patch_hf.py b/torchbenchmark/util/framework/huggingface/patch_hf.py
index 013e1a8a3c..901288d121 100644
--- a/torchbenchmark/util/framework/huggingface/patch_hf.py
+++ b/torchbenchmark/util/framework/huggingface/patch_hf.py
@@ -5,7 +5,7 @@
 import subprocess
 import sys
 from .model_factory import class_models
-from transformers import AutoConfig, ReformerConfig, BigBirdConfig, BertConfig
+from transformers import AutoConfig, ReformerConfig, BigBirdConfig, BertConfig, WhisperConfig
 
 PATCH_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "patches")
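For context on the synthetic input in patch 1: Whisper consumes log-mel spectrograms with 80 mel bins over 3000 frames, i.e. 30 seconds of 16 kHz audio at a 10 ms hop, which is where the (batch_size, 80, 3000) shape comes from. A minimal sketch, assuming transformers' WhisperFeatureExtractor with its defaults, of how that shape arises from real audio:

```python
import numpy as np
from transformers import WhisperFeatureExtractor

# Defaults: feature_size=80 mel bins, 30 s chunks at 16 kHz.
extractor = WhisperFeatureExtractor()
audio = np.zeros(16000 * 30)  # 30 s of silence as placeholder audio
features = extractor(audio, sampling_rate=16000, return_tensors="pt").input_features
print(features.shape)  # torch.Size([1, 80, 3000])
```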
From ba57c50aece1832c8315e37c0f3aeda8c059ab9f Mon Sep 17 00:00:00 2001
From: MaanavD
Date: Fri, 14 Jul 2023 14:33:24 -0700
Subject: [PATCH 2/8] Updated requirements, batch size.

---
 torchbenchmark/models/hf_Whisper/__init__.py      | 6 +++---
 torchbenchmark/models/hf_Whisper/requirements.txt | 3 +--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/torchbenchmark/models/hf_Whisper/__init__.py b/torchbenchmark/models/hf_Whisper/__init__.py
index 1f17863c9b..8975c6e381 100644
--- a/torchbenchmark/models/hf_Whisper/__init__.py
+++ b/torchbenchmark/models/hf_Whisper/__init__.py
@@ -4,9 +4,9 @@
 
 class Model(HuggingFaceModel):
     task = SPEECH.RECOGNITION
-    # https://cdn.openai.com/papers/whisper.pdf Says for large-v2 they trained on 1024 batch sizes.
-    DEFAULT_TRAIN_BSIZE = 8
-    DEFAULT_EVAL_BSIZE = 8
+    # https://cdn.openai.com/papers/whisper.pdf Says for large-v2 they trained on 1024 batch sizes, with 16 GPUs
+    DEFAULT_TRAIN_BSIZE = 64
+    DEFAULT_EVAL_BSIZE = 64
 
     def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]):
         super().__init__(name="hf_Whisper", test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)
diff --git a/torchbenchmark/models/hf_Whisper/requirements.txt b/torchbenchmark/models/hf_Whisper/requirements.txt
index 20d34b196a..fd0728f16f 100644
--- a/torchbenchmark/models/hf_Whisper/requirements.txt
+++ b/torchbenchmark/models/hf_Whisper/requirements.txt
@@ -1,2 +1 @@
-sentencepiece
-datasets
\ No newline at end of file
+numba
\ No newline at end of file
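The class_models entry added in patch 1 pairs a config string with an auto-model class name. Assuming the factory resolves these the same way as the existing entries (evaluating the config string and looking up the class in transformers), the hf_Whisper entry is roughly equivalent to this sketch:

```python
from transformers import AutoModelForAudioClassification, WhisperConfig

# Roughly what the 'hf_Whisper' entry resolves to (a sketch, not the factory's exact code).
config = WhisperConfig()  # defaults approximate openai/whisper-tiny
model = AutoModelForAudioClassification.from_config(config)
```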
From bb3f33116fbbf512c783d2b15f8c2f7cb6414843 Mon Sep 17 00:00:00 2001
From: MaanavD
Date: Fri, 14 Jul 2023 14:43:25 -0700
Subject: [PATCH 3/8] Updated to remove training.

---
 torchbenchmark/models/hf_Whisper/__init__.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/torchbenchmark/models/hf_Whisper/__init__.py b/torchbenchmark/models/hf_Whisper/__init__.py
index 8975c6e381..96b581dc40 100644
--- a/torchbenchmark/models/hf_Whisper/__init__.py
+++ b/torchbenchmark/models/hf_Whisper/__init__.py
@@ -5,9 +5,9 @@
 class Model(HuggingFaceModel):
     task = SPEECH.RECOGNITION
     # https://cdn.openai.com/papers/whisper.pdf Says for large-v2 they trained on 1024 batch sizes, with 16 GPUs
-    DEFAULT_TRAIN_BSIZE = 64
     DEFAULT_EVAL_BSIZE = 64
-
+    DEFAULT_Train_BSIZE = 64
+
     def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]):
         super().__init__(name="hf_Whisper", test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)
         self.feature_size = 80
@@ -16,4 +16,6 @@ def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]):
         self.example_inputs = {"input_features": input_features.to(self.device)}
 
     def eval(self):
-        super().eval()
\ No newline at end of file
+        super().eval()
+    def train(self):
+        raise NotImplementedError("Training is not implemented.")
\ No newline at end of file

From 116df9cb937b6921d16eba34fc504776bb40a6ee Mon Sep 17 00:00:00 2001
From: MaanavD
Date: Fri, 14 Jul 2023 17:11:55 -0700
Subject: [PATCH 4/8] Removed default train size. No training implemented.

---
 torchbenchmark/models/hf_Whisper/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torchbenchmark/models/hf_Whisper/__init__.py b/torchbenchmark/models/hf_Whisper/__init__.py
index 96b581dc40..4d6b30772d 100644
--- a/torchbenchmark/models/hf_Whisper/__init__.py
+++ b/torchbenchmark/models/hf_Whisper/__init__.py
@@ -6,7 +6,6 @@ class Model(HuggingFaceModel):
     task = SPEECH.RECOGNITION
     # https://cdn.openai.com/papers/whisper.pdf Says for large-v2 they trained on 1024 batch sizes, with 16 GPUs
     DEFAULT_EVAL_BSIZE = 64
-    DEFAULT_Train_BSIZE = 64
 
     def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]):
         super().__init__(name="hf_Whisper", test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)
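With patches 3 and 4 applied, train() unconditionally raises, so any caller requesting the train test must be prepared to skip it. A hypothetical calling pattern (the TorchBench harness normally handles this; the driver below is illustrative only):

```python
from torchbenchmark.models.hf_Whisper import Model

m = Model(test="train", device="cuda")
try:
    m.train()
except NotImplementedError as err:
    # Training is intentionally unsupported for this model.
    print(f"hf_Whisper train test skipped: {err}")
```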
From c77ad909fcba4dc83944a9bf7ee289b2b066534f Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Tue, 25 Jul 2023 17:58:14 +0000
Subject: [PATCH 5/8] fix tests

---
 torchbenchmark/models/hf_Whisper/__init__.py   | 8 +++++---
 torchbenchmark/models/hf_Whisper/metadata.yaml | 2 ++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/torchbenchmark/models/hf_Whisper/__init__.py b/torchbenchmark/models/hf_Whisper/__init__.py
index 4d6b30772d..e5f1bcd5c9 100644
--- a/torchbenchmark/models/hf_Whisper/__init__.py
+++ b/torchbenchmark/models/hf_Whisper/__init__.py
@@ -11,10 +11,12 @@ def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]):
         super().__init__(name="hf_Whisper", test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)
         self.feature_size = 80
         self.sequence_length = 3000
-        input_features = torch.randn(size=(self.batch_size, self.feature_size, self.sequence_length),device=self.device)
+        input_features = torch.randn(size=(self.batch_size, self.feature_size, self.sequence_length),device=self.device).half()
         self.example_inputs = {"input_features": input_features.to(self.device)}
+        self.model.to(self.device)
+
+    def get_module(self):
+        return self.model, (self.example_inputs)
 
-    def eval(self):
-        super().eval()
     def train(self):
         raise NotImplementedError("Training is not implemented.")
\ No newline at end of file
diff --git a/torchbenchmark/models/hf_Whisper/metadata.yaml b/torchbenchmark/models/hf_Whisper/metadata.yaml
index df5ea16643..04f4fd2918 100644
--- a/torchbenchmark/models/hf_Whisper/metadata.yaml
+++ b/torchbenchmark/models/hf_Whisper/metadata.yaml
@@ -6,5 +6,7 @@ eval_deterministic: false
 eval_nograd: true
 not_implemented:
 - jit: true
+- device: cpu
+  test: eval
 train_benchmark: false
 train_deterministic: false
\ No newline at end of file

From f232aac7a56d948eddaba7fdfed7b107b4d07830 Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Tue, 25 Jul 2023 20:05:15 +0000
Subject: [PATCH 6/8] fix eval test

---
 torchbenchmark/models/hf_Whisper/__init__.py   | 3 +--
 torchbenchmark/models/hf_Whisper/metadata.yaml | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/torchbenchmark/models/hf_Whisper/__init__.py b/torchbenchmark/models/hf_Whisper/__init__.py
index e5f1bcd5c9..6ff94c461c 100644
--- a/torchbenchmark/models/hf_Whisper/__init__.py
+++ b/torchbenchmark/models/hf_Whisper/__init__.py
@@ -4,8 +4,7 @@
 
 class Model(HuggingFaceModel):
     task = SPEECH.RECOGNITION
-    # https://cdn.openai.com/papers/whisper.pdf Says for large-v2 they trained on 1024 batch sizes, with 16 GPUs
-    DEFAULT_EVAL_BSIZE = 64
+    DEFAULT_EVAL_BSIZE = 8
 
     def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]):
         super().__init__(name="hf_Whisper", test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)
diff --git a/torchbenchmark/models/hf_Whisper/metadata.yaml b/torchbenchmark/models/hf_Whisper/metadata.yaml
index 04f4fd2918..1fadf0eafc 100644
--- a/torchbenchmark/models/hf_Whisper/metadata.yaml
+++ b/torchbenchmark/models/hf_Whisper/metadata.yaml
@@ -7,6 +7,5 @@ eval_nograd: true
 not_implemented:
 - jit: true
 - device: cpu
-  test: eval
 train_benchmark: false
 train_deterministic: false
\ No newline at end of file

From c3d5d100f27939af568acf874d4e50cd081eb96a Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Tue, 25 Jul 2023 21:30:20 +0000
Subject: [PATCH 7/8] push

---
 torchbenchmark/models/hf_Whisper/__init__.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/torchbenchmark/models/hf_Whisper/__init__.py b/torchbenchmark/models/hf_Whisper/__init__.py
index 6ff94c461c..a2d8126cfa 100644
--- a/torchbenchmark/models/hf_Whisper/__init__.py
+++ b/torchbenchmark/models/hf_Whisper/__init__.py
@@ -10,12 +10,10 @@ def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]):
         super().__init__(name="hf_Whisper", test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)
         self.feature_size = 80
         self.sequence_length = 3000
-        input_features = torch.randn(size=(self.batch_size, self.feature_size, self.sequence_length),device=self.device).half()
-        self.example_inputs = {"input_features": input_features.to(self.device)}
+        self.input_features = torch.randn(size=(self.batch_size, self.feature_size, self.sequence_length),device=self.device).half()
+        self.example_inputs = {"input_features": self.input_features.to(self.device), "input_ids" : self.input_features.to(self.device)}
         self.model.to(self.device)
 
-    def get_module(self):
-        return self.model, (self.example_inputs)
-
     def train(self):
-        raise NotImplementedError("Training is not implemented.")
\ No newline at end of file
+        raise NotImplementedError("Training is not implemented.")
+
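Note that after patch 7 the example inputs are still created with .half() while the model weights remain fp32, so a forward pass would hit a dtype mismatch; patch 8 below resolves this by building fp32 inputs and casting weights and inputs together. A small sketch of that consistent-casting pattern (the helper name is illustrative, not from the patch):

```python
import torch

def cast_to_fp16(model: torch.nn.Module, inputs: dict) -> tuple:
    # Cast weights and inputs together so conv/matmul dtypes agree.
    model = model.half()
    inputs = {k: v.half() for k, v in inputs.items()}
    return model, inputs
```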
From 9bca12c1b686588bcd151ba916c710b4a8401269 Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Wed, 26 Jul 2023 00:30:05 +0000
Subject: [PATCH 8/8] add support for half()

---
 torchbenchmark/models/hf_Whisper/__init__.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/torchbenchmark/models/hf_Whisper/__init__.py b/torchbenchmark/models/hf_Whisper/__init__.py
index a2d8126cfa..347caeb2bd 100644
--- a/torchbenchmark/models/hf_Whisper/__init__.py
+++ b/torchbenchmark/models/hf_Whisper/__init__.py
@@ -5,15 +5,24 @@
 class Model(HuggingFaceModel):
     task = SPEECH.RECOGNITION
     DEFAULT_EVAL_BSIZE = 8
+    DEFAULT_EVAL_CUDA_PRECISION = "fp16"
 
     def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]):
         super().__init__(name="hf_Whisper", test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)
         self.feature_size = 80
         self.sequence_length = 3000
-        self.input_features = torch.randn(size=(self.batch_size, self.feature_size, self.sequence_length),device=self.device).half()
+        self.input_features = torch.randn(size=(self.batch_size, self.feature_size, self.sequence_length),device=self.device)
         self.example_inputs = {"input_features": self.input_features.to(self.device), "input_ids" : self.input_features.to(self.device)}
         self.model.to(self.device)
 
     def train(self):
         raise NotImplementedError("Training is not implemented.")
-
+
+    def eval(self):
+        self.model.eval()
+        with torch.no_grad():
+            self.model(self.example_inputs["input_ids"])
+
+    def enable_fp16_half(self):
+        self.model.half()
+        self.example_inputs = {"input_features": self.input_features.half().to(self.device), "input_ids" : self.input_features.half().to(self.device)}
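Taken together, the series leaves hf_Whisper as an eval-only benchmark with opt-in fp16 on CUDA. A hypothetical standalone driver using only the methods defined above (the TorchBench harness normally drives this, and CPU stays listed under not_implemented in metadata.yaml):

```python
from torchbenchmark.models.hf_Whisper import Model

model = Model(test="eval", device="cuda")
model.enable_fp16_half()  # casts weights and example inputs to fp16
model.eval()              # one no-grad forward pass over the synthetic features
```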