From e00978d4c09e0048da88c2b3e855fd12d85a504e Mon Sep 17 00:00:00 2001 From: Ilango Rajagopal Date: Wed, 3 Jul 2024 10:17:21 +0530 Subject: [PATCH] Lint & Format (#53) * Lint & Format - Added linting and formatting github actions - Formatted entire codebase - Fixed linter errors - Removed `# noqa` with fix Signed-off-by: Ilango Rajagopal * Split test config into multiple-lines Signed-off-by: Ilango Rajagopal * Fix external repo for workflow Signed-off-by: Ilango Rajagopal * Format newly added files Signed-off-by: Ilango Rajagopal --------- Signed-off-by: Ilango Rajagopal --- .github/workflows/lint-format.yml | 19 ++ QEfficient/__init__.py | 18 +- QEfficient/cloud/export.py | 40 +-- QEfficient/cloud/infer.py | 15 +- QEfficient/compile/compile_helper.py | 2 +- QEfficient/exporter/export_utils.py | 4 +- QEfficient/src/__init__.py | 6 +- QEfficient/src/_transformers/auto.py | 44 ++-- QEfficient/src/base.py | 15 +- QEfficient/src/common.py | 34 ++- QEfficient/transformers/modeling_outputs.py | 5 +- .../transformers/models/gptj/modeling_gptj.py | 5 +- QEfficient/utils/__init__.py | 2 +- QEfficient/utils/_utils.py | 34 ++- QEfficient/utils/constants.py | 2 +- QEfficient/utils/device_utils.py | 11 +- QEfficient/utils/generate_inputs.py | 22 +- QEfficient/utils/logging_utils.py | 4 +- QEfficient/utils/run_utils.py | 5 +- tests/cloud/conftest.py | 229 ++++++++++++------ tests/cloud/test_compile.py | 27 ++- tests/cloud/test_execute.py | 28 ++- tests/cloud/test_export.py | 4 +- tests/cloud/test_infer.py | 45 ++-- tests/test_loader.py | 12 +- 25 files changed, 393 insertions(+), 239 deletions(-) create mode 100644 .github/workflows/lint-format.yml diff --git a/.github/workflows/lint-format.yml b/.github/workflows/lint-format.yml new file mode 100644 index 00000000..309bcbdf --- /dev/null +++ b/.github/workflows/lint-format.yml @@ -0,0 +1,19 @@ +name: Lint & Format +on: [pull_request] +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: pip3 install ruff + - run: ruff check + env: + RUFF_OUTPUT_FORMAT: github + format: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: pip3 install ruff + - run: ruff format --check + env: + RUFF_OUTPUT_FORMAT: github diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 98e19e72..21344305 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,12 +5,22 @@ # # ----------------------------------------------------------------------------- -from QEfficient.compile.compile_helper import compile # noqa: F401 +from QEfficient.compile.compile_helper import compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401 -from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401 -from QEfficient.transformers.transform import transform # noqa: F401 +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader +from QEfficient.transformers.transform import transform # Users can use QEfficient.export for exporting models to ONNX export = qualcomm_efficient_converter __version__ = "0.0.1.dev0" + +__all__ = [ + "transform", + "export", + "compile", + "cloud_ai_100_exec_kv", + "QEffAutoModel", + "QEFFAutoModelForCausalLM", + "QEFFCommonLoader", +] diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 
5f3d07ee..17f5e58e 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -19,7 +19,13 @@ ROOT_DIR = os.path.dirname(os.path.abspath("")) -def get_onnx_model_path(model_name: str, cache_dir: Optional[str] = None, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None, local_model_dir: Optional[str] = None): +def get_onnx_model_path( + model_name: str, + cache_dir: Optional[str] = None, + tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]] = None, + hf_token: Optional[str] = None, + local_model_dir: Optional[str] = None, +): """ exports the model to onnx if pre-exported file is not found and returns onnx_model_path """ @@ -33,19 +39,21 @@ def get_onnx_model_path(model_name: str, cache_dir: Optional[str] = None, tokeni # Export to the Onnx logger.info(f"Exporting Pytorch {model_name} model to ONNX...") _, generated_onnx_model_path = qualcomm_efficient_converter( - model_name=model_name, - local_model_dir=local_model_dir, - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - kv=True, - form_factor="cloud", - hf_token=hf_token, - cache_dir=cache_dir - ) # type: ignore - logger.info(f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}") + model_name=model_name, + local_model_dir=local_model_dir, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + kv=True, + form_factor="cloud", + hf_token=hf_token, + cache_dir=cache_dir, + ) # type: ignore + logger.info( + f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}" + ) assert ( - generated_onnx_model_path == onnx_model_path - ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}" + generated_onnx_model_path == onnx_model_path + ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}" return onnx_model_path @@ -63,14 +71,16 @@ def main( :hf_token: str. HuggingFace login token to access private repos. :local_model_dir: str. Path to custom model weights and config files. 
""" - cache_dir = check_and_assign_cache_dir(local_model_dir,cache_dir) + cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token, local_model_dir=local_model_dir) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Export script.") parser.add_argument("--model_name", "--model-name", required=True, help="HF Model card name/id") - parser.add_argument("--local-model-dir", "--local_model_dir", required=False, help="Path to custom model weights and config files") + parser.add_argument( + "--local-model-dir", "--local_model_dir", required=False, help="Path to custom model weights and config files" + ) parser.add_argument( "--cache_dir", "--cache-dir", diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index c531a85e..7068f785 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -30,7 +30,7 @@ def main( model_name: str, num_cores: int, - prompt: Optional[str] = None, # type: ignore + prompt: Optional[str] = None, # type: ignore local_model_dir: Optional[str] = None, prompts_txt_file_path: Optional[str] = None, aic_enable_depth_first: bool = False, @@ -51,9 +51,14 @@ def main( num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group ) prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) - cache_dir = check_and_assign_cache_dir(local_model_dir,cache_dir) + cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) - tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), cache_dir=cache_dir, hf_token=hf_token, local_model_dir=local_model_dir) + tokenizer = load_hf_tokenizer( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + local_model_dir=local_model_dir, + ) qpc_path_exists, qpc_dir_path = qpc_exists(model_name, qpc_base_dir_name) # Handle qpc generation @@ -104,7 +109,9 @@ def main( description="Inference command, the model will be downloaded from HF, optmized, compiled, executed on Cloud AI 100" ) parser.add_argument("--model-name", "--model_name", required=True, help="HF Model card name/id") - parser.add_argument("--local-model-dir", "--local_model_dir", required=False, help="Path to custom model weights and config files") + parser.add_argument( + "--local-model-dir", "--local_model_dir", required=False, help="Path to custom model weights and config files" + ) parser.add_argument( "--cache-dir", "--cache_dir", diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py index 8b5272e8..f2d412fb 100644 --- a/QEfficient/compile/compile_helper.py +++ b/QEfficient/compile/compile_helper.py @@ -109,7 +109,7 @@ def compile( ctx_len: int = 128, mxfp6: bool = True, mxint8: bool = False, - **kwargs + **kwargs, ) -> str: # Dynamically create the specializations JSON """ diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py index 43f8bb81..b6eeddf6 100644 --- a/QEfficient/exporter/export_utils.py +++ b/QEfficient/exporter/export_utils.py @@ -169,7 +169,7 @@ def fix_onnx_fp16( ort_outputs: List[np.ndarray], gen_models_path: str, model_base_name: str, - pt_outputs: Dict[str, torch.Tensor] + pt_outputs: Dict[str, torch.Tensor], ) -> str: finfo = np.finfo(np.float16) fp16_max = finfo.max @@ -218,7 +218,7 @@ def fix_onnx_fp16( os.path.join(gen_models_path, f"{model_base_name}.onnx"), 
os.path.join(gen_models_path, f"{model_base_name}.onnxweights.data"), ) - + model_base_name += "_clipped_fp16" onnx.save_model( model, diff --git a/QEfficient/src/__init__.py b/QEfficient/src/__init__.py index 85468656..e4a46c4c 100644 --- a/QEfficient/src/__init__.py +++ b/QEfficient/src/__init__.py @@ -5,5 +5,7 @@ # # ----------------------------------------------------------------------------- -from QEfficient.src._transformers.auto import QEffAutoModel, QEFFAutoModelForCausalLM # noqa: F401 -from QEfficient.src.common import QEFFCommonLoader # noqa: F401 +from QEfficient.src._transformers.auto import QEffAutoModel, QEFFAutoModelForCausalLM +from QEfficient.src.common import QEFFCommonLoader + +__all__ = ["QEffAutoModel", "QEFFAutoModelForCausalLM", "QEFFCommonLoader"] diff --git a/QEfficient/src/_transformers/auto.py b/QEfficient/src/_transformers/auto.py index cb71eda2..6a2d824b 100644 --- a/QEfficient/src/_transformers/auto.py +++ b/QEfficient/src/_transformers/auto.py @@ -26,17 +26,21 @@ class QEFFTransformersBase(QEFFBaseModel): """ Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from src/transformers/models/auto/modeling_auto.py file. """ - def __init__(self, model: nn.Module, transform:bool = True) -> None: - assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or - # FIXME: Use model architectures here instead of complete dictionary TransformersToQEffModulesDict - model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore + + def __init__(self, model: nn.Module, transform: bool = True) -> None: + assert ( + model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() + or + # FIXME: Use model architectures here instead of complete dictionary TransformersToQEffModulesDict + model.__class__ in TransformersToQEffModulesDict.values() + ), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore self.model: nn.Module = model if transform: self.transform() def __repr__(self) -> str: return self.model.__repr__() - + @property def is_transformed(self) -> bool: return getattr(self.model, "qeff_transformed", False) @@ -49,19 +53,22 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): :param transform:bool. Whether to optimize model for KV retention; default is True. Pass False to get BertStyle model. 
""" transform: bool = kwargs.get("transform", True) - kwargs.update({"use_cache": True}) # Always pass use_cache = True, to get KV values as output during ONNX export - kwargs.update({"attn_implementation" : "eager"}) # Always use eager mode for attention implementation - - model = QEFFAutoModelToTransformersAutoModelMap[cls.__name__].from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + kwargs.update( + {"use_cache": True} + ) # Always pass use_cache = True, to get KV values as output during ONNX export + kwargs.update({"attn_implementation": "eager"}) # Always use eager mode for attention implementation + + model = QEFFAutoModelToTransformersAutoModelMap[cls.__name__].from_pretrained( + pretrained_model_name_or_path, *args, **kwargs + ) return cls(model, transform=transform) - def transform_export(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") - + def transform_export_compile(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") - + def transform(self): # FIXME: break down transform into optmization passes i.e. HW specific optimization(RMSNorm), KV retention pass etc. QEfficient.transform(self) @@ -72,22 +79,23 @@ class QEFFAutoModelForCausalLM(QEFFTransformersBase): """ QEFF class for manipulating any causal language model from HuggingFace hub. """ - def execute(self, *args, **kwargs): # type: ignore + + def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") - + def export(self): raise NotImplementedError("Reached too far!!") - + def compile(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") class QEffAutoModel(QEFFTransformersBase): - def execute(self, *args, **kwargs): # type: ignore + def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") - + def export(self): raise NotImplementedError("Reached too far!!") - + def compile(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") diff --git a/QEfficient/src/base.py b/QEfficient/src/base.py index ddc23fc8..6a441cce 100644 --- a/QEfficient/src/base.py +++ b/QEfficient/src/base.py @@ -18,17 +18,17 @@ QEFFBaseModel ________________________________________________|________________________________________________________________ - | | + | | QEFFTransformersBase QEFFDiffusersBase | | ____________|________________________________________________________ ________________ _________________|______________ - _____ | | | | | | + _____ | | | | | | | QEFFAutoModel QEFFAutoModelForCausalLM QEFFAWQModelForCausalLM ... ... ... -QEFFCommonLoader -| [Provides way to [Provides way to do 1-5 on [Supports 1-5 for +QEFFCommonLoader -| [Provides way to [Provides way to do 1-5 on [Supports 1-5 for [Provides | do steps 1-5 on transformers.AutoModelForCausalLM] AWQ Models] interface to |_____ transformers.AutoModel] -Load any of -These models +Load any of +These models by automatically detecting the type of the model] @@ -42,7 +42,7 @@ from typing import Any -#Defining placeholder ENUM for execute function +# Defining placeholder ENUM for execute function class Runtime(Enum): CPU_ORT = "CPU ONNX Runtime" CPU_PT = "CPU PyTorch Runtime" @@ -56,6 +56,7 @@ class QEFFBaseModel(ABC): All the child classes must provide way to load, transform(optimize), exoprt to ONNX etc. capabilities. 
""" + def __init__(self) -> None: super().__init__() # Users can call generate or execute @@ -96,4 +97,4 @@ def export(self, *args, **kwargs) -> Any: @abstractmethod def compile(self, *args, **kwargs) -> Any: - pass \ No newline at end of file + pass diff --git a/QEfficient/src/common.py b/QEfficient/src/common.py index bca39109..de890ae3 100644 --- a/QEfficient/src/common.py +++ b/QEfficient/src/common.py @@ -6,11 +6,12 @@ # ----------------------------------------------------------------------------- """ -MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP dictionary defines the mapping between names of the varities of Transformer model defined in +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP dictionary defines the mapping between names of the varities of Transformer model defined in QEFF_MODEL_TYPE and the classes that implement the methods i.e.(compile, export etc.) for those types. QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. """ + import os from enum import Enum from typing import Any, Dict, Type @@ -27,6 +28,7 @@ class QEFF_MODEL_TYPE(Enum): """ Defines Names of the different varities of transformer models. """ + CAUSALLM = "LLM" DIFFUSION = "STABLE_DIFFUSION" AWQ = "AWQ" @@ -36,17 +38,22 @@ class QEFF_MODEL_TYPE(Enum): QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM } -AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} +AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = { + v: k for k, v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items() +} + def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: """ Loads model config file and returns the type of the model (i.e. LLMs, SD, quantized etc.) as supported by the library. """ - assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" + assert os.path.isdir( + hf_model_path + ), "Pleae pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" config, kwargs = AutoConfig.from_pretrained( - hf_model_path, - return_unused_kwargs=True, - ) + hf_model_path, + return_unused_kwargs=True, + ) if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines @@ -67,20 +74,27 @@ class QEFFCommonLoader: Provides HuggingFace model loading interface same as transformers APIs. Supports loading any model on HuggingFace. """ + def __init__(self, *args: Any, **kwds: Any) -> None: raise EnvironmentError( f"{self.__class__.__name__} is designed to be instantiated " - f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`") - + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`" + ) + @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel: """ Downloads HuggingFace model if already doesn't exist locally, returns QEffAutoModel object based on type of model. 
""" - pretrained_model_name_or_path = pretrained_model_name_or_path if os.path.isdir(pretrained_model_name_or_path) \ + pretrained_model_name_or_path = ( + pretrained_model_name_or_path + if os.path.isdir(pretrained_model_name_or_path) else login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) + ) model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path) qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type] - assert issubclass(qeff_auto_model_class, QEFFBaseModel), f"Expected class that inherits {QEFFBaseModel}, got {type(qeff_auto_model_class)}" + assert issubclass( + qeff_auto_model_class, QEFFBaseModel + ), f"Expected class that inherits {QEFFBaseModel}, got {type(qeff_auto_model_class)}" return qeff_auto_model_class.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) diff --git a/QEfficient/transformers/modeling_outputs.py b/QEfficient/transformers/modeling_outputs.py index 1ade8a3b..36572fe5 100644 --- a/QEfficient/transformers/modeling_outputs.py +++ b/QEfficient/transformers/modeling_outputs.py @@ -179,7 +179,7 @@ class QEffCausalLMOutputWithPast(ModelOutput): hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None attention_mask_RetainedState: Optional[torch.BoolTensor] = None - + @dataclass class QEffMoeModelOutputWithPast(ModelOutput): @@ -222,7 +222,8 @@ class QEffMoeModelOutputWithPast(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor, ...]] = None router_logits: Optional[Tuple[torch.FloatTensor]] = None attention_mask_RetainedState: Optional[torch.BoolTensor] = None - + + @dataclass class QEffMoeCausalLMOutputWithPast(ModelOutput): """ diff --git a/QEfficient/transformers/models/gptj/modeling_gptj.py b/QEfficient/transformers/models/gptj/modeling_gptj.py index 5bd061bb..9fd55416 100644 --- a/QEfficient/transformers/models/gptj/modeling_gptj.py +++ b/QEfficient/transformers/models/gptj/modeling_gptj.py @@ -55,9 +55,6 @@ def _attn( attention_mask=None, head_mask=None, ): - # compute causal mask from causal mask buffer - query_length, key_length = query.size(-2), key.size(-2) - # Keep the attention weights computation in fp32 to avoid overflow issues query = query.to(torch.float32) key = key.to(torch.float32) @@ -110,7 +107,7 @@ def forward( embed_positions = self._get_embed_positions(position_ids) repeated_position_ids = position_ids.unsqueeze(-1).repeat(1, 1, embed_positions.shape[-1]) - repeated_position_ids = torch.where(repeated_position_ids==-1, 0, repeated_position_ids) + repeated_position_ids = torch.where(repeated_position_ids == -1, 0, repeated_position_ids) sincos = torch.gather(embed_positions, 1, repeated_position_ids) sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index c65ff2d1..6eee3928 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- from QEfficient.utils._utils import ( # noqa: F401 + check_and_assign_cache_dir, get_qpc_dir_name_infer, hf_download, load_hf_tokenizer, @@ -13,5 +14,4 @@ onnx_exists, padding_check_and_fix, qpc_exists, - check_and_assign_cache_dir ) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 5db8259c..1708ba44 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -20,7 +20,7 @@ def login_and_download_hf_lm(model_name, *args, **kwargs): 
logger.info(f"loading HuggingFace model for {model_name}") hf_token = kwargs.pop("hf_token", None) - cache_dir = kwargs.pop("cache_dir", None) + cache_dir = kwargs.pop("cache_dir", None) if hf_token is not None: login(hf_token) model_name = hf_download( @@ -119,15 +119,29 @@ def onnx_exists(model_name: str) -> Tuple[bool, str, str]: return onnx_exists_bool, onnx_dir_path, onnx_model_path -def load_hf_tokenizer(pretrained_model_name_or_path: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None, padding_side:str = "right", **kwargs) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: +def load_hf_tokenizer( + pretrained_model_name_or_path: str, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + padding_side: str = "right", + **kwargs, +) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: logger.info("Loading Tokenizer") if hf_token is not None: login(hf_token) # Download tokenizer along with model if it doesn't exist - model_hf_path = pretrained_model_name_or_path if os.path.isdir(pretrained_model_name_or_path) else hf_download(repo_id=pretrained_model_name_or_path, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) - tokenizer = AutoTokenizer.from_pretrained(model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs) + model_hf_path = ( + pretrained_model_name_or_path + if os.path.isdir(pretrained_model_name_or_path) + else hf_download( + repo_id=pretrained_model_name_or_path, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"] + ) + ) + tokenizer = AutoTokenizer.from_pretrained( + model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs + ) padding_check_and_fix(tokenizer) # Check and fix tokenizer viability - + return tokenizer @@ -145,22 +159,24 @@ def get_qpc_dir_name_infer(num_cores, mos, batch_size, prompt_len, ctx_len, mxfp def check_and_assign_cache_dir(local_model_dir, cache_dir): if local_model_dir is not None: if cache_dir is not None: - logger.warning(f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir.") + logger.warning( + f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir." + ) return None return cache_dir if cache_dir else Constants.CACHE_DIR def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]) -> None: """ - Checks and fixes tokenizer paddding side and pad_token_id viability. + Checks and fixes tokenizer paddding side and pad_token_id viability. -------- - + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]. Pass model tokenizer to check and fix. 
""" if tokenizer.padding_side != "right": logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}") tokenizer.padding_side = "right" - + if tokenizer.pad_token_id is None: assert tokenizer.eos_token_id is not None, "Found tokenizer.eos_token_id to be None, expected int" # If Pad token is out of range of vocab size diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 86d28c0e..ed679bbc 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -23,6 +23,6 @@ class Constants: INPUT_STRING = ["My name is"] CACHE_DIR = os.path.join(ROOT_DIR, "cache_dir") - + GB = 2**30 MAX_QPC_LIMIT = 30 diff --git a/QEfficient/utils/device_utils.py b/QEfficient/utils/device_utils.py index 74d7e3fc..8faaf5f1 100644 --- a/QEfficient/utils/device_utils.py +++ b/QEfficient/utils/device_utils.py @@ -5,10 +5,12 @@ # # ----------------------------------------------------------------------------- -import subprocess import math -from QEfficient.utils.logging_utils import logger +import subprocess + from QEfficient.utils.constants import Constants +from QEfficient.utils.logging_utils import logger + def get_available_device_id(): device_id = 0 @@ -29,7 +31,8 @@ def get_available_device_id(): elif "Failed to find requested device ID" in result.stdout: print("Failed to find requested device ID") return None - + + def is_qpc_size_gt_32gb(params: int, mxfp6: bool) -> bool: if mxfp6: qpc_size = math.ceil((params * 1) / Constants.GB) @@ -38,7 +41,7 @@ def is_qpc_size_gt_32gb(params: int, mxfp6: bool) -> bool: logger.warning(f"Approximate QPC size is: {qpc_size} GB") num_devices = math.ceil(qpc_size / Constants.MAX_QPC_LIMIT) - logger.warning(f"Number of Devices required: {num_devices}" ) + logger.warning(f"Number of Devices required: {num_devices}") return qpc_size > Constants.MAX_QPC_LIMIT diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index d09e1be5..b0c35d72 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -20,7 +20,7 @@ def __init__(self, tokenizer, input_str, prompt_len, ctx_len): :param prompt_len: int :param ctx_len: int """ - #check and fix tokenizer viability + # check and fix tokenizer viability padding_check_and_fix(tokenizer) self.tokenizer = tokenizer self.input_str = input_str @@ -105,17 +105,11 @@ def prepare_ort_inputs(self, n_layer, padding_shape): inputs.pop("attention_mask") position_ids = np.arange(input_len).reshape(1, -1) inputs["input_ids"] = np.concatenate( - [ - input_ids, - np.full((batch_size, self.prompt_len - input_len), self.tokenizer.pad_token_id) - ], + [input_ids, np.full((batch_size, self.prompt_len - input_len), self.tokenizer.pad_token_id)], axis=1, ).astype(np.int64) inputs["position_ids"] = np.concatenate( - [ - position_ids, - np.full((batch_size, self.prompt_len - input_len), -1) - ], + [position_ids, np.full((batch_size, self.prompt_len - input_len), -1)], axis=1, ).astype(np.int64) @@ -162,17 +156,11 @@ def prepare_cloud_ai_100_inputs(self, n_layer, padding_shape): inputs.pop("attention_mask") position_ids = np.arange(input_len).reshape(1, -1) inputs["input_ids"] = np.concatenate( - [ - input_ids, - np.full((batch_size, self.prompt_len - input_len), self.tokenizer.pad_token_id) - ], + [input_ids, np.full((batch_size, self.prompt_len - input_len), self.tokenizer.pad_token_id)], axis=1, ).astype(np.int64) inputs["position_ids"] = np.concatenate( - [ - position_ids, - np.full((batch_size, self.prompt_len - input_len), 
-1) - ], + [position_ids, np.full((batch_size, self.prompt_len - input_len), -1)], axis=1, ).astype(np.int64) diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index 044e6e83..8dbfd378 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -18,8 +18,8 @@ class QEffFormatter(logging.Formatter): red: str = "\x1b[31;20m" bold_red: str = "\x1b[31;1m" reset: str = "\x1b[0m" - common_format: str = "%(levelname)s - %(name)s - %(message)s" # type: ignore - format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" # type: ignore + common_format: str = "%(levelname)s - %(name)s - %(message)s" # type: ignore + format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" # type: ignore FORMATS = { logging.DEBUG: cyan + format_with_line_info + reset, diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index 573685bf..46786ae8 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -31,7 +31,7 @@ def __init__(self, tokenizer, prompt, prompt_len, ctx_len): :param prompt_len: int :param ctx_len: int """ - + self.tokenizer = tokenizer self.prompt = prompt self.prompt_len = prompt_len @@ -64,7 +64,6 @@ def run_hf_model_on_pytorch(self, model_hf): print("Completion:", repr(generated_text)) return generated_ids - def run_kv_model_on_pytorch(self, model, n_layer, padding_shape): """ Function responsible for running KV PyTorch model and return the output tokens @@ -140,7 +139,7 @@ def run_kv_model_on_ort(self, model_path, n_layer, padding_shape): np_tensor = onnx.numpy_helper.to_array(node.attribute[0].t) if len(np_tensor.shape) == 0 and np_tensor.item() == 65504: node.attribute[0].t.raw_data = np.array(-1).tobytes() - + onnxruntime_model = model_path[:-5] + "_ort.onnx" onnx.save(m, onnxruntime_model) session = onnxruntime.InferenceSession(onnxruntime_model) diff --git a/tests/cloud/conftest.py b/tests/cloud/conftest.py index 678d0990..6ea5a88b 100644 --- a/tests/cloud/conftest.py +++ b/tests/cloud/conftest.py @@ -18,17 +18,33 @@ def pytest_addoption(parser): - parser.addoption( - "--all", action="store_true",default=False, help="Run all test without skipping any test" - ) + parser.addoption("--all", action="store_true", default=False, help="Run all test without skipping any test") + class ModelSetup: """ - model_setup is a set up class for all the High Level testing script, - which provides all neccessary objects needed for checking the flow and creation + model_setup is a set up class for all the High Level testing script, + which provides all neccessary objects needed for checking the flow and creation of the HL API code. 
""" - def __init__(self,model_name,num_cores,prompt,prompts_txt_file_path,aic_enable_depth_first,mos,cache_dir,hf_token,batch_size,prompt_len,ctx_len,mxfp6,mxint8,device_group): + + def __init__( + self, + model_name, + num_cores, + prompt, + prompts_txt_file_path, + aic_enable_depth_first, + mos, + cache_dir, + hf_token, + batch_size, + prompt_len, + ctx_len, + mxfp6, + mxint8, + device_group, + ): """ Initialization set up ------ @@ -36,21 +52,23 @@ def __init__(self,model_name,num_cores,prompt,prompts_txt_file_path,aic_enable_d param: num_cores: int param: prompt: str param: prompts_txt_file_path: str - param: aic_enable_depth_first: bool + param: aic_enable_depth_first: bool param: mos: int - param: cache_dir: str - param: hf_token: str + param: cache_dir: str + param: hf_token: str param: batch_size: int - param: prompt_len: int - param: ctx_len: int - param: mxfp6: bool + param: prompt_len: int + param: ctx_len: int + param: mxfp6: bool param: mxint8: bool - param: device_group: List[int] + param: device_group: List[int] """ self.model_name = model_name self.num_cores = num_cores self.prompt = prompt - self.prompts_txt_file_path = os.path.join(ROOT_DIR,prompts_txt_file_path) if prompts_txt_file_path is not None else None + self.prompts_txt_file_path = ( + os.path.join(ROOT_DIR, prompts_txt_file_path) if prompts_txt_file_path is not None else None + ) self.aic_enable_depth_first = aic_enable_depth_first self.mos = mos self.cache_dir = cache_dir @@ -64,42 +82,70 @@ def __init__(self,model_name,num_cores,prompt,prompts_txt_file_path,aic_enable_d def model_card_dir(self): return str(os.path.join(QEFF_MODELS_DIR, str(self.model_name))) - + def qpc_base_dir_name(self): - return get_qpc_dir_name_infer(self.num_cores, self.mos, self.batch_size, self.prompt_len, self.ctx_len, self.mxfp6, self.mxint8, self.device_group) - + return get_qpc_dir_name_infer( + self.num_cores, + self.mos, + self.batch_size, + self.prompt_len, + self.ctx_len, + self.mxfp6, + self.mxint8, + self.device_group, + ) + def qpc_dir_path(self): return str(os.path.join(self.model_card_dir(), self.qpc_base_dir_name(), "qpcs")) - + def onnx_dir_path(self): return str(os.path.join(self.model_card_dir(), "onnx")) - + def onnx_model_path(self): return str(os.path.join(self.onnx_dir_path(), self.model_name.replace("/", "_") + "_kv_clipped_fp16.onnx")) - + def model_hf_path(self): - return str(os.path.join(self.cache_dir,self.model_name)) - + return str(os.path.join(self.cache_dir, self.model_name)) + def base_path_and_generated_onnx_path(self): - return str(self.onnx_dir_path()), str(os.path.join(self.onnx_dir_path(), self.model_name.replace("/", "_") + "_kv_clipped_fp16.onnx")) - + return str(self.onnx_dir_path()), str( + os.path.join(self.onnx_dir_path(), self.model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + ) + def specialization_json_path(self): return str(os.path.join(self.model_card_dir(), self.qpc_base_dir_name(), "specializations.json")) - + def custom_io_file_path(self): if self.mxint8: return str(os.path.join(self.onnx_dir_path(), "custom_io_int8.yaml")) else: return str(os.path.join(self.onnx_dir_path(), "custom_io_fp16.yaml")) + def check_batch_size_for_asserion_error(self): try: result = check_batch_size_and_num_prompts(self.prompt, self.prompts_txt_file_path, self.batch_size) - return {"result":result,"error":None} + return {"result": result, "error": None} except AssertionError as e: - return {"result":None,"error":str(e)} + return {"result": None, "error": str(e)} + @pytest.fixture -def 
setup(model_name,num_cores,prompt,prompts_txt_file_path,aic_enable_depth_first,mos,cache_dir,hf_token,batch_size,prompt_len,ctx_len,mxfp6,mxint8,device_group): +def setup( + model_name, + num_cores, + prompt, + prompts_txt_file_path, + aic_enable_depth_first, + mos, + cache_dir, + hf_token, + batch_size, + prompt_len, + ctx_len, + mxfp6, + mxint8, + device_group, +): """ It is a fixture or shared object of all testing script within or inner folder, Args are coming from the dynamically generated tests method i.e, pytest_generate_tests via testing script or method @@ -107,9 +153,25 @@ def setup(model_name,num_cores,prompt,prompts_txt_file_path,aic_enable_depth_fir Args: same as set up initialization Return: model_setup class object """ - yield ModelSetup(model_name,num_cores,prompt,prompts_txt_file_path,bool(aic_enable_depth_first),mos,cache_dir,hf_token,batch_size,prompt_len,ctx_len,bool(mxfp6),bool(mxint8),device_group) + yield ModelSetup( + model_name, + num_cores, + prompt, + prompts_txt_file_path, + bool(aic_enable_depth_first), + mos, + cache_dir, + hf_token, + batch_size, + prompt_len, + ctx_len, + bool(mxfp6), + bool(mxint8), + device_group, + ) + -def pytest_generate_tests(metafunc): +def pytest_generate_tests(metafunc): """ pytest_generate_tests hook is used to create our own input parametrization, It generates all the test cases of different combination of input parameters which are read from the json file, @@ -117,37 +179,42 @@ def pytest_generate_tests(metafunc): ----------- Ref: https://docs.pytest.org/en/7.3.x/how-to/parametrize.html """ - json_file = os.path.join(ROOT_DIR,"tests","cloud","high_level_testing.json") - with open(json_file,'r') as file: - json_data = json.load(file) - - metafunc.parametrize("model_name", json_data['model_name'], ids=lambda x: "model_name=" + str(x)) - metafunc.parametrize("num_cores", json_data['num_cores'],ids=lambda x: "num_cores=" + str(x)) - metafunc.parametrize("prompt",json_data['prompt'],ids=lambda x: "prompt=" + str(x)) - metafunc.parametrize("prompts_txt_file_path",json_data['prompts_txt_file_path'],ids=lambda x: "prompts_txt_file_path=" + str(x)) - metafunc.parametrize("aic_enable_depth_first",json_data['aic_enable_depth_first'],ids=lambda x: "aic_enable_depth_first=" + str(x)) - metafunc.parametrize("mos",json_data['mos'],ids=lambda x: "mos=" + str(x)) - metafunc.parametrize("cache_dir",[Constants.CACHE_DIR],ids=lambda x: "cache_dir=" + str(x)) - metafunc.parametrize("hf_token",json_data['hf_token'],ids=lambda x: "hf_token=" + str(x)) - metafunc.parametrize("batch_size",json_data['batch_size'],ids=lambda x: "batch_size=" + str(x)) - metafunc.parametrize("prompt_len",json_data['prompt_len'],ids=lambda x: "prompt_len=" + str(x)) - metafunc.parametrize("ctx_len",json_data['ctx_len'],ids=lambda x: "ctx_len=" + str(x)) - metafunc.parametrize("mxfp6",json_data['mxfp6'],ids=lambda x: "mxfp6=" + str(x)) - metafunc.parametrize("mxint8",json_data['mxint8'],ids=lambda x: "mxint8=" + str(x)) - metafunc.parametrize("device_group",json_data['device_group'],ids=lambda x: "device_group=" + str(x)) - -def pytest_collection_modifyitems(config,items): + json_file = os.path.join(ROOT_DIR, "tests", "cloud", "high_level_testing.json") + with open(json_file, "r") as file: + json_data = json.load(file) + + metafunc.parametrize("model_name", json_data["model_name"], ids=lambda x: "model_name=" + str(x)) + metafunc.parametrize("num_cores", json_data["num_cores"], ids=lambda x: "num_cores=" + str(x)) + metafunc.parametrize("prompt", json_data["prompt"], 
ids=lambda x: "prompt=" + str(x)) + metafunc.parametrize( + "prompts_txt_file_path", json_data["prompts_txt_file_path"], ids=lambda x: "prompts_txt_file_path=" + str(x) + ) + metafunc.parametrize( + "aic_enable_depth_first", json_data["aic_enable_depth_first"], ids=lambda x: "aic_enable_depth_first=" + str(x) + ) + metafunc.parametrize("mos", json_data["mos"], ids=lambda x: "mos=" + str(x)) + metafunc.parametrize("cache_dir", [Constants.CACHE_DIR], ids=lambda x: "cache_dir=" + str(x)) + metafunc.parametrize("hf_token", json_data["hf_token"], ids=lambda x: "hf_token=" + str(x)) + metafunc.parametrize("batch_size", json_data["batch_size"], ids=lambda x: "batch_size=" + str(x)) + metafunc.parametrize("prompt_len", json_data["prompt_len"], ids=lambda x: "prompt_len=" + str(x)) + metafunc.parametrize("ctx_len", json_data["ctx_len"], ids=lambda x: "ctx_len=" + str(x)) + metafunc.parametrize("mxfp6", json_data["mxfp6"], ids=lambda x: "mxfp6=" + str(x)) + metafunc.parametrize("mxint8", json_data["mxint8"], ids=lambda x: "mxint8=" + str(x)) + metafunc.parametrize("device_group", json_data["device_group"], ids=lambda x: "device_group=" + str(x)) + + +def pytest_collection_modifyitems(config, items): """ pytest_collection_modifyitems is pytest a hook, - which is used to re-order the execution order of the testing script/methods - with various combination of inputs. + which is used to re-order the execution order of the testing script/methods + with various combination of inputs. called after collection has been performed, may filter or re-order the items in-place. - Parameters: + Parameters: items (List[_pytest.nodes.Item]) list of item objects ---------- Ref: https://docs.pytest.org/en/4.6.x/reference.html#collection-hooks """ - run_first = ["test_export","test_compile","test_execute","test_infer"] + run_first = ["test_export", "test_compile", "test_execute", "test_infer"] modules_name = {item.module.__name__ for item in items} cloud_modules = [] non_cloud_modules = [] @@ -156,69 +223,79 @@ def pytest_collection_modifyitems(config,items): cloud_modules.append(module) else: non_cloud_modules.append(module) - - if len(cloud_modules)>1: + + if len(cloud_modules) > 1: modules = {item: item.module.__name__ for item in items} items[:] = sorted(items, key=lambda x: run_first.index(modules[x]) if modules[x] in run_first else len(items)) - + non_cloud_tests = [] for itm in items: if modules[itm] not in cloud_modules: non_cloud_tests.append(itm) - + num_cloud_tests = len(items) - len(non_cloud_tests) - num_cloud_test_cases = num_cloud_tests//len(cloud_modules) + num_cloud_test_cases = num_cloud_tests // len(cloud_modules) final_items = [] - + for i in range(num_cloud_test_cases): for j in range(len(cloud_modules)): - final_items.append(items[i+j*num_cloud_test_cases]) - + final_items.append(items[i + j * num_cloud_test_cases]) + final_items.extend(non_cloud_tests) items[:] = final_items if config.getoption("--all"): return - - first_model = items[0].callspec.params['model_name'] if hasattr(items[0],"callspec") else None - + + first_model = items[0].callspec.params["model_name"] if hasattr(items[0], "callspec") else None + for item in items: - if item.module.__name__ in ["test_export","test_compile","test_execute"]: - if hasattr(item,"callspec"): + if item.module.__name__ in ["test_export", "test_compile", "test_execute"]: + if hasattr(item, "callspec"): params = item.callspec.params - if "model_name" in params and params['model_name'] != first_model: + if "model_name" in params and params["model_name"] != 
first_model: item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - if "prompt_len" in params and params['prompt_len'] == 2: + if "prompt_len" in params and params["prompt_len"] == 2: item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - + if item.module.__name__ in ["test_infer"]: - if hasattr(item,"callspec"): + if hasattr(item, "callspec"): params = item.callspec.params - if "prompt_len" in params and params['prompt_len'] == 2 and "model_name" in params and params['model_name'] != first_model: + if ( + "prompt_len" in params + and params["prompt_len"] == 2 + and "model_name" in params + and params["model_name"] != first_model + ): item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) + def cache_clean_up(): if os.path.exists(Constants.CACHE_DIR): shutil.rmtree(Constants.CACHE_DIR) - logger.info(f'\n.............Cleaned up {Constants.CACHE_DIR}') + logger.info(f"\n.............Cleaned up {Constants.CACHE_DIR}") + def qeff_models_clean_up(): if os.path.exists(QEFF_MODELS_DIR): shutil.rmtree(QEFF_MODELS_DIR) - logger.info(f'\n.............Cleaned up {QEFF_MODELS_DIR}') + logger.info(f"\n.............Cleaned up {QEFF_MODELS_DIR}") + @pytest.fixture def clean_up_after_test(): yield qeff_models_clean_up() - + + def pytest_sessionstart(session): logger.info("PYTEST Session Starting ...") cache_clean_up() qeff_models_clean_up() -def pytest_sessionfinish(session,exitstatus): + +def pytest_sessionfinish(session, exitstatus): cache_clean_up() qeff_models_clean_up() logger.info("...PYTEST Session Ended.") diff --git a/tests/cloud/test_compile.py b/tests/cloud/test_compile.py index a0d6e261..247eb713 100644 --- a/tests/cloud/test_compile.py +++ b/tests/cloud/test_compile.py @@ -21,19 +21,20 @@ def test_compile(setup, mocker): mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 
""" ms = setup - QEfficient.compile(onnx_path=ms.onnx_model_path(), - qpc_path=os.path.dirname(ms.qpc_dir_path()), - num_cores=ms.num_cores, - device_group=ms.device_group, - aic_enable_depth_first=ms.aic_enable_depth_first, - mos=ms.mos, - batch_size=ms.batch_size, - prompt_len=ms.prompt_len, - ctx_len=ms.ctx_len, - mxfp6=ms.mxfp6, - mxint8=ms.mxint8, - ) - + QEfficient.compile( + onnx_path=ms.onnx_model_path(), + qpc_path=os.path.dirname(ms.qpc_dir_path()), + num_cores=ms.num_cores, + device_group=ms.device_group, + aic_enable_depth_first=ms.aic_enable_depth_first, + mos=ms.mos, + batch_size=ms.batch_size, + prompt_len=ms.prompt_len, + ctx_len=ms.ctx_len, + mxfp6=ms.mxfp6, + mxint8=ms.mxint8, + ) + assert os.path.isdir(os.path.join(ms.model_card_dir(), ms.qpc_base_dir_name())) assert os.path.isfile(ms.specialization_json_path()) assert os.path.isfile(ms.custom_io_file_path()) diff --git a/tests/cloud/test_execute.py b/tests/cloud/test_execute.py index 7921a7e1..09f4b7ab 100644 --- a/tests/cloud/test_execute.py +++ b/tests/cloud/test_execute.py @@ -25,19 +25,21 @@ def test_execute(setup, mocker): result = ms.check_batch_size_for_asserion_error() if result["error"] is not None: pytest.skip(f'...Skipping Because batch size is not compatible with the number of prompts: {result["error"]}') - assert result['result'] is not None - load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.execute,"load_hf_tokenizer") - get_compilation_dims_spy = mocker.spy(QEfficient.cloud.execute,"get_compilation_dims") - check_batch_size_and_num_prompts_spy = mocker.spy(QEfficient.cloud.execute,"check_batch_size_and_num_prompts") - cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.execute,"cloud_ai_100_exec_kv") - - execute(model_name=ms.model_name, - qpc_path=ms.qpc_dir_path(), - device_group=ms.device_group, - prompt=ms.prompt, - prompts_txt_file_path=ms.prompts_txt_file_path, - hf_token=ms.hf_token,) - + assert result["result"] is not None + load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.execute, "load_hf_tokenizer") + get_compilation_dims_spy = mocker.spy(QEfficient.cloud.execute, "get_compilation_dims") + check_batch_size_and_num_prompts_spy = mocker.spy(QEfficient.cloud.execute, "check_batch_size_and_num_prompts") + cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.execute, "cloud_ai_100_exec_kv") + + execute( + model_name=ms.model_name, + qpc_path=ms.qpc_dir_path(), + device_group=ms.device_group, + prompt=ms.prompt, + prompts_txt_file_path=ms.prompts_txt_file_path, + hf_token=ms.hf_token, + ) + load_hf_tokenizer_spy.assert_called_once() get_compilation_dims_spy.assert_called_once() assert get_compilation_dims_spy.spy_return == (ms.batch_size, ms.ctx_len) diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py index 3227cb2a..d101f60c 100644 --- a/tests/cloud/test_export.py +++ b/tests/cloud/test_export.py @@ -23,8 +23,8 @@ def test_export(setup, mocker): mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 
""" ms = setup - get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export,"get_onnx_model_path") - export(model_name=ms.model_name,cache_dir=Constants.CACHE_DIR,hf_token=ms.hf_token) + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path") + export(model_name=ms.model_name, cache_dir=Constants.CACHE_DIR, hf_token=ms.hf_token) get_onnx_model_path_spy.assert_called_once() assert os.path.isfile(ms.onnx_model_path()) assert get_onnx_model_path_spy.spy_return == ms.onnx_model_path() diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 163585af..6b229f8f 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -1,4 +1,3 @@ - # ----------------------------------------------------------------------------- # # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. @@ -32,29 +31,29 @@ def test_infer(setup, mocker): result = ms.check_batch_size_for_asserion_error() if result["error"] is not None: pytest.skip(f'...Skipping Because batch size is not compatible with the number of prompts: {result["error"]}') - assert result['result'] is not None - get_qpc_dir_name_infer_spy = mocker.spy(QEfficient.cloud.infer,"get_qpc_dir_name_infer") - check_batch_size_and_num_prompts_spy = mocker.spy(QEfficient.cloud.infer,"check_batch_size_and_num_prompts") - load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer,"load_hf_tokenizer") - qpc_exists_spy = mocker.spy(QEfficient.cloud.infer,"qpc_exists") - get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer,"get_onnx_model_path") - compile_spy = mocker.spy(QEfficient,"compile") - cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer,"cloud_ai_100_exec_kv") + assert result["result"] is not None + get_qpc_dir_name_infer_spy = mocker.spy(QEfficient.cloud.infer, "get_qpc_dir_name_infer") + check_batch_size_and_num_prompts_spy = mocker.spy(QEfficient.cloud.infer, "check_batch_size_and_num_prompts") + load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer") + qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists") + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path") + compile_spy = mocker.spy(QEfficient, "compile") + cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv") infer( - model_name = ms.model_name, - num_cores = ms.num_cores, - prompt = ms.prompt, - prompts_txt_file_path = ms.prompts_txt_file_path, - aic_enable_depth_first = ms.aic_enable_depth_first, - mos = ms.mos, - hf_token = ms.hf_token, - batch_size = ms.batch_size, - prompt_len = ms.prompt_len, - ctx_len = ms.ctx_len, - mxfp6 = ms.mxfp6, - mxint8 = ms.mxint8, - device_group = ms.device_group, - ) + model_name=ms.model_name, + num_cores=ms.num_cores, + prompt=ms.prompt, + prompts_txt_file_path=ms.prompts_txt_file_path, + aic_enable_depth_first=ms.aic_enable_depth_first, + mos=ms.mos, + hf_token=ms.hf_token, + batch_size=ms.batch_size, + prompt_len=ms.prompt_len, + ctx_len=ms.ctx_len, + mxfp6=ms.mxfp6, + mxint8=ms.mxint8, + device_group=ms.device_group, + ) # prompt fucntion check get_qpc_dir_name_infer_spy.assert_called_once() check_batch_size_and_num_prompts_spy.assert_called_once() diff --git a/tests/test_loader.py b/tests/test_loader.py index 5c626361..52b44cf5 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -13,22 +13,22 @@ import QEfficient from QEfficient import QEFFAutoModelForCausalLM, QEFFCommonLoader -model_name_to_params_dict : Dict[str, Dict[str, Any]] = { +model_name_to_params_dict: 
Dict[str, Dict[str, Any]] = { "gpt2": { "qeff_class": QEFFAutoModelForCausalLM, "hf_class": GPT2LMHeadModel, - "prompt": "Equator is" + "prompt": "Equator is", }, - } model_names = model_name_to_params_dict.keys() -#FIXME: Add test cases for passing cache_dir, pretrained_model_path instead of card name, etc., Passing other kwargs + +# FIXME: Add test cases for passing cache_dir, pretrained_model_path instead of card name, etc., Passing other kwargs @pytest.mark.parametrize("model_name", model_names) def test_qeff_auto_model_for_causal_lm(model_name: str): model = QEFFCommonLoader.from_pretrained(model_name) - assert isinstance(model, model_name_to_params_dict[model_name]['qeff_class']) - assert isinstance(model.model, model_name_to_params_dict[model_name]['hf_class']) # type: ignore + assert isinstance(model, model_name_to_params_dict[model_name]["qeff_class"]) + assert isinstance(model.model, model_name_to_params_dict[model_name]["hf_class"]) # type: ignore # Run transform QEfficient.transform(model)
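
Note on the new CI workflow: .github/workflows/lint-format.yml gates pull requests on two ruff commands, `ruff check` and `ruff format --check`, with RUFF_OUTPUT_FORMAT=github for annotation output. A minimal sketch for reproducing the same checks locally, assuming ruff is installed in the active environment (e.g. `pip install ruff`) and the working directory is the repository root:

    import subprocess
    import sys

    # Run the same two checks the lint-format workflow runs; a non-zero exit
    # code mirrors a failing CI job.
    for cmd in (["ruff", "check", "."], ["ruff", "format", "--check", "."]):
        completed = subprocess.run(cmd)
        if completed.returncode != 0:
            sys.exit(completed.returncode)

Running `ruff format .` (without `--check`) applies the formatting instead of only verifying it.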
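
Note on the public API surface: QEfficient/__init__.py now declares an explicit `__all__` instead of `# noqa: F401` suppressions. A usage sketch of that surface, mirroring tests/test_loader.py; "gpt2" is the card exercised there, and other supported HuggingFace causal-LM cards are assumed to load the same way:

    import QEfficient
    from QEfficient import QEFFAutoModelForCausalLM, QEFFCommonLoader

    # QEFFCommonLoader detects the model type and returns the matching QEFF class.
    model = QEFFCommonLoader.from_pretrained("gpt2")
    assert isinstance(model, QEFFAutoModelForCausalLM)

    # Same optimisation pass the loader test applies after loading.
    QEfficient.transform(model)
    print("transformed:", model.is_transformed)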
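
Note on the compile interface: tests/cloud/test_compile.py shows the keyword form of `QEfficient.compile` after reformatting, and the reformatted signature in compile_helper.py returns a path string. A sketch of the same call outside the test fixtures; every concrete value below is a placeholder, not a recommended setting:

    import QEfficient

    # Keyword names follow tests/cloud/test_compile.py; values are placeholders.
    qpc_dir = QEfficient.compile(
        onnx_path="qeff_models/gpt2/onnx/gpt2_kv_clipped_fp16.onnx",  # placeholder ONNX path
        qpc_path="qeff_models/gpt2/compile_dir",                      # parent dir that will hold "qpcs"
        num_cores=14,                                                 # placeholder core count
        device_group=[0],
        aic_enable_depth_first=False,
        mos=1,                                                        # placeholder value
        batch_size=1,
        prompt_len=32,
        ctx_len=128,
        mxfp6=True,
        mxint8=False,
    )
    print("QPC generated under:", qpc_dir)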
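
Note on tokenizer loading: QEfficient/utils/_utils.py lays out `load_hf_tokenizer` as an explicit multi-line signature, and both it and `padding_check_and_fix` are exported from QEfficient.utils. A small sketch of calling them directly; "gpt2" is again a placeholder card:

    from QEfficient.utils import load_hf_tokenizer, padding_check_and_fix

    tokenizer = load_hf_tokenizer(
        pretrained_model_name_or_path="gpt2",
        cache_dir=None,
        hf_token=None,
        padding_side="right",
    )

    # load_hf_tokenizer already runs this check internally; the standalone call
    # just shows the helper that enforces right padding and a usable pad token.
    padding_check_and_fix(tokenizer)
    print(tokenizer.padding_side, tokenizer.pad_token_id)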