From e00978d4c09e0048da88c2b3e855fd12d85a504e Mon Sep 17 00:00:00 2001 From: Ilango Rajagopal Date: Wed, 3 Jul 2024 10:17:21 +0530 Subject: [PATCH] Lint & Format (#53) * Lint & Format - Added linting and formatting github actions - Formatted entire codebase - Fixed linter errors - Removed `# noqa` with fix Signed-off-by: Ilango Rajagopal * Split test config into multiple-lines Signed-off-by: Ilango Rajagopal * Fix external repo for workflow Signed-off-by: Ilango Rajagopal * Format newly added files Signed-off-by: Ilango Rajagopal --------- Signed-off-by: Ilango Rajagopal --- .github/workflows/lint-format.yml | 19 ++ QEfficient/__init__.py | 18 +- QEfficient/cloud/export.py | 40 +-- QEfficient/cloud/infer.py | 15 +- QEfficient/compile/compile_helper.py | 2 +- QEfficient/exporter/export_utils.py | 4 +- QEfficient/src/__init__.py | 6 +- QEfficient/src/_transformers/auto.py | 44 ++-- QEfficient/src/base.py | 15 +- QEfficient/src/common.py | 34 ++- QEfficient/transformers/modeling_outputs.py | 5 +- .../transformers/models/gptj/modeling_gptj.py | 5 +- QEfficient/utils/__init__.py | 2 +- QEfficient/utils/_utils.py | 34 ++- QEfficient/utils/constants.py | 2 +- QEfficient/utils/device_utils.py | 11 +- QEfficient/utils/generate_inputs.py | 22 +- QEfficient/utils/logging_utils.py | 4 +- QEfficient/utils/run_utils.py | 5 +- tests/cloud/conftest.py | 229 ++++++++++++------ tests/cloud/test_compile.py | 27 ++- tests/cloud/test_execute.py | 28 ++- tests/cloud/test_export.py | 4 +- tests/cloud/test_infer.py | 45 ++-- tests/test_loader.py | 12 +- 25 files changed, 393 insertions(+), 239 deletions(-) create mode 100644 .github/workflows/lint-format.yml diff --git a/.github/workflows/lint-format.yml b/.github/workflows/lint-format.yml new file mode 100644 index 00000000..309bcbdf --- /dev/null +++ b/.github/workflows/lint-format.yml @@ -0,0 +1,19 @@ +name: Lint & Format +on: [pull_request] +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: pip3 install ruff + - run: ruff check + env: + RUFF_OUTPUT_FORMAT: github + format: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: pip3 install ruff + - run: ruff format --check + env: + RUFF_OUTPUT_FORMAT: github diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 98e19e72..21344305 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,12 +5,22 @@ # # ----------------------------------------------------------------------------- -from QEfficient.compile.compile_helper import compile # noqa: F401 +from QEfficient.compile.compile_helper import compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401 -from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401 -from QEfficient.transformers.transform import transform # noqa: F401 +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader +from QEfficient.transformers.transform import transform # Users can use QEfficient.export for exporting models to ONNX export = qualcomm_efficient_converter __version__ = "0.0.1.dev0" + +__all__ = [ + "transform", + "export", + "compile", + "cloud_ai_100_exec_kv", + "QEffAutoModel", + "QEFFAutoModelForCausalLM", + "QEFFCommonLoader", +] diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 
5f3d07ee..17f5e58e 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -19,7 +19,13 @@ ROOT_DIR = os.path.dirname(os.path.abspath("")) -def get_onnx_model_path(model_name: str, cache_dir: Optional[str] = None, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None, local_model_dir: Optional[str] = None): +def get_onnx_model_path( + model_name: str, + cache_dir: Optional[str] = None, + tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]] = None, + hf_token: Optional[str] = None, + local_model_dir: Optional[str] = None, +): """ exports the model to onnx if pre-exported file is not found and returns onnx_model_path """ @@ -33,19 +39,21 @@ def get_onnx_model_path(model_name: str, cache_dir: Optional[str] = None, tokeni # Export to the Onnx logger.info(f"Exporting Pytorch {model_name} model to ONNX...") _, generated_onnx_model_path = qualcomm_efficient_converter( - model_name=model_name, - local_model_dir=local_model_dir, - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - kv=True, - form_factor="cloud", - hf_token=hf_token, - cache_dir=cache_dir - ) # type: ignore - logger.info(f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}") + model_name=model_name, + local_model_dir=local_model_dir, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + kv=True, + form_factor="cloud", + hf_token=hf_token, + cache_dir=cache_dir, + ) # type: ignore + logger.info( + f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}" + ) assert ( - generated_onnx_model_path == onnx_model_path - ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}" + generated_onnx_model_path == onnx_model_path + ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}" return onnx_model_path @@ -63,14 +71,16 @@ def main( :hf_token: str. HuggingFace login token to access private repos. :local_model_dir: str. Path to custom model weights and config files. 
""" - cache_dir = check_and_assign_cache_dir(local_model_dir,cache_dir) + cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token, local_model_dir=local_model_dir) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Export script.") parser.add_argument("--model_name", "--model-name", required=True, help="HF Model card name/id") - parser.add_argument("--local-model-dir", "--local_model_dir", required=False, help="Path to custom model weights and config files") + parser.add_argument( + "--local-model-dir", "--local_model_dir", required=False, help="Path to custom model weights and config files" + ) parser.add_argument( "--cache_dir", "--cache-dir", diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index c531a85e..7068f785 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -30,7 +30,7 @@ def main( model_name: str, num_cores: int, - prompt: Optional[str] = None, # type: ignore + prompt: Optional[str] = None, # type: ignore local_model_dir: Optional[str] = None, prompts_txt_file_path: Optional[str] = None, aic_enable_depth_first: bool = False, @@ -51,9 +51,14 @@ def main( num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group ) prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) - cache_dir = check_and_assign_cache_dir(local_model_dir,cache_dir) + cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) - tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), cache_dir=cache_dir, hf_token=hf_token, local_model_dir=local_model_dir) + tokenizer = load_hf_tokenizer( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + local_model_dir=local_model_dir, + ) qpc_path_exists, qpc_dir_path = qpc_exists(model_name, qpc_base_dir_name) # Handle qpc generation @@ -104,7 +109,9 @@ def main( description="Inference command, the model will be downloaded from HF, optmized, compiled, executed on Cloud AI 100" ) parser.add_argument("--model-name", "--model_name", required=True, help="HF Model card name/id") - parser.add_argument("--local-model-dir", "--local_model_dir", required=False, help="Path to custom model weights and config files") + parser.add_argument( + "--local-model-dir", "--local_model_dir", required=False, help="Path to custom model weights and config files" + ) parser.add_argument( "--cache-dir", "--cache_dir", diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py index 8b5272e8..f2d412fb 100644 --- a/QEfficient/compile/compile_helper.py +++ b/QEfficient/compile/compile_helper.py @@ -109,7 +109,7 @@ def compile( ctx_len: int = 128, mxfp6: bool = True, mxint8: bool = False, - **kwargs + **kwargs, ) -> str: # Dynamically create the specializations JSON """ diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py index 43f8bb81..b6eeddf6 100644 --- a/QEfficient/exporter/export_utils.py +++ b/QEfficient/exporter/export_utils.py @@ -169,7 +169,7 @@ def fix_onnx_fp16( ort_outputs: List[np.ndarray], gen_models_path: str, model_base_name: str, - pt_outputs: Dict[str, torch.Tensor] + pt_outputs: Dict[str, torch.Tensor], ) -> str: finfo = np.finfo(np.float16) fp16_max = finfo.max @@ -218,7 +218,7 @@ def fix_onnx_fp16( os.path.join(gen_models_path, f"{model_base_name}.onnx"), 
os.path.join(gen_models_path, f"{model_base_name}.onnxweights.data"), ) - + model_base_name += "_clipped_fp16" onnx.save_model( model, diff --git a/QEfficient/src/__init__.py b/QEfficient/src/__init__.py index 85468656..e4a46c4c 100644 --- a/QEfficient/src/__init__.py +++ b/QEfficient/src/__init__.py @@ -5,5 +5,7 @@ # # ----------------------------------------------------------------------------- -from QEfficient.src._transformers.auto import QEffAutoModel, QEFFAutoModelForCausalLM # noqa: F401 -from QEfficient.src.common import QEFFCommonLoader # noqa: F401 +from QEfficient.src._transformers.auto import QEffAutoModel, QEFFAutoModelForCausalLM +from QEfficient.src.common import QEFFCommonLoader + +__all__ = ["QEffAutoModel", "QEFFAutoModelForCausalLM", "QEFFCommonLoader"] diff --git a/QEfficient/src/_transformers/auto.py b/QEfficient/src/_transformers/auto.py index cb71eda2..6a2d824b 100644 --- a/QEfficient/src/_transformers/auto.py +++ b/QEfficient/src/_transformers/auto.py @@ -26,17 +26,21 @@ class QEFFTransformersBase(QEFFBaseModel): """ Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from src/transformers/models/auto/modeling_auto.py file. """ - def __init__(self, model: nn.Module, transform:bool = True) -> None: - assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or - # FIXME: Use model architectures here instead of complete dictionary TransformersToQEffModulesDict - model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore + + def __init__(self, model: nn.Module, transform: bool = True) -> None: + assert ( + model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() + or + # FIXME: Use model architectures here instead of complete dictionary TransformersToQEffModulesDict + model.__class__ in TransformersToQEffModulesDict.values() + ), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore self.model: nn.Module = model if transform: self.transform() def __repr__(self) -> str: return self.model.__repr__() - + @property def is_transformed(self) -> bool: return getattr(self.model, "qeff_transformed", False) @@ -49,19 +53,22 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): :param transform:bool. Whether to optimize model for KV retention; default is True. Pass False to get BertStyle model. 
""" transform: bool = kwargs.get("transform", True) - kwargs.update({"use_cache": True}) # Always pass use_cache = True, to get KV values as output during ONNX export - kwargs.update({"attn_implementation" : "eager"}) # Always use eager mode for attention implementation - - model = QEFFAutoModelToTransformersAutoModelMap[cls.__name__].from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + kwargs.update( + {"use_cache": True} + ) # Always pass use_cache = True, to get KV values as output during ONNX export + kwargs.update({"attn_implementation": "eager"}) # Always use eager mode for attention implementation + + model = QEFFAutoModelToTransformersAutoModelMap[cls.__name__].from_pretrained( + pretrained_model_name_or_path, *args, **kwargs + ) return cls(model, transform=transform) - def transform_export(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") - + def transform_export_compile(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") - + def transform(self): # FIXME: break down transform into optmization passes i.e. HW specific optimization(RMSNorm), KV retention pass etc. QEfficient.transform(self) @@ -72,22 +79,23 @@ class QEFFAutoModelForCausalLM(QEFFTransformersBase): """ QEFF class for manipulating any causal language model from HuggingFace hub. """ - def execute(self, *args, **kwargs): # type: ignore + + def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") - + def export(self): raise NotImplementedError("Reached too far!!") - + def compile(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") class QEffAutoModel(QEFFTransformersBase): - def execute(self, *args, **kwargs): # type: ignore + def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") - + def export(self): raise NotImplementedError("Reached too far!!") - + def compile(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") diff --git a/QEfficient/src/base.py b/QEfficient/src/base.py index ddc23fc8..6a441cce 100644 --- a/QEfficient/src/base.py +++ b/QEfficient/src/base.py @@ -18,17 +18,17 @@ QEFFBaseModel ________________________________________________|________________________________________________________________ - | | + | | QEFFTransformersBase QEFFDiffusersBase | | ____________|________________________________________________________ ________________ _________________|______________ - _____ | | | | | | + _____ | | | | | | | QEFFAutoModel QEFFAutoModelForCausalLM QEFFAWQModelForCausalLM ... ... ... -QEFFCommonLoader -| [Provides way to [Provides way to do 1-5 on [Supports 1-5 for +QEFFCommonLoader -| [Provides way to [Provides way to do 1-5 on [Supports 1-5 for [Provides | do steps 1-5 on transformers.AutoModelForCausalLM] AWQ Models] interface to |_____ transformers.AutoModel] -Load any of -These models +Load any of +These models by automatically detecting the type of the model] @@ -42,7 +42,7 @@ from typing import Any -#Defining placeholder ENUM for execute function +# Defining placeholder ENUM for execute function class Runtime(Enum): CPU_ORT = "CPU ONNX Runtime" CPU_PT = "CPU PyTorch Runtime" @@ -56,6 +56,7 @@ class QEFFBaseModel(ABC): All the child classes must provide way to load, transform(optimize), exoprt to ONNX etc. capabilities. 
""" + def __init__(self) -> None: super().__init__() # Users can call generate or execute @@ -96,4 +97,4 @@ def export(self, *args, **kwargs) -> Any: @abstractmethod def compile(self, *args, **kwargs) -> Any: - pass \ No newline at end of file + pass diff --git a/QEfficient/src/common.py b/QEfficient/src/common.py index bca39109..de890ae3 100644 --- a/QEfficient/src/common.py +++ b/QEfficient/src/common.py @@ -6,11 +6,12 @@ # ----------------------------------------------------------------------------- """ -MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP dictionary defines the mapping between names of the varities of Transformer model defined in +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP dictionary defines the mapping between names of the varities of Transformer model defined in QEFF_MODEL_TYPE and the classes that implement the methods i.e.(compile, export etc.) for those types. QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. """ + import os from enum import Enum from typing import Any, Dict, Type @@ -27,6 +28,7 @@ class QEFF_MODEL_TYPE(Enum): """ Defines Names of the different varities of transformer models. """ + CAUSALLM = "LLM" DIFFUSION = "STABLE_DIFFUSION" AWQ = "AWQ" @@ -36,17 +38,22 @@ class QEFF_MODEL_TYPE(Enum): QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM } -AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} +AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = { + v: k for k, v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items() +} + def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: """ Loads model config file and returns the type of the model (i.e. LLMs, SD, quantized etc.) as supported by the library. """ - assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" + assert os.path.isdir( + hf_model_path + ), "Pleae pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" config, kwargs = AutoConfig.from_pretrained( - hf_model_path, - return_unused_kwargs=True, - ) + hf_model_path, + return_unused_kwargs=True, + ) if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines @@ -67,20 +74,27 @@ class QEFFCommonLoader: Provides HuggingFace model loading interface same as transformers APIs. Supports loading any model on HuggingFace. """ + def __init__(self, *args: Any, **kwds: Any) -> None: raise EnvironmentError( f"{self.__class__.__name__} is designed to be instantiated " - f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`") - + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`" + ) + @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel: """ Downloads HuggingFace model if already doesn't exist locally, returns QEffAutoModel object based on type of model. 
""" - pretrained_model_name_or_path = pretrained_model_name_or_path if os.path.isdir(pretrained_model_name_or_path) \ + pretrained_model_name_or_path = ( + pretrained_model_name_or_path + if os.path.isdir(pretrained_model_name_or_path) else login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) + ) model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path) qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type] - assert issubclass(qeff_auto_model_class, QEFFBaseModel), f"Expected class that inherits {QEFFBaseModel}, got {type(qeff_auto_model_class)}" + assert issubclass( + qeff_auto_model_class, QEFFBaseModel + ), f"Expected class that inherits {QEFFBaseModel}, got {type(qeff_auto_model_class)}" return qeff_auto_model_class.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) diff --git a/QEfficient/transformers/modeling_outputs.py b/QEfficient/transformers/modeling_outputs.py index 1ade8a3b..36572fe5 100644 --- a/QEfficient/transformers/modeling_outputs.py +++ b/QEfficient/transformers/modeling_outputs.py @@ -179,7 +179,7 @@ class QEffCausalLMOutputWithPast(ModelOutput): hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None attention_mask_RetainedState: Optional[torch.BoolTensor] = None - + @dataclass class QEffMoeModelOutputWithPast(ModelOutput): @@ -222,7 +222,8 @@ class QEffMoeModelOutputWithPast(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor, ...]] = None router_logits: Optional[Tuple[torch.FloatTensor]] = None attention_mask_RetainedState: Optional[torch.BoolTensor] = None - + + @dataclass class QEffMoeCausalLMOutputWithPast(ModelOutput): """ diff --git a/QEfficient/transformers/models/gptj/modeling_gptj.py b/QEfficient/transformers/models/gptj/modeling_gptj.py index 5bd061bb..9fd55416 100644 --- a/QEfficient/transformers/models/gptj/modeling_gptj.py +++ b/QEfficient/transformers/models/gptj/modeling_gptj.py @@ -55,9 +55,6 @@ def _attn( attention_mask=None, head_mask=None, ): - # compute causal mask from causal mask buffer - query_length, key_length = query.size(-2), key.size(-2) - # Keep the attention weights computation in fp32 to avoid overflow issues query = query.to(torch.float32) key = key.to(torch.float32) @@ -110,7 +107,7 @@ def forward( embed_positions = self._get_embed_positions(position_ids) repeated_position_ids = position_ids.unsqueeze(-1).repeat(1, 1, embed_positions.shape[-1]) - repeated_position_ids = torch.where(repeated_position_ids==-1, 0, repeated_position_ids) + repeated_position_ids = torch.where(repeated_position_ids == -1, 0, repeated_position_ids) sincos = torch.gather(embed_positions, 1, repeated_position_ids) sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index c65ff2d1..6eee3928 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- from QEfficient.utils._utils import ( # noqa: F401 + check_and_assign_cache_dir, get_qpc_dir_name_infer, hf_download, load_hf_tokenizer, @@ -13,5 +14,4 @@ onnx_exists, padding_check_and_fix, qpc_exists, - check_and_assign_cache_dir ) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 5db8259c..1708ba44 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -20,7 +20,7 @@ def login_and_download_hf_lm(model_name, *args, **kwargs): 
logger.info(f"loading HuggingFace model for {model_name}") hf_token = kwargs.pop("hf_token", None) - cache_dir = kwargs.pop("cache_dir", None) + cache_dir = kwargs.pop("cache_dir", None) if hf_token is not None: login(hf_token) model_name = hf_download( @@ -119,15 +119,29 @@ def onnx_exists(model_name: str) -> Tuple[bool, str, str]: return onnx_exists_bool, onnx_dir_path, onnx_model_path -def load_hf_tokenizer(pretrained_model_name_or_path: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None, padding_side:str = "right", **kwargs) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: +def load_hf_tokenizer( + pretrained_model_name_or_path: str, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + padding_side: str = "right", + **kwargs, +) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: logger.info("Loading Tokenizer") if hf_token is not None: login(hf_token) # Download tokenizer along with model if it doesn't exist - model_hf_path = pretrained_model_name_or_path if os.path.isdir(pretrained_model_name_or_path) else hf_download(repo_id=pretrained_model_name_or_path, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) - tokenizer = AutoTokenizer.from_pretrained(model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs) + model_hf_path = ( + pretrained_model_name_or_path + if os.path.isdir(pretrained_model_name_or_path) + else hf_download( + repo_id=pretrained_model_name_or_path, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"] + ) + ) + tokenizer = AutoTokenizer.from_pretrained( + model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs + ) padding_check_and_fix(tokenizer) # Check and fix tokenizer viability - + return tokenizer @@ -145,22 +159,24 @@ def get_qpc_dir_name_infer(num_cores, mos, batch_size, prompt_len, ctx_len, mxfp def check_and_assign_cache_dir(local_model_dir, cache_dir): if local_model_dir is not None: if cache_dir is not None: - logger.warning(f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir.") + logger.warning( + f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir." + ) return None return cache_dir if cache_dir else Constants.CACHE_DIR def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]) -> None: """ - Checks and fixes tokenizer paddding side and pad_token_id viability. + Checks and fixes tokenizer paddding side and pad_token_id viability. -------- - + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]. Pass model tokenizer to check and fix. 
""" if tokenizer.padding_side != "right": logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}") tokenizer.padding_side = "right" - + if tokenizer.pad_token_id is None: assert tokenizer.eos_token_id is not None, "Found tokenizer.eos_token_id to be None, expected int" # If Pad token is out of range of vocab size diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 86d28c0e..ed679bbc 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -23,6 +23,6 @@ class Constants: INPUT_STRING = ["My name is"] CACHE_DIR = os.path.join(ROOT_DIR, "cache_dir") - + GB = 2**30 MAX_QPC_LIMIT = 30 diff --git a/QEfficient/utils/device_utils.py b/QEfficient/utils/device_utils.py index 74d7e3fc..8faaf5f1 100644 --- a/QEfficient/utils/device_utils.py +++ b/QEfficient/utils/device_utils.py @@ -5,10 +5,12 @@ # # ----------------------------------------------------------------------------- -import subprocess import math -from QEfficient.utils.logging_utils import logger +import subprocess + from QEfficient.utils.constants import Constants +from QEfficient.utils.logging_utils import logger + def get_available_device_id(): device_id = 0 @@ -29,7 +31,8 @@ def get_available_device_id(): elif "Failed to find requested device ID" in result.stdout: print("Failed to find requested device ID") return None - + + def is_qpc_size_gt_32gb(params: int, mxfp6: bool) -> bool: if mxfp6: qpc_size = math.ceil((params * 1) / Constants.GB) @@ -38,7 +41,7 @@ def is_qpc_size_gt_32gb(params: int, mxfp6: bool) -> bool: logger.warning(f"Approximate QPC size is: {qpc_size} GB") num_devices = math.ceil(qpc_size / Constants.MAX_QPC_LIMIT) - logger.warning(f"Number of Devices required: {num_devices}" ) + logger.warning(f"Number of Devices required: {num_devices}") return qpc_size > Constants.MAX_QPC_LIMIT diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index d09e1be5..b0c35d72 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -20,7 +20,7 @@ def __init__(self, tokenizer, input_str, prompt_len, ctx_len): :param prompt_len: int :param ctx_len: int """ - #check and fix tokenizer viability + # check and fix tokenizer viability padding_check_and_fix(tokenizer) self.tokenizer = tokenizer self.input_str = input_str @@ -105,17 +105,11 @@ def prepare_ort_inputs(self, n_layer, padding_shape): inputs.pop("attention_mask") position_ids = np.arange(input_len).reshape(1, -1) inputs["input_ids"] = np.concatenate( - [ - input_ids, - np.full((batch_size, self.prompt_len - input_len), self.tokenizer.pad_token_id) - ], + [input_ids, np.full((batch_size, self.prompt_len - input_len), self.tokenizer.pad_token_id)], axis=1, ).astype(np.int64) inputs["position_ids"] = np.concatenate( - [ - position_ids, - np.full((batch_size, self.prompt_len - input_len), -1) - ], + [position_ids, np.full((batch_size, self.prompt_len - input_len), -1)], axis=1, ).astype(np.int64) @@ -162,17 +156,11 @@ def prepare_cloud_ai_100_inputs(self, n_layer, padding_shape): inputs.pop("attention_mask") position_ids = np.arange(input_len).reshape(1, -1) inputs["input_ids"] = np.concatenate( - [ - input_ids, - np.full((batch_size, self.prompt_len - input_len), self.tokenizer.pad_token_id) - ], + [input_ids, np.full((batch_size, self.prompt_len - input_len), self.tokenizer.pad_token_id)], axis=1, ).astype(np.int64) inputs["position_ids"] = np.concatenate( - [ - position_ids, - np.full((batch_size, self.prompt_len - input_len), 
-1) - ], + [position_ids, np.full((batch_size, self.prompt_len - input_len), -1)], axis=1, ).astype(np.int64) diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index 044e6e83..8dbfd378 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -18,8 +18,8 @@ class QEffFormatter(logging.Formatter): red: str = "\x1b[31;20m" bold_red: str = "\x1b[31;1m" reset: str = "\x1b[0m" - common_format: str = "%(levelname)s - %(name)s - %(message)s" # type: ignore - format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" # type: ignore + common_format: str = "%(levelname)s - %(name)s - %(message)s" # type: ignore + format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" # type: ignore FORMATS = { logging.DEBUG: cyan + format_with_line_info + reset, diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index 573685bf..46786ae8 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -31,7 +31,7 @@ def __init__(self, tokenizer, prompt, prompt_len, ctx_len): :param prompt_len: int :param ctx_len: int """ - + self.tokenizer = tokenizer self.prompt = prompt self.prompt_len = prompt_len @@ -64,7 +64,6 @@ def run_hf_model_on_pytorch(self, model_hf): print("Completion:", repr(generated_text)) return generated_ids - def run_kv_model_on_pytorch(self, model, n_layer, padding_shape): """ Function responsible for running KV PyTorch model and return the output tokens @@ -140,7 +139,7 @@ def run_kv_model_on_ort(self, model_path, n_layer, padding_shape): np_tensor = onnx.numpy_helper.to_array(node.attribute[0].t) if len(np_tensor.shape) == 0 and np_tensor.item() == 65504: node.attribute[0].t.raw_data = np.array(-1).tobytes() - + onnxruntime_model = model_path[:-5] + "_ort.onnx" onnx.save(m, onnxruntime_model) session = onnxruntime.InferenceSession(onnxruntime_model) diff --git a/tests/cloud/conftest.py b/tests/cloud/conftest.py index 678d0990..6ea5a88b 100644 --- a/tests/cloud/conftest.py +++ b/tests/cloud/conftest.py @@ -18,17 +18,33 @@ def pytest_addoption(parser): - parser.addoption( - "--all", action="store_true",default=False, help="Run all test without skipping any test" - ) + parser.addoption("--all", action="store_true", default=False, help="Run all test without skipping any test") + class ModelSetup: """ - model_setup is a set up class for all the High Level testing script, - which provides all neccessary objects needed for checking the flow and creation + model_setup is a set up class for all the High Level testing script, + which provides all neccessary objects needed for checking the flow and creation of the HL API code. 
""" - def __init__(self,model_name,num_cores,prompt,prompts_txt_file_path,aic_enable_depth_first,mos,cache_dir,hf_token,batch_size,prompt_len,ctx_len,mxfp6,mxint8,device_group): + + def __init__( + self, + model_name, + num_cores, + prompt, + prompts_txt_file_path, + aic_enable_depth_first, + mos, + cache_dir, + hf_token, + batch_size, + prompt_len, + ctx_len, + mxfp6, + mxint8, + device_group, + ): """ Initialization set up ------ @@ -36,21 +52,23 @@ def __init__(self,model_name,num_cores,prompt,prompts_txt_file_path,aic_enable_d param: num_cores: int param: prompt: str param: prompts_txt_file_path: str - param: aic_enable_depth_first: bool + param: aic_enable_depth_first: bool param: mos: int - param: cache_dir: str - param: hf_token: str + param: cache_dir: str + param: hf_token: str param: batch_size: int - param: prompt_len: int - param: ctx_len: int - param: mxfp6: bool + param: prompt_len: int + param: ctx_len: int + param: mxfp6: bool param: mxint8: bool - param: device_group: List[int] + param: device_group: List[int] """ self.model_name = model_name self.num_cores = num_cores self.prompt = prompt - self.prompts_txt_file_path = os.path.join(ROOT_DIR,prompts_txt_file_path) if prompts_txt_file_path is not None else None + self.prompts_txt_file_path = ( + os.path.join(ROOT_DIR, prompts_txt_file_path) if prompts_txt_file_path is not None else None + ) self.aic_enable_depth_first = aic_enable_depth_first self.mos = mos self.cache_dir = cache_dir @@ -64,42 +82,70 @@ def __init__(self,model_name,num_cores,prompt,prompts_txt_file_path,aic_enable_d def model_card_dir(self): return str(os.path.join(QEFF_MODELS_DIR, str(self.model_name))) - + def qpc_base_dir_name(self): - return get_qpc_dir_name_infer(self.num_cores, self.mos, self.batch_size, self.prompt_len, self.ctx_len, self.mxfp6, self.mxint8, self.device_group) - + return get_qpc_dir_name_infer( + self.num_cores, + self.mos, + self.batch_size, + self.prompt_len, + self.ctx_len, + self.mxfp6, + self.mxint8, + self.device_group, + ) + def qpc_dir_path(self): return str(os.path.join(self.model_card_dir(), self.qpc_base_dir_name(), "qpcs")) - + def onnx_dir_path(self): return str(os.path.join(self.model_card_dir(), "onnx")) - + def onnx_model_path(self): return str(os.path.join(self.onnx_dir_path(), self.model_name.replace("/", "_") + "_kv_clipped_fp16.onnx")) - + def model_hf_path(self): - return str(os.path.join(self.cache_dir,self.model_name)) - + return str(os.path.join(self.cache_dir, self.model_name)) + def base_path_and_generated_onnx_path(self): - return str(self.onnx_dir_path()), str(os.path.join(self.onnx_dir_path(), self.model_name.replace("/", "_") + "_kv_clipped_fp16.onnx")) - + return str(self.onnx_dir_path()), str( + os.path.join(self.onnx_dir_path(), self.model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + ) + def specialization_json_path(self): return str(os.path.join(self.model_card_dir(), self.qpc_base_dir_name(), "specializations.json")) - + def custom_io_file_path(self): if self.mxint8: return str(os.path.join(self.onnx_dir_path(), "custom_io_int8.yaml")) else: return str(os.path.join(self.onnx_dir_path(), "custom_io_fp16.yaml")) + def check_batch_size_for_asserion_error(self): try: result = check_batch_size_and_num_prompts(self.prompt, self.prompts_txt_file_path, self.batch_size) - return {"result":result,"error":None} + return {"result": result, "error": None} except AssertionError as e: - return {"result":None,"error":str(e)} + return {"result": None, "error": str(e)} + @pytest.fixture -def 
setup(model_name,num_cores,prompt,prompts_txt_file_path,aic_enable_depth_first,mos,cache_dir,hf_token,batch_size,prompt_len,ctx_len,mxfp6,mxint8,device_group): +def setup( + model_name, + num_cores, + prompt, + prompts_txt_file_path, + aic_enable_depth_first, + mos, + cache_dir, + hf_token, + batch_size, + prompt_len, + ctx_len, + mxfp6, + mxint8, + device_group, +): """ It is a fixture or shared object of all testing script within or inner folder, Args are coming from the dynamically generated tests method i.e, pytest_generate_tests via testing script or method @@ -107,9 +153,25 @@ def setup(model_name,num_cores,prompt,prompts_txt_file_path,aic_enable_depth_fir Args: same as set up initialization Return: model_setup class object """ - yield ModelSetup(model_name,num_cores,prompt,prompts_txt_file_path,bool(aic_enable_depth_first),mos,cache_dir,hf_token,batch_size,prompt_len,ctx_len,bool(mxfp6),bool(mxint8),device_group) + yield ModelSetup( + model_name, + num_cores, + prompt, + prompts_txt_file_path, + bool(aic_enable_depth_first), + mos, + cache_dir, + hf_token, + batch_size, + prompt_len, + ctx_len, + bool(mxfp6), + bool(mxint8), + device_group, + ) + -def pytest_generate_tests(metafunc): +def pytest_generate_tests(metafunc): """ pytest_generate_tests hook is used to create our own input parametrization, It generates all the test cases of different combination of input parameters which are read from the json file, @@ -117,37 +179,42 @@ def pytest_generate_tests(metafunc): ----------- Ref: https://docs.pytest.org/en/7.3.x/how-to/parametrize.html """ - json_file = os.path.join(ROOT_DIR,"tests","cloud","high_level_testing.json") - with open(json_file,'r') as file: - json_data = json.load(file) - - metafunc.parametrize("model_name", json_data['model_name'], ids=lambda x: "model_name=" + str(x)) - metafunc.parametrize("num_cores", json_data['num_cores'],ids=lambda x: "num_cores=" + str(x)) - metafunc.parametrize("prompt",json_data['prompt'],ids=lambda x: "prompt=" + str(x)) - metafunc.parametrize("prompts_txt_file_path",json_data['prompts_txt_file_path'],ids=lambda x: "prompts_txt_file_path=" + str(x)) - metafunc.parametrize("aic_enable_depth_first",json_data['aic_enable_depth_first'],ids=lambda x: "aic_enable_depth_first=" + str(x)) - metafunc.parametrize("mos",json_data['mos'],ids=lambda x: "mos=" + str(x)) - metafunc.parametrize("cache_dir",[Constants.CACHE_DIR],ids=lambda x: "cache_dir=" + str(x)) - metafunc.parametrize("hf_token",json_data['hf_token'],ids=lambda x: "hf_token=" + str(x)) - metafunc.parametrize("batch_size",json_data['batch_size'],ids=lambda x: "batch_size=" + str(x)) - metafunc.parametrize("prompt_len",json_data['prompt_len'],ids=lambda x: "prompt_len=" + str(x)) - metafunc.parametrize("ctx_len",json_data['ctx_len'],ids=lambda x: "ctx_len=" + str(x)) - metafunc.parametrize("mxfp6",json_data['mxfp6'],ids=lambda x: "mxfp6=" + str(x)) - metafunc.parametrize("mxint8",json_data['mxint8'],ids=lambda x: "mxint8=" + str(x)) - metafunc.parametrize("device_group",json_data['device_group'],ids=lambda x: "device_group=" + str(x)) - -def pytest_collection_modifyitems(config,items): + json_file = os.path.join(ROOT_DIR, "tests", "cloud", "high_level_testing.json") + with open(json_file, "r") as file: + json_data = json.load(file) + + metafunc.parametrize("model_name", json_data["model_name"], ids=lambda x: "model_name=" + str(x)) + metafunc.parametrize("num_cores", json_data["num_cores"], ids=lambda x: "num_cores=" + str(x)) + metafunc.parametrize("prompt", json_data["prompt"], 
ids=lambda x: "prompt=" + str(x)) + metafunc.parametrize( + "prompts_txt_file_path", json_data["prompts_txt_file_path"], ids=lambda x: "prompts_txt_file_path=" + str(x) + ) + metafunc.parametrize( + "aic_enable_depth_first", json_data["aic_enable_depth_first"], ids=lambda x: "aic_enable_depth_first=" + str(x) + ) + metafunc.parametrize("mos", json_data["mos"], ids=lambda x: "mos=" + str(x)) + metafunc.parametrize("cache_dir", [Constants.CACHE_DIR], ids=lambda x: "cache_dir=" + str(x)) + metafunc.parametrize("hf_token", json_data["hf_token"], ids=lambda x: "hf_token=" + str(x)) + metafunc.parametrize("batch_size", json_data["batch_size"], ids=lambda x: "batch_size=" + str(x)) + metafunc.parametrize("prompt_len", json_data["prompt_len"], ids=lambda x: "prompt_len=" + str(x)) + metafunc.parametrize("ctx_len", json_data["ctx_len"], ids=lambda x: "ctx_len=" + str(x)) + metafunc.parametrize("mxfp6", json_data["mxfp6"], ids=lambda x: "mxfp6=" + str(x)) + metafunc.parametrize("mxint8", json_data["mxint8"], ids=lambda x: "mxint8=" + str(x)) + metafunc.parametrize("device_group", json_data["device_group"], ids=lambda x: "device_group=" + str(x)) + + +def pytest_collection_modifyitems(config, items): """ pytest_collection_modifyitems is pytest a hook, - which is used to re-order the execution order of the testing script/methods - with various combination of inputs. + which is used to re-order the execution order of the testing script/methods + with various combination of inputs. called after collection has been performed, may filter or re-order the items in-place. - Parameters: + Parameters: items (List[_pytest.nodes.Item]) list of item objects ---------- Ref: https://docs.pytest.org/en/4.6.x/reference.html#collection-hooks """ - run_first = ["test_export","test_compile","test_execute","test_infer"] + run_first = ["test_export", "test_compile", "test_execute", "test_infer"] modules_name = {item.module.__name__ for item in items} cloud_modules = [] non_cloud_modules = [] @@ -156,69 +223,79 @@ def pytest_collection_modifyitems(config,items): cloud_modules.append(module) else: non_cloud_modules.append(module) - - if len(cloud_modules)>1: + + if len(cloud_modules) > 1: modules = {item: item.module.__name__ for item in items} items[:] = sorted(items, key=lambda x: run_first.index(modules[x]) if modules[x] in run_first else len(items)) - + non_cloud_tests = [] for itm in items: if modules[itm] not in cloud_modules: non_cloud_tests.append(itm) - + num_cloud_tests = len(items) - len(non_cloud_tests) - num_cloud_test_cases = num_cloud_tests//len(cloud_modules) + num_cloud_test_cases = num_cloud_tests // len(cloud_modules) final_items = [] - + for i in range(num_cloud_test_cases): for j in range(len(cloud_modules)): - final_items.append(items[i+j*num_cloud_test_cases]) - + final_items.append(items[i + j * num_cloud_test_cases]) + final_items.extend(non_cloud_tests) items[:] = final_items if config.getoption("--all"): return - - first_model = items[0].callspec.params['model_name'] if hasattr(items[0],"callspec") else None - + + first_model = items[0].callspec.params["model_name"] if hasattr(items[0], "callspec") else None + for item in items: - if item.module.__name__ in ["test_export","test_compile","test_execute"]: - if hasattr(item,"callspec"): + if item.module.__name__ in ["test_export", "test_compile", "test_execute"]: + if hasattr(item, "callspec"): params = item.callspec.params - if "model_name" in params and params['model_name'] != first_model: + if "model_name" in params and params["model_name"] != 
first_model: item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - if "prompt_len" in params and params['prompt_len'] == 2: + if "prompt_len" in params and params["prompt_len"] == 2: item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - + if item.module.__name__ in ["test_infer"]: - if hasattr(item,"callspec"): + if hasattr(item, "callspec"): params = item.callspec.params - if "prompt_len" in params and params['prompt_len'] == 2 and "model_name" in params and params['model_name'] != first_model: + if ( + "prompt_len" in params + and params["prompt_len"] == 2 + and "model_name" in params + and params["model_name"] != first_model + ): item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) + def cache_clean_up(): if os.path.exists(Constants.CACHE_DIR): shutil.rmtree(Constants.CACHE_DIR) - logger.info(f'\n.............Cleaned up {Constants.CACHE_DIR}') + logger.info(f"\n.............Cleaned up {Constants.CACHE_DIR}") + def qeff_models_clean_up(): if os.path.exists(QEFF_MODELS_DIR): shutil.rmtree(QEFF_MODELS_DIR) - logger.info(f'\n.............Cleaned up {QEFF_MODELS_DIR}') + logger.info(f"\n.............Cleaned up {QEFF_MODELS_DIR}") + @pytest.fixture def clean_up_after_test(): yield qeff_models_clean_up() - + + def pytest_sessionstart(session): logger.info("PYTEST Session Starting ...") cache_clean_up() qeff_models_clean_up() -def pytest_sessionfinish(session,exitstatus): + +def pytest_sessionfinish(session, exitstatus): cache_clean_up() qeff_models_clean_up() logger.info("...PYTEST Session Ended.") diff --git a/tests/cloud/test_compile.py b/tests/cloud/test_compile.py index a0d6e261..247eb713 100644 --- a/tests/cloud/test_compile.py +++ b/tests/cloud/test_compile.py @@ -21,19 +21,20 @@ def test_compile(setup, mocker): mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 
""" ms = setup - QEfficient.compile(onnx_path=ms.onnx_model_path(), - qpc_path=os.path.dirname(ms.qpc_dir_path()), - num_cores=ms.num_cores, - device_group=ms.device_group, - aic_enable_depth_first=ms.aic_enable_depth_first, - mos=ms.mos, - batch_size=ms.batch_size, - prompt_len=ms.prompt_len, - ctx_len=ms.ctx_len, - mxfp6=ms.mxfp6, - mxint8=ms.mxint8, - ) - + QEfficient.compile( + onnx_path=ms.onnx_model_path(), + qpc_path=os.path.dirname(ms.qpc_dir_path()), + num_cores=ms.num_cores, + device_group=ms.device_group, + aic_enable_depth_first=ms.aic_enable_depth_first, + mos=ms.mos, + batch_size=ms.batch_size, + prompt_len=ms.prompt_len, + ctx_len=ms.ctx_len, + mxfp6=ms.mxfp6, + mxint8=ms.mxint8, + ) + assert os.path.isdir(os.path.join(ms.model_card_dir(), ms.qpc_base_dir_name())) assert os.path.isfile(ms.specialization_json_path()) assert os.path.isfile(ms.custom_io_file_path()) diff --git a/tests/cloud/test_execute.py b/tests/cloud/test_execute.py index 7921a7e1..09f4b7ab 100644 --- a/tests/cloud/test_execute.py +++ b/tests/cloud/test_execute.py @@ -25,19 +25,21 @@ def test_execute(setup, mocker): result = ms.check_batch_size_for_asserion_error() if result["error"] is not None: pytest.skip(f'...Skipping Because batch size is not compatible with the number of prompts: {result["error"]}') - assert result['result'] is not None - load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.execute,"load_hf_tokenizer") - get_compilation_dims_spy = mocker.spy(QEfficient.cloud.execute,"get_compilation_dims") - check_batch_size_and_num_prompts_spy = mocker.spy(QEfficient.cloud.execute,"check_batch_size_and_num_prompts") - cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.execute,"cloud_ai_100_exec_kv") - - execute(model_name=ms.model_name, - qpc_path=ms.qpc_dir_path(), - device_group=ms.device_group, - prompt=ms.prompt, - prompts_txt_file_path=ms.prompts_txt_file_path, - hf_token=ms.hf_token,) - + assert result["result"] is not None + load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.execute, "load_hf_tokenizer") + get_compilation_dims_spy = mocker.spy(QEfficient.cloud.execute, "get_compilation_dims") + check_batch_size_and_num_prompts_spy = mocker.spy(QEfficient.cloud.execute, "check_batch_size_and_num_prompts") + cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.execute, "cloud_ai_100_exec_kv") + + execute( + model_name=ms.model_name, + qpc_path=ms.qpc_dir_path(), + device_group=ms.device_group, + prompt=ms.prompt, + prompts_txt_file_path=ms.prompts_txt_file_path, + hf_token=ms.hf_token, + ) + load_hf_tokenizer_spy.assert_called_once() get_compilation_dims_spy.assert_called_once() assert get_compilation_dims_spy.spy_return == (ms.batch_size, ms.ctx_len) diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py index 3227cb2a..d101f60c 100644 --- a/tests/cloud/test_export.py +++ b/tests/cloud/test_export.py @@ -23,8 +23,8 @@ def test_export(setup, mocker): mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 
""" ms = setup - get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export,"get_onnx_model_path") - export(model_name=ms.model_name,cache_dir=Constants.CACHE_DIR,hf_token=ms.hf_token) + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path") + export(model_name=ms.model_name, cache_dir=Constants.CACHE_DIR, hf_token=ms.hf_token) get_onnx_model_path_spy.assert_called_once() assert os.path.isfile(ms.onnx_model_path()) assert get_onnx_model_path_spy.spy_return == ms.onnx_model_path() diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 163585af..6b229f8f 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -1,4 +1,3 @@ - # ----------------------------------------------------------------------------- # # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. @@ -32,29 +31,29 @@ def test_infer(setup, mocker): result = ms.check_batch_size_for_asserion_error() if result["error"] is not None: pytest.skip(f'...Skipping Because batch size is not compatible with the number of prompts: {result["error"]}') - assert result['result'] is not None - get_qpc_dir_name_infer_spy = mocker.spy(QEfficient.cloud.infer,"get_qpc_dir_name_infer") - check_batch_size_and_num_prompts_spy = mocker.spy(QEfficient.cloud.infer,"check_batch_size_and_num_prompts") - load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer,"load_hf_tokenizer") - qpc_exists_spy = mocker.spy(QEfficient.cloud.infer,"qpc_exists") - get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer,"get_onnx_model_path") - compile_spy = mocker.spy(QEfficient,"compile") - cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer,"cloud_ai_100_exec_kv") + assert result["result"] is not None + get_qpc_dir_name_infer_spy = mocker.spy(QEfficient.cloud.infer, "get_qpc_dir_name_infer") + check_batch_size_and_num_prompts_spy = mocker.spy(QEfficient.cloud.infer, "check_batch_size_and_num_prompts") + load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer") + qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists") + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path") + compile_spy = mocker.spy(QEfficient, "compile") + cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv") infer( - model_name = ms.model_name, - num_cores = ms.num_cores, - prompt = ms.prompt, - prompts_txt_file_path = ms.prompts_txt_file_path, - aic_enable_depth_first = ms.aic_enable_depth_first, - mos = ms.mos, - hf_token = ms.hf_token, - batch_size = ms.batch_size, - prompt_len = ms.prompt_len, - ctx_len = ms.ctx_len, - mxfp6 = ms.mxfp6, - mxint8 = ms.mxint8, - device_group = ms.device_group, - ) + model_name=ms.model_name, + num_cores=ms.num_cores, + prompt=ms.prompt, + prompts_txt_file_path=ms.prompts_txt_file_path, + aic_enable_depth_first=ms.aic_enable_depth_first, + mos=ms.mos, + hf_token=ms.hf_token, + batch_size=ms.batch_size, + prompt_len=ms.prompt_len, + ctx_len=ms.ctx_len, + mxfp6=ms.mxfp6, + mxint8=ms.mxint8, + device_group=ms.device_group, + ) # prompt fucntion check get_qpc_dir_name_infer_spy.assert_called_once() check_batch_size_and_num_prompts_spy.assert_called_once() diff --git a/tests/test_loader.py b/tests/test_loader.py index 5c626361..52b44cf5 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -13,22 +13,22 @@ import QEfficient from QEfficient import QEFFAutoModelForCausalLM, QEFFCommonLoader -model_name_to_params_dict : Dict[str, Dict[str, Any]] = { +model_name_to_params_dict: 
Dict[str, Dict[str, Any]] = { "gpt2": { "qeff_class": QEFFAutoModelForCausalLM, "hf_class": GPT2LMHeadModel, - "prompt": "Equator is" + "prompt": "Equator is", }, - } model_names = model_name_to_params_dict.keys() -#FIXME: Add test cases for passing cache_dir, pretrained_model_path instead of card name, etc., Passing other kwargs + +# FIXME: Add test cases for passing cache_dir, pretrained_model_path instead of card name, etc., Passing other kwargs @pytest.mark.parametrize("model_name", model_names) def test_qeff_auto_model_for_causal_lm(model_name: str): model = QEFFCommonLoader.from_pretrained(model_name) - assert isinstance(model, model_name_to_params_dict[model_name]['qeff_class']) - assert isinstance(model.model, model_name_to_params_dict[model_name]['hf_class']) # type: ignore + assert isinstance(model, model_name_to_params_dict[model_name]["qeff_class"]) + assert isinstance(model.model, model_name_to_params_dict[model_name]["hf_class"]) # type: ignore # Run transform QEfficient.transform(model)
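
Note on the new CI workflow: .github/workflows/lint-format.yml gates pull requests on two ruff commands, `ruff check` and `ruff format --check`, with RUFF_OUTPUT_FORMAT=github for annotation output. A minimal sketch for reproducing the same checks locally, assuming ruff is installed in the active environment (e.g. `pip install ruff`) and the working directory is the repository root:

    import subprocess
    import sys

    # Run the same two checks the lint-format workflow runs; a non-zero exit
    # code mirrors a failing CI job.
    for cmd in (["ruff", "check", "."], ["ruff", "format", "--check", "."]):
        completed = subprocess.run(cmd)
        if completed.returncode != 0:
            sys.exit(completed.returncode)

Running `ruff format .` (without `--check`) applies the formatting instead of only verifying it.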
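
Note on the public API surface: QEfficient/__init__.py now declares an explicit `__all__` instead of `# noqa: F401` suppressions. A usage sketch of that surface, mirroring tests/test_loader.py; "gpt2" is the card exercised there, and other supported HuggingFace causal-LM cards are assumed to load the same way:

    import QEfficient
    from QEfficient import QEFFAutoModelForCausalLM, QEFFCommonLoader

    # QEFFCommonLoader detects the model type and returns the matching QEFF class.
    model = QEFFCommonLoader.from_pretrained("gpt2")
    assert isinstance(model, QEFFAutoModelForCausalLM)

    # Same optimisation pass the loader test applies after loading.
    QEfficient.transform(model)
    print("transformed:", model.is_transformed)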
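
Note on the compile interface: tests/cloud/test_compile.py shows the keyword form of `QEfficient.compile` after reformatting, and the reformatted signature in compile_helper.py returns a path string. A sketch of the same call outside the test fixtures; every concrete value below is a placeholder, not a recommended setting:

    import QEfficient

    # Keyword names follow tests/cloud/test_compile.py; values are placeholders.
    qpc_dir = QEfficient.compile(
        onnx_path="qeff_models/gpt2/onnx/gpt2_kv_clipped_fp16.onnx",  # placeholder ONNX path
        qpc_path="qeff_models/gpt2/compile_dir",                      # parent dir that will hold "qpcs"
        num_cores=14,                                                 # placeholder core count
        device_group=[0],
        aic_enable_depth_first=False,
        mos=1,                                                        # placeholder value
        batch_size=1,
        prompt_len=32,
        ctx_len=128,
        mxfp6=True,
        mxint8=False,
    )
    print("QPC generated under:", qpc_dir)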
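
Note on tokenizer loading: QEfficient/utils/_utils.py lays out `load_hf_tokenizer` as an explicit multi-line signature, and both it and `padding_check_and_fix` are exported from QEfficient.utils. A small sketch of calling them directly; "gpt2" is again a placeholder card:

    from QEfficient.utils import load_hf_tokenizer, padding_check_and_fix

    tokenizer = load_hf_tokenizer(
        pretrained_model_name_or_path="gpt2",
        cache_dir=None,
        hf_token=None,
        padding_side="right",
    )

    # load_hf_tokenizer already runs this check internally; the standalone call
    # just shows the helper that enforces right padding and a usable pad token.
    padding_check_and_fix(tokenizer)
    print(tokenizer.padding_side, tokenizer.pad_token_id)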