add compile_only and min transformers version

huggingface · Oct 2, 2024 · a1c1737 · a1c1737
1 parent 667dd3f
commit a1c1737
Show file tree

Hide file tree

Showing 3 changed files with 103 additions and 24 deletions.
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
@@ -1278,6 +1278,7 @@ class LlavaOpenVINOConfig(OnnxConfig):
     SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaConfigBehavior]
     NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
+    MIN_TRANSFORMERS_VERSION = version.parse("4.37.2")
 
     def __init__(
         self,
@@ -1426,7 +1427,7 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict:
 
 @register_in_tasks_manager("llava-next", *["image-text-to-text"], library_name="transformers")
 class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig):
-    pass
+    MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
 
 
 class InternVLChatConfigBehavior(str, enum.Enum):

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
@@ -2672,6 +2672,9 @@ def __exit__(self, exc_type, exc_value, traceback):
 
 
 def llava_vision_embed_forward(self, pixel_values):
+    # copied from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L428-L441
+    # these changes do not alter behavior relative to the original; they only pack the inference of the model subcomponents together,
+    # which lets us avoid memory overhead and the handling of intermediate inference results at the code level
     image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
     # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states.
     selected_image_feature = image_outputs.hidden_states[self.config.vision_feature_layer]

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
@@ -43,6 +43,10 @@ def __init__(
         self.text_emb_model = text_embeds_model
         self.request = None
         self.text_emb_request = None
+        compile_only = kwargs.get("compile_only", False)
+        if compile_only:
+            self.text_emb_request = self.text_emb_model
+            self.request = self.model.create_infer_request()
 
         super().__init__(
             model, config, device, dynamic_shapes, ov_config, model_save_dir, quantization_config, **kwargs
@@ -60,13 +64,23 @@ def _compile_text_emb(self):
             self.text_emb_request = core.compile_model(self.text_emb_model, self._device, self.ov_config)
 
     def to(self, device: str):
+        if self._compile_only:
+            raise ValueError(
+                "`to()` is not supported with `compile_only` mode, please intialize model without this option"
+            )
+
         if isinstance(device, str):
             self._device = device.upper()
             self.clear_requests()
 
         return self
 
     def clear_requests(self):
+        if self._compile_only:
+            raise ValueError(
+                "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option"
+            )
+
         del self.request
         del self.text_emb_request
         self.request = None
@@ -226,6 +240,7 @@ def __init__(
         self.vision_embeddings_model = vision_embeddings
         self._supports_cache_class = False
         self.main_input_name = "input_ids"
+        self._compile_only = kwargs.get("compile_only", False)
 
         for part in self.additional_parts:
             setattr(self, f"{part}_model", kwargs.get(part))
@@ -244,7 +259,8 @@ def __init__(
             ov_config=ov_config,
             model_save_dir=model_save_dir,
             quantization_config=quantization_config,
-            compile=False,
+            compile=not self._compile_only,
+            compile_only=self._compile_only,
         )
         self.vision_embeddings = OVVisionEmbedding(self.vision_embeddings_model, self)
         for part in self.additional_parts:
@@ -253,7 +269,7 @@ def __init__(
                 model_part = MODEL_PARTS_CLS_MAPPING[part](model_part, self)
             setattr(self, part, model_part)
 
-        if enable_compilation:
+        if enable_compilation and not self._compile_only:
             self.compile()
 
         # Avoid warnings when creating a transformers pipeline
@@ -370,25 +386,51 @@ def _from_pretrained(
         model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
 
         quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
+        compile_only = kwargs.get("compile_only", False)
 
         # Load model from a local directory
         if os.path.isdir(model_id):
-            language_model = model_cls.load_model(
-                os.path.join(model_id, language_model_file_name), quantization_config
-            )
-            text_embeddings = model_cls.load_model(
-                os.path.join(model_id, text_embeddings_file_name), quantization_config
-            )
-            vision_embeddings = model_cls.load_model(
-                os.path.join(model_id, vision_embeddings_file_name), quantization_config
-            )
-
-            for part in model_cls.additional_parts:
-                part_file_name = f"openvino_{part}_model.xml"
-                part_model = model_cls.load_model(os.path.join(model_id, part_file_name), quantization_config)
-                kwargs[part] = part_model
-
             model_save_dir = Path(model_id)
+            if not compile_only:
+                language_model = model_cls.load_model(
+                    os.path.join(model_id, language_model_file_name), quantization_config
+                )
+                text_embeddings = model_cls.load_model(
+                    os.path.join(model_id, text_embeddings_file_name), quantization_config
+                )
+                vision_embeddings = model_cls.load_model(
+                    os.path.join(model_id, vision_embeddings_file_name), quantization_config
+                )
+
+                for part in model_cls.additional_parts:
+                    part_file_name = f"openvino_{part}_model.xml"
+                    part_model = model_cls.load_model(os.path.join(model_id, part_file_name), quantization_config)
+                    kwargs[part] = part_model
+            else:
+                language_model = model_cls._compile_model(
+                    os.path.join(model_id, language_model_file_name),
+                    kwargs.get("device", "CPU"),
+                    kwargs.get("ov_config"),
+                    model_save_dir,
+                )
+                text_embeddings = model_cls._compile_model(
+                    os.path.join(model_id, text_embeddings_file_name),
+                    kwargs.get("device", "CPU"),
+                    kwargs.get("ov_config"),
+                    model_save_dir,
+                )
+                vision_embeddings = model_cls._compile_model(
+                    os.path.join(model_id, vision_embeddings_file_name),
+                    kwargs.get("device", "CPU"),
+                    kwargs.get("ov_config"),
+                    model_save_dir,
+                )
+                for part in model_cls.additional_parts:
+                    part_file_name = f"openvino_{part}_model.xml"
+                    part_model = model_cls._compile_model(
+                        os.path.join(model_id, part_file_name), kwargs.get("device", "CPU"), kwargs.get("ov_config")
+                    )
+                    kwargs[part] = part_model
 
         # Load model from hub
         else:
@@ -414,12 +456,38 @@ def _from_pretrained(
                 file_names[name] = model_cache_path
 
             model_save_dir = Path(model_cache_path).parent
-            language_model = model_cls.load_model(file_names["language_model"], quantization_config)
-            text_embeddings = model_cls.load_model(file_names["text_embeddings"], quantization_config)
-            vision_embeddings = model_cls.load_model(file_names["vision_emnbeddings"], quantization_config)
-            for part in model_cls.additional_parts:
-                kwargs[part] = model_cls.load_model(file_names[part], quantization_config)
-
+            if not compile_only:
+                language_model = model_cls.load_model(file_names["language_model"], quantization_config)
+                text_embeddings = model_cls.load_model(file_names["text_embeddings"], quantization_config)
+                vision_embeddings = model_cls.load_model(file_names["vision_emnbeddings"], quantization_config)
+                for part in model_cls.additional_parts:
+                    kwargs[part] = model_cls.load_model(file_names[part], quantization_config)
+            else:
+                language_model = model_cls._compile_model(
+                    file_names["language_model"],
+                    kwargs.get("device", "CPU"),
+                    kwargs.get("ov_config"),
+                    model_save_dir,
+                )
+                text_embeddings = model_cls._compile_model(
+                    file_names["text_embeddings"],
+                    kwargs.get("device", "CPU"),
+                    kwargs.get("ov_config"),
+                    model_save_dir,
+                )
+                vision_embeddings = model_cls._compile_model(
+                    file_names["vision_embeddings"],
+                    kwargs.get("device", "CPU"),
+                    kwargs.get("ov_config"),
+                    model_save_dir,
+                )
+                for part in model_cls.additional_parts:
+                    kwargs[part] = model_cls._compile_model(
+                        file_names[part],
+                        kwargs.get("device", "CPU"),
+                        kwargs.get("ov_config"),
+                        model_save_dir,
+                    )
         try:
             generation_config = GenerationConfig.from_pretrained(
                 model_id,
@@ -471,6 +539,13 @@ def _from_transformers(
                 raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
             token = use_auth_token
 
+        compile_only = kwargs.pop("compile_only", False)
+        if compile_only:
+            logger.warning(
+                "`compile_only` mode will be disabled because it does not support model export."
+                "Please provide openvino model obtained using optimum-cli or saved on disk using `save_pretrained`"
+            )
+            compile_only = False
         save_dir = TemporaryDirectory()
         save_dir_path = Path(save_dir.name)