Skip to content

Commit

Permalink
add compile_only and min transformers version
Browse files Browse the repository at this point in the history
  • Loading branch information
eaidova committed Oct 2, 2024
1 parent 667dd3f commit a1c1737
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 24 deletions.
3 changes: 2 additions & 1 deletion optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1278,6 +1278,7 @@ class LlavaOpenVINOConfig(OnnxConfig):
SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaConfigBehavior]
NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
MIN_TRANSFORMERS_VERSION = version.parse("4.37.2")

def __init__(
self,
Expand Down Expand Up @@ -1426,7 +1427,7 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict:

@register_in_tasks_manager("llava-next", *["image-text-to-text"], library_name="transformers")
class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig):
    """
    OpenVINO export configuration registered for the `llava-next` model type.

    Everything is inherited from `LlavaOpenVINOConfig`; the only override is the
    minimum transformers version required for export.
    """

    # Overrides the value inherited from LlavaOpenVINOConfig.
    MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")


class InternVLChatConfigBehavior(str, enum.Enum):
Expand Down
3 changes: 3 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2672,6 +2672,9 @@ def __exit__(self, exc_type, exc_value, traceback):


def llava_vision_embed_forward(self, pixel_values):
# copied from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L428-L441
# these changes do not alter the original behavior; they only pack the inference of the model subcomponents together,
# which lets us avoid memory overhead and the code-level handling of their intermediate inference results
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
# this is not memory efficient at all: (output_hidden_states=True) will save all the hidden states.
selected_image_feature = image_outputs.hidden_states[self.config.vision_feature_layer]
Expand Down
121 changes: 98 additions & 23 deletions optimum/intel/openvino/modeling_visual_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ def __init__(
self.text_emb_model = text_embeds_model
self.request = None
self.text_emb_request = None
compile_only = kwargs.get("compile_only", False)
if compile_only:
self.text_emb_request = self.text_emb_model
self.request = self.model.create_infer_request()

super().__init__(
model, config, device, dynamic_shapes, ov_config, model_save_dir, quantization_config, **kwargs
Expand All @@ -60,13 +64,23 @@ def _compile_text_emb(self):
self.text_emb_request = core.compile_model(self.text_emb_model, self._device, self.ov_config)

def to(self, device: str):
    """
    Select the device this model part should run on.

    Args:
        device (`str`):
            Device name; it is stored upper-cased in `self._device`. Values that
            are not strings are ignored and leave the current device untouched.

    Returns:
        The instance itself, so calls can be chained.

    Raises:
        ValueError: If the model was created in `compile_only` mode.
    """
    if self._compile_only:
        raise ValueError(
            "`to()` is not supported with `compile_only` mode, please initialize model without this option"
        )

    if isinstance(device, str):
        self._device = device.upper()
        # Requests created for the previous device are cleared after the switch.
        self.clear_requests()

    return self

def clear_requests(self):
if self._compile_only:
raise ValueError(
"`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option"
)

del self.request
del self.text_emb_request
self.request = None
Expand Down Expand Up @@ -226,6 +240,7 @@ def __init__(
self.vision_embeddings_model = vision_embeddings
self._supports_cache_class = False
self.main_input_name = "input_ids"
self._compile_only = kwargs.get("compile_only", False)

for part in self.additional_parts:
setattr(self, f"{part}_model", kwargs.get(part))
Expand All @@ -244,7 +259,8 @@ def __init__(
ov_config=ov_config,
model_save_dir=model_save_dir,
quantization_config=quantization_config,
compile=False,
compile=not self._compile_only,
compile_only=self._compile_only,
)
self.vision_embeddings = OVVisionEmbedding(self.vision_embeddings_model, self)
for part in self.additional_parts:
Expand All @@ -253,7 +269,7 @@ def __init__(
model_part = MODEL_PARTS_CLS_MAPPING[part](model_part, self)
setattr(self, part, model_part)

if enable_compilation:
if enable_compilation and not self._compile_only:
self.compile()

# Avoid warnings when creating a transformers pipeline
Expand Down Expand Up @@ -370,25 +386,51 @@ def _from_pretrained(
model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]

quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
compile_only = kwargs.get("compile_only", False)

# Load model from a local directory
if os.path.isdir(model_id):
language_model = model_cls.load_model(
os.path.join(model_id, language_model_file_name), quantization_config
)
text_embeddings = model_cls.load_model(
os.path.join(model_id, text_embeddings_file_name), quantization_config
)
vision_embeddings = model_cls.load_model(
os.path.join(model_id, vision_embeddings_file_name), quantization_config
)

for part in model_cls.additional_parts:
part_file_name = f"openvino_{part}_model.xml"
part_model = model_cls.load_model(os.path.join(model_id, part_file_name), quantization_config)
kwargs[part] = part_model

model_save_dir = Path(model_id)
if not compile_only:
language_model = model_cls.load_model(
os.path.join(model_id, language_model_file_name), quantization_config
)
text_embeddings = model_cls.load_model(
os.path.join(model_id, text_embeddings_file_name), quantization_config
)
vision_embeddings = model_cls.load_model(
os.path.join(model_id, vision_embeddings_file_name), quantization_config
)

for part in model_cls.additional_parts:
part_file_name = f"openvino_{part}_model.xml"
part_model = model_cls.load_model(os.path.join(model_id, part_file_name), quantization_config)
kwargs[part] = part_model
else:
language_model = model_cls._compile_model(
os.path.join(model_id, language_model_file_name),
kwargs.get("device", "CPU"),
kwargs.get("ov_config"),
model_save_dir,
)
text_embeddings = model_cls._compile_model(
os.path.join(model_id, text_embeddings_file_name),
kwargs.get("device", "CPU"),
kwargs.get("ov_config"),
model_save_dir,
)
vision_embeddings = model_cls._compile_model(
os.path.join(model_id, vision_embeddings_file_name),
kwargs.get("device", "CPU"),
kwargs.get("ov_config"),
model_save_dir,
)
for part in model_cls.additional_parts:
part_file_name = f"openvino_{part}_model.xml"
part_model = model_cls._compile_model(
os.path.join(model_id, part_file_name), kwargs.get("device", "CPU"), kwargs.get("ov_config")
)
kwargs[part] = part_model

# Load model from hub
else:
Expand All @@ -414,12 +456,38 @@ def _from_pretrained(
file_names[name] = model_cache_path

model_save_dir = Path(model_cache_path).parent
language_model = model_cls.load_model(file_names["language_model"], quantization_config)
text_embeddings = model_cls.load_model(file_names["text_embeddings"], quantization_config)
vision_embeddings = model_cls.load_model(file_names["vision_emnbeddings"], quantization_config)
for part in model_cls.additional_parts:
kwargs[part] = model_cls.load_model(file_names[part], quantization_config)

if not compile_only:
language_model = model_cls.load_model(file_names["language_model"], quantization_config)
text_embeddings = model_cls.load_model(file_names["text_embeddings"], quantization_config)
vision_embeddings = model_cls.load_model(file_names["vision_emnbeddings"], quantization_config)
for part in model_cls.additional_parts:
kwargs[part] = model_cls.load_model(file_names[part], quantization_config)
else:
language_model = model_cls._compile_model(
file_names["language_model"],
kwargs.get("device", "CPU"),
kwargs.get("ov_config"),
model_save_dir,
)
text_embeddings = model_cls._compile_model(
file_names["text_embeddings"],
kwargs.get("device", "CPU"),
kwargs.get("ov_config"),
model_save_dir,
)
vision_embeddings = model_cls._compile_model(
file_names["vision_embeddings"],
kwargs.get("device", "CPU"),
kwargs.get("ov_config"),
model_save_dir,
)
for part in model_cls.additional_parts:
kwargs[part] = model_cls._compile_model(
file_names[part],
kwargs.get("device", "CPU"),
kwargs.get("ov_config"),
model_save_dir,
)
try:
generation_config = GenerationConfig.from_pretrained(
model_id,
Expand Down Expand Up @@ -471,6 +539,13 @@ def _from_transformers(
raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
token = use_auth_token

compile_only = kwargs.pop("compile_only", False)
if compile_only:
logger.warning(
"`compile_only` mode will be disabled because it does not support model export."
"Please provide openvino model obtained using optimum-cli or saved on disk using `save_pretrained`"
)
compile_only = False
save_dir = TemporaryDirectory()
save_dir_path = Path(save_dir.name)

Expand Down

0 comments on commit a1c1737

Please sign in to comment.