Commit 40ba19f

rebase to master branch

openvino-dev-samples committed Mar 26, 2024
1 parent 375819d commit 40ba19f
Showing 5 changed files with 619 additions and 197 deletions.
10 changes: 6 additions & 4 deletions .ci/spellcheck/.pyspelling.wordlist.txt
@@ -7,7 +7,6 @@ activations
 adaptively
 adas
 ADE
-adversarially
 aeroplane
 affective
 ai
@@ -54,6 +53,7 @@ bert
 BERT's
 BetterTransformer
 Bewley
+bge
 BGR
 BMP
 Bicubic
Expand Down Expand Up @@ -209,7 +209,6 @@ enum
et
Evol
EVS
facto
fastcomposer
FastComposer
FastSAM
Expand Down Expand Up @@ -326,7 +325,6 @@ Kalman
kbps
KD
keras
KerasCV
keypoint
keypoints
KiTS
Expand Down Expand Up @@ -379,7 +377,6 @@ LSTM
Luo
LVLM
macOS
Magika
Mahalanobis
Mapillary
Markovian
Expand Down Expand Up @@ -608,12 +605,16 @@ ReLU
René
repo
reproducibility
rerank
reranker
reranking
rescale
rescaling
Rescaling
ResNet
resnet
RetinaFace
RetroMAE
RGB
Riffusion
riffusion
Expand Down Expand Up @@ -835,4 +836,5 @@ ZavyChromaXL
Zongyuan
ZeroScope
zeroscope
zh
xformers
34 changes: 28 additions & 6 deletions notebooks/llm-chatbot/config.py
@@ -291,12 +291,34 @@ def internlm_partial_text_processor(partial_text, new_text):
 }
 
 SUPPORTED_EMBEDDING_MODELS = {
-    "all-mpnet-base-v2": {
-        "model_id": "sentence-transformers/all-mpnet-base-v2",
-        "do_norm": True,
+    "English": {
+        "bge-small-en-v1.5": {
+            "model_id": "BAAI/bge-small-en-v1.5",
+            "do_norm": True,
+        },
+        "bge-large-en-v1.5": {
+            "model_id": "BAAI/bge-large-en-v1.5",
+            "do_norm": True,
+        },
     },
-    "text2vec-large-chinese": {
-        "model_id": "GanymedeNil/text2vec-large-chinese",
-        "do_norm": False,
+    "Chinese": {
+        "bge-small-zh-v1.5": {
+            "model_id": "BAAI/bge-small-zh-v1.5",
+            "do_norm": True,
+        },
+        "bge-large-zh-v1.5": {
+            "model_id": "BAAI/bge-large-zh-v1.5",
+            "do_norm": True,
+        },
     },
 }
+
+SUPPORTED_RERANK_MODELS = {
+    "bge-reranker-large": {
+        "model_id": "BAAI/bge-reranker-large",
+    },
+    "bge-reranker-base": {
+        "model_id": "BAAI/bge-reranker-base",
+    },
+
+}
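
The embedding table is now grouped by language ("English", "Chinese"), with BAAI BGE checkpoints replacing all-mpnet-base-v2 and text2vec-large-chinese, and the new SUPPORTED_RERANK_MODELS table adds BGE rerankers for a retrieve-then-rerank pipeline. As a reading aid, here is a minimal sketch (not part of the commit) of how a notebook might consume these tables. The export_embedding_model helper and the "models" output directory are illustrative assumptions; OVModelForFeatureExtraction.from_pretrained(..., export=True) and save_pretrained are existing optimum-intel APIs.

from pathlib import Path

from optimum.intel.openvino import OVModelForFeatureExtraction
from transformers import AutoTokenizer

from config import SUPPORTED_EMBEDDING_MODELS


def export_embedding_model(language: str, name: str, out_dir: str = "models") -> Path:
    """Hypothetical helper: export the selected BGE checkpoint to OpenVINO IR."""
    model_id = SUPPORTED_EMBEDDING_MODELS[language][name]["model_id"]
    export_path = Path(out_dir) / name
    # export=True converts the PyTorch checkpoint to OpenVINO IR on load.
    ov_model = OVModelForFeatureExtraction.from_pretrained(model_id, export=True)
    ov_model.save_pretrained(export_path)
    # Ship the tokenizer alongside the IR so the wrapper below can load both.
    AutoTokenizer.from_pretrained(model_id).save_pretrained(export_path)
    return export_path


embedding_model_dir = export_embedding_model("English", "bge-small-en-v1.5")

The do_norm flag kept from the old table maps onto the normalize_embeddings option of the rewritten encode method below.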
182 changes: 110 additions & 72 deletions notebooks/llm-chatbot/ov_embedding_model.py
@@ -1,60 +1,53 @@
 from langchain.pydantic_v1 import BaseModel, Extra, Field
 from langchain.schema.embeddings import Embeddings
-from typing import Optional, Union, Dict, Tuple, Any, List
-from sklearn.preprocessing import normalize
+from typing import Union, Dict, Any, List
 from transformers import AutoTokenizer
-from pathlib import Path
-import openvino as ov
+import torch
+from torch import Tensor
+from tqdm.autonotebook import trange
 import numpy as np
+from optimum.intel.openvino import OVModelForFeatureExtraction
+from numpy import ndarray
 
-class OVEmbeddings(BaseModel, Embeddings):
+
+import logging
+logger = logging.getLogger(__name__)
+
+DEFAULT_QUERY_BGE_INSTRUCTION_EN = (
+    "Represent this question for searching relevant passages: "
+)
+DEFAULT_QUERY_BGE_INSTRUCTION_ZH = "为这个句子生成表示以用于检索相关文章:"
+
+
+class OVBgeEmbeddings(BaseModel, Embeddings):
     """
-    LangChain compatible model wrapper for embedding model
+    OpenVINO BGE embedding models.
     """
 
-    model: Any #: :meta private:
-    """LLM Transformers model."""
-    model_kwargs: Optional[dict] = None
-    """OpenVINO model configurations."""
-    tokenizer: Any #: :meta private:
-    """Huggingface tokenizer model."""
-    do_norm: bool
-    """Whether normlizing the output of model"""
-    num_stream: int
-    """Number of stream."""
+    ov_model: Any
+    """OpenVINO model object."""
+    tokenizer: Any
+    """Tokenizer for embedding model."""
+    model_dir: str
+    """Path to store models."""
+    model_kwargs: Dict[str, Any]
+    """Keyword arguments passed to the model."""
+    encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
+    """Keyword arguments to pass when calling the `encode` method of the model."""
+    query_instruction: str = DEFAULT_QUERY_BGE_INSTRUCTION_EN
+    """Instruction to use for embedding query."""
 
-    @classmethod
-    def from_model_id(
-        cls,
-        model_id: str,
-        do_norm: bool,
-        ov_config: Optional[dict],
-        model_kwargs: Optional[dict],
-        **kwargs: Any,
-    ):
-        _model_kwargs = model_kwargs or {}
-        _ov_config = ov_config or {}
-        tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
-        core = ov.Core()
-        model_path = Path(model_id) / "openvino_model.xml"
-        model = core.compile_model(model_path, **_ov_config)
-        num_stream = model.get_property('NUM_STREAMS')
-
-        return cls(
-            model=model,
-            tokenizer=tokenizer,
-            do_norm = do_norm,
-            num_stream=num_stream,
-            **kwargs,
-        )
+    def __init__(self, **kwargs: Any):
+        """Initialize the sentence_transformer."""
+        super().__init__(**kwargs)
 
-    class Config:
-        """Configuration for this pydantic object."""
+        self.ov_model = OVModelForFeatureExtraction.from_pretrained(
+            self.model_dir, **self.model_kwargs)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
 
-        extra = Extra.forbid
+        if "-zh" in self.model_dir:
+            self.query_instruction = DEFAULT_QUERY_BGE_INSTRUCTION_ZH
 
     def _text_length(self, text: Union[List[int], List[List[int]]]):
         """
@@ -74,43 +67,85 @@ def _text_length(self, text: Union[List[int], List[List[int]]]):
             # Sum of length of individual strings
             return sum([len(t) for t in text])
 
-    def encode(self, sentences: Union[str, List[str]]):
+    def encode(
+        self,
+        sentences: Union[str, List[str]],
+        batch_size: int = 4,
+        show_progress_bar: bool = None,
+        convert_to_numpy: bool = True,
+        convert_to_tensor: bool = False,
+        normalize_embeddings: bool = False,
+    ) -> Union[List[Tensor], ndarray, Tensor]:
         """
-        Computes sentence embeddings
-
-        Args:
-            sentences: the sentences to embed
-
-        Returns:
-            By default, a list of tensors is returned.
-        """
+        Computes sentence embeddings.
+        :param sentences: the sentences to embed.
+        :param batch_size: the batch size used for the computation.
+        :param show_progress_bar: Whether to output a progress bar when encoding sentences.
+        :param convert_to_numpy: Whether the output should be a list of numpy vectors. If False, it is a list of PyTorch tensors.
+        :param convert_to_tensor: Whether the output should be one large tensor. Overwrites `convert_to_numpy`.
+        :param normalize_embeddings: Whether to normalize returned vectors to have length 1. In that case,
+            the faster dot-product (util.dot_score) instead of cosine similarity can be used.
+        :return: By default, a 2d numpy array with shape [num_inputs, output_dimension] is returned. If only one string
+            input is provided, then the output is a 1d array with shape [output_dimension]. If `convert_to_tensor`, a
+            torch Tensor is returned instead.
+        """
+
+        if convert_to_tensor:
+            convert_to_numpy = False
+
+        input_was_string = False
+        if isinstance(sentences, str) or not hasattr(
+            sentences, "__len__"
+        ):  # Cast an individual sentence to a list with length 1
+            sentences = [sentences]
+            input_was_string = True
+
         all_embeddings = []
         length_sorted_idx = np.argsort(
             [-self._text_length(sen) for sen in sentences])
         sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
-        nireq = self.num_stream + 1
-        infer_queue = ov.AsyncInferQueue(self.model, nireq)
 
-        def postprocess(request, userdata):
-            embeddings = request.get_output_tensor(0).data
-            embeddings = np.mean(embeddings, axis=1)
-            if self.do_norm:
-                embeddings = normalize(embeddings, 'l2')
-            all_embeddings.extend(embeddings)
-
-        infer_queue.set_callback(postprocess)
-
-        for i, sentence in enumerate(sentences_sorted):
-            inputs = {}
+        for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
+            sentences_batch = sentences_sorted[start_index: start_index + batch_size]
             features = self.tokenizer(
-                sentence, padding=True, truncation=True, return_tensors='np')
-            for key in features:
-                inputs[key] = features[key]
-            infer_queue.start_async(inputs, i)
-        infer_queue.wait_all()
-        all_embeddings = np.asarray(all_embeddings)
+                sentences_batch, padding=True, truncation=True, return_tensors='pt')
+
+            out_features = self.ov_model(**features)
+            embeddings = out_features[0][:, 0]
+            if normalize_embeddings:
+                embeddings = torch.nn.functional.normalize(
+                    embeddings, p=2, dim=1)
+
+            # fixes for #522 and #487 to avoid oom problems on gpu with large datasets
+            if convert_to_numpy:
+                embeddings = embeddings.cpu()
+
+            all_embeddings.extend(embeddings)
+
+        all_embeddings = [all_embeddings[idx]
+                          for idx in np.argsort(length_sorted_idx)]
+
+        if convert_to_tensor:
+            if len(all_embeddings):
+                all_embeddings = torch.stack(all_embeddings)
+            else:
+                all_embeddings = torch.Tensor()
+        elif convert_to_numpy:
+            all_embeddings = np.asarray([emb.numpy()
+                                         for emb in all_embeddings])
+
+        if input_was_string:
+            all_embeddings = all_embeddings[0]
 
         return all_embeddings
 
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
         """Compute doc embeddings using a HuggingFace transformer model.
@@ -120,9 +155,8 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         Returns:
             List of embeddings, one for each text.
         """
-        texts = list(map(lambda x: x.replace("\n", " "), texts))
+        texts = [t.replace("\n", " ") for t in texts]
         embeddings = self.encode(texts, **self.encode_kwargs)
-
         return embeddings.tolist()
 
     def embed_query(self, text: str) -> List[float]:
@@ -134,4 +168,8 @@ def embed_query(self, text: str) -> List[float]:
         Returns:
             Embeddings for the text.
         """
-        return self.embed_documents([text])[0]
+        text = text.replace("\n", " ")
+        embedding = self.encode(
+            self.query_instruction + text, **self.encode_kwargs
+        )
+        return embedding.tolist()
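
The rewritten wrapper drops the manual ov.Core / AsyncInferQueue plumbing and delegates inference to optimum-intel's OVModelForFeatureExtraction. The pooling changes too: out_features[0][:, 0] takes the CLS-token embedding, the standard pooling for BGE models, where the old code mean-pooled over all tokens. embed_query now prepends the BGE query instruction, switching to the Chinese one when "-zh" appears in the model path. Below is a usage sketch, not part of the commit: the model path continues the export example above, and the "device" value is an assumption (model_kwargs is forwarded verbatim to OVModelForFeatureExtraction.from_pretrained).

from ov_embedding_model import OVBgeEmbeddings

embedding = OVBgeEmbeddings(
    model_dir="models/bge-small-en-v1.5",
    model_kwargs={"device": "CPU"},
    encode_kwargs={
        "batch_size": 4,
        "normalize_embeddings": True,  # unit-length vectors: cosine == dot product
    },
)

doc_vectors = embedding.embed_documents(
    ["OpenVINO optimizes deep learning inference.", "BGE is a family of embedding models."]
)
query_vector = embedding.embed_query("What does OpenVINO do?")
print(len(doc_vectors), len(query_vector))  # 2 documents; 384 dims for bge-small

Because encode sorts the inputs by length before batching and restores the original order afterwards, padding waste inside each batch stays small even for mixed-length document sets.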