Commit 40ba19f

rebase to master branch

openvino-dev-samples committed Mar 26, 2024
1 parent 375819d commit 40ba19f
Showing 5 changed files with 619 additions and 197 deletions.
10 changes: 6 additions & 4 deletions .ci/spellcheck/.pyspelling.wordlist.txt
@@ -7,7 +7,6 @@ activations
 adaptively
 adas
 ADE
-adversarially
 aeroplane
 affective
 ai
@@ -54,6 +53,7 @@ bert
 BERT's
 BetterTransformer
 Bewley
+bge
 BGR
 BMP
 Bicubic
Expand Down Expand Up @@ -209,7 +209,6 @@ enum
et
Evol
EVS
facto
fastcomposer
FastComposer
FastSAM
Expand Down Expand Up @@ -326,7 +325,6 @@ Kalman
kbps
KD
keras
KerasCV
keypoint
keypoints
KiTS
Expand Down Expand Up @@ -379,7 +377,6 @@ LSTM
Luo
LVLM
macOS
Magika
Mahalanobis
Mapillary
Markovian
Expand Down Expand Up @@ -608,12 +605,16 @@ ReLU
René
repo
reproducibility
rerank
reranker
reranking
rescale
rescaling
Rescaling
ResNet
resnet
RetinaFace
RetroMAE
RGB
Riffusion
riffusion
Expand Down Expand Up @@ -835,4 +836,5 @@ ZavyChromaXL
Zongyuan
ZeroScope
zeroscope
zh
xformers
34 changes: 28 additions & 6 deletions notebooks/llm-chatbot/config.py
@@ -291,12 +291,34 @@ def internlm_partial_text_processor(partial_text, new_text):
 }
 
 SUPPORTED_EMBEDDING_MODELS = {
-    "all-mpnet-base-v2": {
-        "model_id": "sentence-transformers/all-mpnet-base-v2",
-        "do_norm": True,
+    "English": {
+        "bge-small-en-v1.5": {
+            "model_id": "BAAI/bge-small-en-v1.5",
+            "do_norm": True,
+        },
+        "bge-large-en-v1.5": {
+            "model_id": "BAAI/bge-large-en-v1.5",
+            "do_norm": True,
+        },
     },
-    "text2vec-large-chinese": {
-        "model_id": "GanymedeNil/text2vec-large-chinese",
-        "do_norm": False,
+    "Chinese": {
+        "bge-small-zh-v1.5": {
+            "model_id": "BAAI/bge-small-zh-v1.5",
+            "do_norm": True,
+        },
+        "bge-large-zh-v1.5": {
+            "model_id": "BAAI/bge-large-zh-v1.5",
+            "do_norm": True,
+        },
     },
 }
+
+SUPPORTED_RERANK_MODELS = {
+    "bge-reranker-large": {
+        "model_id": "BAAI/bge-reranker-large",
+    },
+    "bge-reranker-base": {
+        "model_id": "BAAI/bge-reranker-base",
+    },
+
+}
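
The embedding table is now grouped by language ("English", "Chinese"), with BAAI BGE checkpoints replacing all-mpnet-base-v2 and text2vec-large-chinese, and the new SUPPORTED_RERANK_MODELS table adds BGE rerankers for a retrieve-then-rerank pipeline. As a reading aid, here is a minimal sketch (not part of the commit) of how a notebook might consume these tables. The export_embedding_model helper and the "models" output directory are illustrative assumptions; OVModelForFeatureExtraction.from_pretrained(..., export=True) and save_pretrained are existing optimum-intel APIs.

from pathlib import Path

from optimum.intel.openvino import OVModelForFeatureExtraction
from transformers import AutoTokenizer

from config import SUPPORTED_EMBEDDING_MODELS


def export_embedding_model(language: str, name: str, out_dir: str = "models") -> Path:
    """Hypothetical helper: export the selected BGE checkpoint to OpenVINO IR."""
    model_id = SUPPORTED_EMBEDDING_MODELS[language][name]["model_id"]
    export_path = Path(out_dir) / name
    # export=True converts the PyTorch checkpoint to OpenVINO IR on load.
    ov_model = OVModelForFeatureExtraction.from_pretrained(model_id, export=True)
    ov_model.save_pretrained(export_path)
    # Ship the tokenizer alongside the IR so the wrapper below can load both.
    AutoTokenizer.from_pretrained(model_id).save_pretrained(export_path)
    return export_path


embedding_model_dir = export_embedding_model("English", "bge-small-en-v1.5")

The do_norm flag kept from the old table maps onto the normalize_embeddings option of the rewritten encode method below.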
182 changes: 110 additions & 72 deletions notebooks/llm-chatbot/ov_embedding_model.py
@@ -1,60 +1,53 @@
 from langchain.pydantic_v1 import BaseModel, Extra, Field
 from langchain.schema.embeddings import Embeddings
-from typing import Optional, Union, Dict, Tuple, Any, List
-from sklearn.preprocessing import normalize
+from typing import Union, Dict, Any, List
 from transformers import AutoTokenizer
-from pathlib import Path
-import openvino as ov
+import torch
+from torch import Tensor
+from tqdm.autonotebook import trange
 import numpy as np
+from optimum.intel.openvino import OVModelForFeatureExtraction
+from numpy import ndarray
 
-class OVEmbeddings(BaseModel, Embeddings):
+
+import logging
+logger = logging.getLogger(__name__)
+
+DEFAULT_QUERY_BGE_INSTRUCTION_EN = (
+    "Represent this question for searching relevant passages: "
+)
+DEFAULT_QUERY_BGE_INSTRUCTION_ZH = "为这个句子生成表示以用于检索相关文章:"
+
+
+class OVBgeEmbeddings(BaseModel, Embeddings):
     """
-    LangChain compatible model wrapper for embedding model
+    OpenVINO BGE embedding models.
     """
 
-    model: Any #: :meta private:
-    """LLM Transformers model."""
-    model_kwargs: Optional[dict] = None
-    """OpenVINO model configurations."""
-    tokenizer: Any #: :meta private:
-    """Huggingface tokenizer model."""
-    do_norm: bool
-    """Whether normlizing the output of model"""
-    num_stream: int
-    """Number of stream."""
+    ov_model: Any
+    """OpenVINO model object."""
+    tokenizer: Any
+    """Tokenizer for embedding model."""
+    model_dir: str
+    """Path to store models."""
+    model_kwargs: Dict[str, Any]
+    """Keyword arguments passed to the model."""
+    encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
+    """Keyword arguments to pass when calling the `encode` method of the model."""
+    query_instruction: str = DEFAULT_QUERY_BGE_INSTRUCTION_EN
+    """Instruction to use for embedding query."""
 
-    @classmethod
-    def from_model_id(
-        cls,
-        model_id: str,
-        do_norm: bool,
-        ov_config: Optional[dict],
-        model_kwargs: Optional[dict],
-        **kwargs: Any,
-    ):
-        _model_kwargs = model_kwargs or {}
-        _ov_config = ov_config or {}
-        tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
-        core = ov.Core()
-        model_path = Path(model_id) / "openvino_model.xml"
-        model = core.compile_model(model_path, **_ov_config)
-        num_stream = model.get_property('NUM_STREAMS')
-
-        return cls(
-            model=model,
-            tokenizer=tokenizer,
-            do_norm = do_norm,
-            num_stream=num_stream,
-            **kwargs,
-        )
+    def __init__(self, **kwargs: Any):
+        """Initialize the sentence_transformer."""
+        super().__init__(**kwargs)
 
-    class Config:
-        """Configuration for this pydantic object."""
+        self.ov_model = OVModelForFeatureExtraction.from_pretrained(
+            self.model_dir, **self.model_kwargs)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
 
-        extra = Extra.forbid
+        if "-zh" in self.model_dir:
+            self.query_instruction = DEFAULT_QUERY_BGE_INSTRUCTION_ZH
 
     def _text_length(self, text: Union[List[int], List[List[int]]]):
         """
@@ -74,43 +67,85 @@ def _text_length(self, text: Union[List[int], List[List[int]]]):
             # Sum of length of individual strings
             return sum([len(t) for t in text])
 
-    def encode(self, sentences: Union[str, List[str]]):
+    def encode(
+        self,
+        sentences: Union[str, List[str]],
+        batch_size: int = 4,
+        show_progress_bar: bool = None,
+        convert_to_numpy: bool = True,
+        convert_to_tensor: bool = False,
+        normalize_embeddings: bool = False,
+    ) -> Union[List[Tensor], ndarray, Tensor]:
         """
-        Computes sentence embeddings
-
-        Args:
-            sentences: the sentences to embed
-
-        Returns:
-            By default, a list of tensors is returned.
-        """
+        Computes sentence embeddings.
+        :param sentences: the sentences to embed.
+        :param batch_size: the batch size used for the computation.
+        :param show_progress_bar: Whether to output a progress bar when encoding sentences.
+        :param convert_to_numpy: Whether the output should be a list of numpy vectors. If False, it is a list of PyTorch tensors.
+        :param convert_to_tensor: Whether the output should be one large tensor. Overwrites `convert_to_numpy`.
+        :param normalize_embeddings: Whether to normalize returned vectors to have length 1. In that case,
+            the faster dot-product (util.dot_score) instead of cosine similarity can be used.
+        :return: By default, a 2d numpy array with shape [num_inputs, output_dimension] is returned. If only one string
+            input is provided, then the output is a 1d array with shape [output_dimension]. If `convert_to_tensor`, a
+            torch Tensor is returned instead.
+        """
+
+        if convert_to_tensor:
+            convert_to_numpy = False
+
+        input_was_string = False
+        if isinstance(sentences, str) or not hasattr(
+            sentences, "__len__"
+        ):  # Cast an individual sentence to a list with length 1
+            sentences = [sentences]
+            input_was_string = True
+
         all_embeddings = []
         length_sorted_idx = np.argsort(
             [-self._text_length(sen) for sen in sentences])
         sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
-        nireq = self.num_stream + 1
-        infer_queue = ov.AsyncInferQueue(self.model, nireq)
 
-        def postprocess(request, userdata):
-            embeddings = request.get_output_tensor(0).data
-            embeddings = np.mean(embeddings, axis=1)
-            if self.do_norm:
-                embeddings = normalize(embeddings, 'l2')
-            all_embeddings.extend(embeddings)
-
-        infer_queue.set_callback(postprocess)
-
-        for i, sentence in enumerate(sentences_sorted):
-            inputs = {}
+        for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
+            sentences_batch = sentences_sorted[start_index: start_index + batch_size]
             features = self.tokenizer(
-                sentence, padding=True, truncation=True, return_tensors='np')
-            for key in features:
-                inputs[key] = features[key]
-            infer_queue.start_async(inputs, i)
-        infer_queue.wait_all()
-        all_embeddings = np.asarray(all_embeddings)
+                sentences_batch, padding=True, truncation=True, return_tensors='pt')
+
+            out_features = self.ov_model(**features)
+            embeddings = out_features[0][:, 0]
+            if normalize_embeddings:
+                embeddings = torch.nn.functional.normalize(
+                    embeddings, p=2, dim=1)
+
+            # fixes for #522 and #487 to avoid oom problems on gpu with large datasets
+            if convert_to_numpy:
+                embeddings = embeddings.cpu()
+
+            all_embeddings.extend(embeddings)
+
+        all_embeddings = [all_embeddings[idx]
+                          for idx in np.argsort(length_sorted_idx)]
+
+        if convert_to_tensor:
+            if len(all_embeddings):
+                all_embeddings = torch.stack(all_embeddings)
+            else:
+                all_embeddings = torch.Tensor()
+        elif convert_to_numpy:
+            all_embeddings = np.asarray([emb.numpy()
+                                         for emb in all_embeddings])
+
+        if input_was_string:
+            all_embeddings = all_embeddings[0]
 
         return all_embeddings
 
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
         """Compute doc embeddings using a HuggingFace transformer model.
@@ -120,9 +155,8 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         Returns:
             List of embeddings, one for each text.
         """
-        texts = list(map(lambda x: x.replace("\n", " "), texts))
+        texts = [t.replace("\n", " ") for t in texts]
         embeddings = self.encode(texts, **self.encode_kwargs)
-
         return embeddings.tolist()
 
     def embed_query(self, text: str) -> List[float]:
@@ -134,4 +168,8 @@ def embed_query(self, text: str) -> List[float]:
         Returns:
             Embeddings for the text.
         """
-        return self.embed_documents([text])[0]
+        text = text.replace("\n", " ")
+        embedding = self.encode(
+            self.query_instruction + text, **self.encode_kwargs
+        )
+        return embedding.tolist()
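
The rewritten wrapper drops the manual ov.Core / AsyncInferQueue plumbing and delegates inference to optimum-intel's OVModelForFeatureExtraction. The pooling changes too: out_features[0][:, 0] takes the CLS-token embedding, the standard pooling for BGE models, where the old code mean-pooled over all tokens. embed_query now prepends the BGE query instruction, switching to the Chinese one when "-zh" appears in the model path. Below is a usage sketch, not part of the commit: the model path continues the export example above, and the "device" value is an assumption (model_kwargs is forwarded verbatim to OVModelForFeatureExtraction.from_pretrained).

from ov_embedding_model import OVBgeEmbeddings

embedding = OVBgeEmbeddings(
    model_dir="models/bge-small-en-v1.5",
    model_kwargs={"device": "CPU"},
    encode_kwargs={
        "batch_size": 4,
        "normalize_embeddings": True,  # unit-length vectors: cosine == dot product
    },
)

doc_vectors = embedding.embed_documents(
    ["OpenVINO optimizes deep learning inference.", "BGE is a family of embedding models."]
)
query_vector = embedding.embed_query("What does OpenVINO do?")
print(len(doc_vectors), len(query_vector))  # 2 documents; 384 dims for bge-small

Because encode sorts the inputs by length before batching and restores the original order afterwards, padding waste inside each batch stays small even for mixed-length document sets.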