Enable the IPEX optimization for python backend
Signed-off-by: yuanwu <yuan.wu@intel.com>
yuanwu2017 committed Apr 22, 2024
1 parent d221b99 commit 4d285bd
Showing 8 changed files with 159 additions and 13 deletions.
95 changes: 95 additions & 0 deletions Dockerfile-intel
@@ -0,0 +1,95 @@
FROM lukemathwalker/cargo-chef:latest-rust-1.75-bookworm AS chef
WORKDIR /usr/src

ENV SCCACHE=0.5.4
ENV RUSTC_WRAPPER=/usr/local/bin/sccache

# Download and configure sccache
RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
    chmod +x /usr/local/bin/sccache

FROM chef AS planner

COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./

RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

ARG GIT_SHA
ARG DOCKER_LABEL

# sccache specific variables
ARG ACTIONS_CACHE_URL
ARG ACTIONS_RUNTIME_TOKEN
ARG SCCACHE_GHA_ENABLED

COPY --from=planner /usr/src/recipe.json recipe.json

RUN cargo chef cook --release --features python --no-default-features --recipe-path recipe.json && sccache -s

COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

FROM builder as http-builder

RUN cargo build --release --bin text-embeddings-router -F python -F http --no-default-features && sccache -s

FROM builder as grpc-builder

COPY proto proto

RUN cargo build --release --bin text-embeddings-router -F grpc -F python --no-default-features && sccache -s

FROM ubuntu:22.04 as base

ENV HUGGINGFACE_HUB_CACHE=/data \
    PORT=80

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    ccache \
    curl \
    python3 \
    python3-pip \
    git && \
    rm -rf /var/lib/apt/lists/*
RUN ln -s /usr/bin/python3.10 /usr/bin/python

WORKDIR /usr/src
COPY backends backends
COPY backends/python/server/text_embeddings_server/models/__init__.py backends/python/server/text_embeddings_server/models/__init__.py
COPY backends/python/server/pyproject.toml backends/python/server/pyproject.toml
COPY backends/python/server/requirements.txt backends/python/server/requirements.txt
RUN cd backends/python/server && \
    make install
RUN python -m pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cpu
RUN python -m pip install intel-extension-for-pytorch

FROM base as grpc

COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router

ENTRYPOINT ["text-embeddings-router"]
CMD ["--json-output"]

FROM base

COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router

ENTRYPOINT ["text-embeddings-router"]
CMD ["--json-output"]
3 changes: 2 additions & 1 deletion backends/python/server/requirements.txt
@@ -11,7 +11,7 @@ grpc-interceptor==0.15.3 ; python_version >= "3.9" and python_version < "3.13"
grpcio-reflection==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
grpcio-status==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
grpcio==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.19.3 ; python_version >= "3.9" and python_version < "3.13"
idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
jinja2==3.1.2 ; python_version >= "3.9" and python_version < "3.13"
loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -41,3 +41,4 @@ typing-extensions==4.7.1 ; python_version >= "3.9" and python_version < "3.13"
urllib3==2.0.4 ; python_version >= "3.9" and python_version < "3.13"
win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
wrapt==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.37.0 ; python_version >= "3.9" and python_version < "3.13"
1 change: 0 additions & 1 deletion backends/python/server/text_embeddings_server/cli.py
@@ -46,7 +46,6 @@ def serve(

    # Downgrade enum into str for easier management later on
    dtype = None if dtype is None else dtype.value
-
    server.serve(model_path, dtype, uds_path)


backends/python/server/text_embeddings_server/models/__init__.py
@@ -8,6 +8,7 @@

from text_embeddings_server.models.model import Model
from text_embeddings_server.models.default_model import DefaultModel
+from text_embeddings_server.utils.device import get_device

__all__ = ["Model"]

@@ -35,13 +36,7 @@ def get_model(model_path: Path, dtype: Optional[str]):
    else:
        raise RuntimeError(f"Unknown dtype {dtype}")

-    if torch.cuda.is_available():
-        device = torch.device("cuda")
-    else:
-        if dtype != torch.float32:
-            raise ValueError("CPU device only supports float32 dtype")
-        device = torch.device("cpu")
-
+    device = get_device()
    config = AutoConfig.from_pretrained(model_path)

    if config.model_type == "bert":
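
With device selection delegated to get_device(), a CPU-only host no longer trips the old "CPU device only supports float32 dtype" check, so bfloat16 models become loadable through IPEX. A minimal sketch, assuming the collapsed portion of this hunk maps the string "bfloat16" to torch.bfloat16:

from pathlib import Path
from text_embeddings_server.models import get_model

# Hypothetical local model directory; before this commit, passing a
# non-float32 dtype on a CPU host raised ValueError inside get_model.
model = get_model(Path("/data/my-model"), dtype="bfloat16")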
backends/python/server/text_embeddings_server/models/default_model.py
@@ -5,16 +5,22 @@
from typing import Type, List
from transformers import AutoModel
from opentelemetry import trace
+from loguru import logger

from text_embeddings_server.models import Model
from text_embeddings_server.models.types import PaddedBatch, Embedding

+from text_embeddings_server.utils.device import is_ipex_available, use_ipex
tracer = trace.get_tracer(__name__)


class DefaultModel(Model):
    def __init__(self, model_path: Path, device: torch.device, dtype: torch.dtype):
        model = AutoModel.from_pretrained(model_path).to(dtype).to(device)
        model.eval()
+        if use_ipex() and device.type != "cuda":
+            import intel_extension_for_pytorch as ipex
+            model = ipex.optimize(model, dtype=dtype)
+
        self.hidden_size = model.config.hidden_size

        self.has_position_ids = (
@@ -39,10 +45,9 @@ def embed(self, batch: PaddedBatch) -> List[Embedding]:
kwargs["token_type_ids"] = batch.token_type_ids
if self.has_position_ids:
kwargs["position_ids"] = batch.position_ids

output = self.model(**kwargs)
embedding = output[0][:, 0]
cpu_results = embedding.view(-1).tolist()
cpu_results = embedding.reshape(-1).tolist()

return [
Embedding(
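
The core of this change is the ipex.optimize call, which re-packs weights and fuses ops for Intel hardware before inference. A standalone sketch of the same pattern (assumes a matching intel-extension-for-pytorch install; the model name is just an example):

import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased").to(torch.bfloat16)
model.eval()
# ipex.optimize expects an eval-mode model; dtype=torch.bfloat16 enables
# the bf16 inference path this commit targets on CPU/XPU.
model = ipex.optimize(model, dtype=torch.bfloat16)
with torch.inference_mode():
    # [CLS], [SEP] token ids, just to exercise the forward pass
    hidden = model(input_ids=torch.tensor([[101, 102]])).last_hidden_state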
47 changes: 47 additions & 0 deletions backends/python/server/text_embeddings_server/utils/device.py
@@ -0,0 +1,47 @@
import os
from loguru import logger
import importlib
import importlib.metadata
import importlib.util
from packaging import version
import torch

def is_ipex_available():
    def get_major_and_minor_from_version(full_version):
        return str(version.parse(full_version).major) + "." + str(version.parse(full_version).minor)

    _torch_version = importlib.metadata.version("torch")
    if importlib.util.find_spec("intel_extension_for_pytorch") is None:
        return False
    _ipex_version = "N/A"
    try:
        _ipex_version = importlib.metadata.version("intel_extension_for_pytorch")
    except importlib.metadata.PackageNotFoundError:
        return False
    torch_major_and_minor = get_major_and_minor_from_version(_torch_version)
    ipex_major_and_minor = get_major_and_minor_from_version(_ipex_version)
    if torch_major_and_minor != ipex_major_and_minor:
        logger.warning(
            f"Intel Extension for PyTorch {ipex_major_and_minor} needs to work with PyTorch {ipex_major_and_minor}.*,"
            f" but PyTorch {_torch_version} is found. Please switch to the matching version and run again."
        )
        return False
    return True

def use_ipex():
    value = os.environ.get("USE_IPEX", "True")
    if value in ["True", "true", "1"] and is_ipex_available():
        return True
    else:
        return False

def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif is_ipex_available():
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device("cpu")
    return device

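Two behaviors of these helpers are worth noting: use_ipex() defaults to on (USE_IPEX is treated as "True" unless set otherwise), and get_device() prefers cuda, then xpu, then cpu. A small illustrative snippet, assuming this commit's package layout:

import os
from text_embeddings_server.utils.device import use_ipex, get_device

# Opt out of ipex.optimize even when IPEX is installed.
os.environ["USE_IPEX"] = "0"
print(use_ipex())    # False: "0" is not in ["True", "true", "1"]
# get_device() does not consult USE_IPEX; it still picks cuda, then xpu, then cpu.
print(get_device())
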
4 changes: 4 additions & 0 deletions backends/src/dtype.rs
@@ -15,6 +15,8 @@ pub enum DType {
    // Float32 is not available on candle cuda
    #[cfg(any(feature = "python", feature = "candle"))]
    Float32,
+    #[cfg(feature = "python")]
+    Bfloat16,
    // #[cfg(feature = "candle")]
    // Q6K,
}
@@ -33,6 +35,8 @@ impl fmt::Display for DType {
            DType::Float32 => write!(f, "float32"),
            // #[cfg(feature = "candle")]
            // DType::Q6K => write!(f, "q6k"),
+            #[cfg(feature = "python")]
+            DType::Bfloat16 => write!(f, "bfloat16"),
        }
    }
}
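
On the Rust side this only adds the bfloat16 variant and its display string; the actual torch dtype resolution happens in the Python backend. A hedged sketch of what that mapping plausibly looks like (parse_dtype is hypothetical; the real logic lives in the collapsed portion of models/__init__.py above):

from typing import Optional
import torch

def parse_dtype(dtype: Optional[str]) -> torch.dtype:
    # "bfloat16" is the new value the router can now forward to the backend.
    if dtype is None or dtype == "float32":
        return torch.float32
    if dtype == "bfloat16":
        return torch.bfloat16
    raise RuntimeError(f"Unknown dtype {dtype}")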
2 changes: 1 addition & 1 deletion sagemaker-entrypoint.sh
@@ -10,4 +10,4 @@ if [[ -n "${HF_MODEL_REVISION}" ]]; then
    export REVISION="${HF_MODEL_REVISION}"
fi

-text-embeddings-router --port 8080 --json-output
+text-embeddings-router --port 8080 --json-output
