Enable the IPEX optimization for python backend
Signed-off-by: yuanwu <yuan.wu@intel.com>
yuanwu2017 committed Apr 22, 2024
1 parent d221b99 commit 4d285bd
Showing 8 changed files with 159 additions and 13 deletions.
95 changes: 95 additions & 0 deletions Dockerfile-intel
@@ -0,0 +1,95 @@
FROM lukemathwalker/cargo-chef:latest-rust-1.75-bookworm AS chef
WORKDIR /usr/src

ENV SCCACHE=0.5.4
ENV RUSTC_WRAPPER=/usr/local/bin/sccache

# Download and configure sccache
RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
    chmod +x /usr/local/bin/sccache

FROM chef AS planner

COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./

RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

ARG GIT_SHA
ARG DOCKER_LABEL

# sccache specific variables
ARG ACTIONS_CACHE_URL
ARG ACTIONS_RUNTIME_TOKEN
ARG SCCACHE_GHA_ENABLED

COPY --from=planner /usr/src/recipe.json recipe.json

RUN cargo chef cook --release --features python --no-default-features --recipe-path recipe.json && sccache -s

COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

FROM builder as http-builder

RUN cargo build --release --bin text-embeddings-router -F python -F http --no-default-features && sccache -s

FROM builder as grpc-builder

COPY proto proto

RUN cargo build --release --bin text-embeddings-router -F grpc -F python --no-default-features && sccache -s

FROM ubuntu:22.04 as base

ENV HUGGINGFACE_HUB_CACHE=/data \
    PORT=80

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    ccache \
    curl \
    python3 \
    python3-pip \
    git && \
    rm -rf /var/lib/apt/lists/*
RUN ln -s /usr/bin/python3.10 /usr/bin/python

WORKDIR /usr/src
COPY backends backends
COPY backends/python/server/text_embeddings_server/models/__init__.py backends/python/server/text_embeddings_server/models/__init__.py
COPY backends/python/server/pyproject.toml backends/python/server/pyproject.toml
COPY backends/python/server/requirements.txt backends/python/server/requirements.txt
RUN cd backends/python/server && \
    make install
RUN python -m pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cpu
RUN python -m pip install intel-extension-for-pytorch

FROM base as grpc

COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router

ENTRYPOINT ["text-embeddings-router"]
CMD ["--json-output"]

FROM base

COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router

ENTRYPOINT ["text-embeddings-router"]
CMD ["--json-output"]
3 changes: 2 additions & 1 deletion backends/python/server/requirements.txt
@@ -11,7 +11,7 @@ grpc-interceptor==0.15.3 ; python_version >= "3.9" and python_version < "3.13"
grpcio-reflection==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
grpcio-status==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
grpcio==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.19.3 ; python_version >= "3.9" and python_version < "3.13"
idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
jinja2==3.1.2 ; python_version >= "3.9" and python_version < "3.13"
loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -41,3 +41,4 @@ typing-extensions==4.7.1 ; python_version >= "3.9" and python_version < "3.13"
urllib3==2.0.4 ; python_version >= "3.9" and python_version < "3.13"
win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
wrapt==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.37.0 ; python_version >= "3.9" and python_version < "3.13"
1 change: 0 additions & 1 deletion backends/python/server/text_embeddings_server/cli.py
@@ -46,7 +46,6 @@ def serve(

    # Downgrade enum into str for easier management later on
    dtype = None if dtype is None else dtype.value
-
    server.serve(model_path, dtype, uds_path)


backends/python/server/text_embeddings_server/models/__init__.py
@@ -8,6 +8,7 @@

from text_embeddings_server.models.model import Model
from text_embeddings_server.models.default_model import DefaultModel
+from text_embeddings_server.utils.device import get_device

__all__ = ["Model"]

@@ -35,13 +36,7 @@ def get_model(model_path: Path, dtype: Optional[str]):
    else:
        raise RuntimeError(f"Unknown dtype {dtype}")

-    if torch.cuda.is_available():
-        device = torch.device("cuda")
-    else:
-        if dtype != torch.float32:
-            raise ValueError("CPU device only supports float32 dtype")
-        device = torch.device("cpu")
-
+    device = get_device()
    config = AutoConfig.from_pretrained(model_path)

    if config.model_type == "bert":
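
With device selection delegated to get_device(), a CPU-only host no longer trips the old "CPU device only supports float32 dtype" check, so bfloat16 models become loadable through IPEX. A minimal sketch, assuming the collapsed portion of this hunk maps the string "bfloat16" to torch.bfloat16:

from pathlib import Path
from text_embeddings_server.models import get_model

# Hypothetical local model directory; before this commit, passing a
# non-float32 dtype on a CPU host raised ValueError inside get_model.
model = get_model(Path("/data/my-model"), dtype="bfloat16")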
backends/python/server/text_embeddings_server/models/default_model.py
@@ -5,16 +5,22 @@
from typing import Type, List
from transformers import AutoModel
from opentelemetry import trace
+from loguru import logger

from text_embeddings_server.models import Model
from text_embeddings_server.models.types import PaddedBatch, Embedding

+from text_embeddings_server.utils.device import is_ipex_available, use_ipex
tracer = trace.get_tracer(__name__)


class DefaultModel(Model):
    def __init__(self, model_path: Path, device: torch.device, dtype: torch.dtype):
        model = AutoModel.from_pretrained(model_path).to(dtype).to(device)
        model.eval()
+        if use_ipex() and device.type != "cuda":
+            import intel_extension_for_pytorch as ipex
+            model = ipex.optimize(model, dtype=dtype)
+
        self.hidden_size = model.config.hidden_size

        self.has_position_ids = (
@@ -39,10 +45,9 @@ def embed(self, batch: PaddedBatch) -> List[Embedding]:
kwargs["token_type_ids"] = batch.token_type_ids
if self.has_position_ids:
kwargs["position_ids"] = batch.position_ids

output = self.model(**kwargs)
embedding = output[0][:, 0]
cpu_results = embedding.view(-1).tolist()
cpu_results = embedding.reshape(-1).tolist()

return [
Embedding(
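
The core of this change is the ipex.optimize call, which re-packs weights and fuses ops for Intel hardware before inference. A standalone sketch of the same pattern (assumes a matching intel-extension-for-pytorch install; the model name is just an example):

import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased").to(torch.bfloat16)
model.eval()
# ipex.optimize expects an eval-mode model; dtype=torch.bfloat16 enables
# the bf16 inference path this commit targets on CPU/XPU.
model = ipex.optimize(model, dtype=torch.bfloat16)
with torch.inference_mode():
    # [CLS], [SEP] token ids, just to exercise the forward pass
    hidden = model(input_ids=torch.tensor([[101, 102]])).last_hidden_state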
47 changes: 47 additions & 0 deletions backends/python/server/text_embeddings_server/utils/device.py
@@ -0,0 +1,47 @@
import os
from loguru import logger
import importlib
import importlib.metadata
import importlib.util
from packaging import version
import torch

def is_ipex_available():
    def get_major_and_minor_from_version(full_version):
        return str(version.parse(full_version).major) + "." + str(version.parse(full_version).minor)

    _torch_version = importlib.metadata.version("torch")
    if importlib.util.find_spec("intel_extension_for_pytorch") is None:
        return False
    _ipex_version = "N/A"
    try:
        _ipex_version = importlib.metadata.version("intel_extension_for_pytorch")
    except importlib.metadata.PackageNotFoundError:
        return False
    torch_major_and_minor = get_major_and_minor_from_version(_torch_version)
    ipex_major_and_minor = get_major_and_minor_from_version(_ipex_version)
    if torch_major_and_minor != ipex_major_and_minor:
        logger.warning(
            f"Intel Extension for PyTorch {ipex_major_and_minor} needs to work with PyTorch {ipex_major_and_minor}.*,"
            f" but PyTorch {_torch_version} is found. Please switch to the matching version and run again."
        )
        return False
    return True

def use_ipex():
    value = os.environ.get("USE_IPEX", "True")
    if value in ["True", "true", "1"] and is_ipex_available():
        return True
    else:
        return False

def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif is_ipex_available():
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device("cpu")
    return device

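Two behaviors of these helpers are worth noting: use_ipex() defaults to on (USE_IPEX is treated as "True" unless set otherwise), and get_device() prefers cuda, then xpu, then cpu. A small illustrative snippet, assuming this commit's package layout:

import os
from text_embeddings_server.utils.device import use_ipex, get_device

# Opt out of ipex.optimize even when IPEX is installed.
os.environ["USE_IPEX"] = "0"
print(use_ipex())    # False: "0" is not in ["True", "true", "1"]
# get_device() does not consult USE_IPEX; it still picks cuda, then xpu, then cpu.
print(get_device())
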
4 changes: 4 additions & 0 deletions backends/src/dtype.rs
@@ -15,6 +15,8 @@ pub enum DType {
    // Float32 is not available on candle cuda
    #[cfg(any(feature = "python", feature = "candle"))]
    Float32,
+    #[cfg(feature = "python")]
+    Bfloat16,
    // #[cfg(feature = "candle")]
    // Q6K,
}
@@ -33,6 +35,8 @@ impl fmt::Display for DType {
            DType::Float32 => write!(f, "float32"),
            // #[cfg(feature = "candle")]
            // DType::Q6K => write!(f, "q6k"),
+            #[cfg(feature = "python")]
+            DType::Bfloat16 => write!(f, "bfloat16"),
        }
    }
}
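
On the Rust side this only adds the bfloat16 variant and its display string; the actual torch dtype resolution happens in the Python backend. A hedged sketch of what that mapping plausibly looks like (parse_dtype is hypothetical; the real logic lives in the collapsed portion of models/__init__.py above):

from typing import Optional
import torch

def parse_dtype(dtype: Optional[str]) -> torch.dtype:
    # "bfloat16" is the new value the router can now forward to the backend.
    if dtype is None or dtype == "float32":
        return torch.float32
    if dtype == "bfloat16":
        return torch.bfloat16
    raise RuntimeError(f"Unknown dtype {dtype}")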
2 changes: 1 addition & 1 deletion sagemaker-entrypoint.sh
@@ -10,4 +10,4 @@ if [[ -n "${HF_MODEL_REVISION}" ]]; then
    export REVISION="${HF_MODEL_REVISION}"
fi

-text-embeddings-router --port 8080 --json-output
+text-embeddings-router --port 8080 --json-output
