intel-analytics · SANKHA1 · Jul 24, 2024 · Jul 24, 2024 · Jul 24, 2024 · Aug 8, 2024
diff --git a/docker/llm/vllm_sycl/docker/Dockerfile b/docker/llm/vllm_sycl/docker/Dockerfile
@@ -0,0 +1,129 @@
+FROM intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04
+# Optional proxy settings, supplied with `docker build --build-arg` when building behind a proxy
+ARG http_proxy
+ARG https_proxy
+
+# Disable pip's cache for the build; pip enables --no-cache-dir when this variable holds a falsy value
+ARG PIP_NO_CACHE_DIR=false
+# Patches applied by the serving layer: FastChat's gradio_web_server.py and the torch-ccl source tree
+COPY ./gradio_web_server.patch ./oneccl-binding.patch /tmp/
+# Layer 1: add Intel apt repositories, install Python 3.11 with a bootstrapped pip, install ipex-llm[xpu], and copy benchmark/example trees from an ipex-llm clone
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends curl wget git libunwind8-dev vim less && \
+    # Keep package configuration non-interactive: preseed the timezone (Etc/UTC unless TZ is provided; this file does not set it) and export DEBIAN_FRONTEND for the rest of this RUN
+    ln -snf "/usr/share/zoneinfo/${TZ:-Etc/UTC}" /etc/localtime && echo "${TZ:-Etc/UTC}" > /etc/timezone && \
+    export DEBIAN_FRONTEND=noninteractive && apt-get update && \
+    # add-apt-repository requires gnupg, gpg-agent, software-properties-common
+    apt-get install -y --no-install-recommends gnupg gpg-agent software-properties-common && \
+    # Add Python 3.11 PPA repository
+    add-apt-repository ppa:deadsnakes/ppa -y && \
+    apt-get install -y --no-install-recommends python3.11 git curl wget && \
+    rm /usr/bin/python3 && \
+    ln -s /usr/bin/python3.11 /usr/bin/python3 && \
+    ln -s /usr/bin/python3 /usr/bin/python && \
+    apt-get install -y --no-install-recommends python3-pip python3.11-dev python3-wheel python3.11-distutils && \
+    wget https://bootstrap.pypa.io/get-pip.py -O get-pip.py && \
+    # Bootstrap pip with get-pip.py: installing FastChat from source requires PEP 660 support
+    python3 get-pip.py && \
+    rm get-pip.py && \
+    pip install --upgrade requests argparse urllib3 && \
+    pip install --pre --upgrade "ipex-llm[xpu]" --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ && \
+    # Fix Trivy CVE Issues
+    pip install transformers==4.36.2 && \
+    pip install transformers_stream_generator einops tiktoken && \
+    # # Install opencl-related repos
+    # apt-get update && \
+    # apt-get install -y --no-install-recommends intel-opencl-icd=23.35.27191.42-775~22.04 intel-level-zero-gpu=1.3.27191.42-775~22.04 level-zero=1.14.0-744~22.04 && \
+    # Install related library of chat.py
+    pip install --upgrade colorama && \
+    # Download all-in-one benchmark and examples
+    git clone https://github.com/intel-analytics/ipex-llm && \
+    cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \
+    cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \
+    # Install vllm dependencies
+    pip install --upgrade fastapi && \
+    pip install --upgrade "uvicorn[standard]" && \
+    # Copy the vLLM-Serving example out of the clone
+    cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving
+
+
+# Install Serving Dependencies
+# Installing only ipex-llm[serving] would update the ipex_llm source code without updating
+# bigdl-core-xe, which leads to problems, so ipex-llm[xpu,serving] is installed instead.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev && \
+    mkdir -p /llm/neo && \
+    cd /llm/neo && \
+    wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15136.4/intel-igc-core_1.0.15136.4_amd64.deb && \
+    wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15136.4/intel-igc-opencl_1.0.15136.4_amd64.deb && \
+    wget https://github.com/intel/compute-runtime/releases/download/23.35.27191.9/intel-level-zero-gpu-dbgsym_1.3.27191.9_amd64.ddeb && \
+    wget https://github.com/intel/compute-runtime/releases/download/23.35.27191.9/intel-level-zero-gpu_1.3.27191.9_amd64.deb && \
+    wget https://github.com/intel/compute-runtime/releases/download/23.35.27191.9/intel-opencl-icd-dbgsym_23.35.27191.9_amd64.ddeb && \
+    wget https://github.com/intel/compute-runtime/releases/download/23.35.27191.9/intel-opencl-icd_23.35.27191.9_amd64.deb && \
+    wget https://github.com/intel/compute-runtime/releases/download/23.35.27191.9/libigdgmm12_22.3.11.ci17747749_amd64.deb && \
+    dpkg -i *.deb && \
+    pip install --pre --upgrade "ipex-llm[xpu,serving]" && \
+    pip install transformers==4.37.0 gradio==4.19.2 && \
+    # Use ipex-vllm-mainline: overlay convert.py, low_bit_linear.py and the vllm package from the vllm_202411_0807 branch onto the installed ipex_llm
+    git clone -b vllm_202411_0807 https://github.com/xiangyuT/ipex-llm.git /llm/ipex-llm && \
+    cp /llm/ipex-llm/python/llm/src/ipex_llm/transformers/convert.py /usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/convert.py && \
+    cp /llm/ipex-llm/python/llm/src/ipex_llm/transformers/low_bit_linear.py /usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/low_bit_linear.py && \
+    rm -rf /usr/local/lib/python3.11/dist-packages/ipex_llm/vllm && \
+    cp -r /llm/ipex-llm/python/llm/src/ipex_llm/vllm /usr/local/lib/python3.11/dist-packages/ipex_llm/ && \
+    # install ipex 2.1.30
+    python -m pip install torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30.post0 oneccl_bind_pt==2.1.300+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ && \
+    python -m pip install setuptools==69.5.1 numpy==1.26.4 && \
+    # Install vLLM-v2 and its dependencies from the xiangyu_test_202411_0806 branch, built for the xpu target
+    git clone -b xiangyu_test_202411_0806 https://github.com/analytics-zoo/vllm.git /llm/vllm && \
+    pip install -r /llm/vllm/requirements-common.txt && \
+    pip install -r /llm/vllm/requirements-xpu.txt && \
+    pip install --no-deps xformers && \
+    cd /llm/vllm && \
+    VLLM_TARGET_DEVICE=xpu python setup.py install && \
+    pip install outlines==0.0.34 --no-deps && \
+    pip install interegular cloudpickle diskcache joblib lark nest-asyncio numba scipy && \
+    # For Qwen series models support
+    pip install transformers_stream_generator einops tiktoken && \
+    # For pipeline serving support
+    pip install mpi4py fastapi uvicorn openai && \
+    # for gradio web UI
+    pip install gradio && \
+    # Install internal oneccl: build a patched oneccl_bind_pt wheel from torch-ccl, run the oneccl_wks installer, then swap in the wheel and delete the build leftovers
+    cd /tmp/ && \
+    pip install --upgrade setuptools wheel twine && \
+    pip install "setuptools<70.0.0" && \
+    git clone https://github.com/intel/torch-ccl -b v2.1.300+xpu && \
+    cd torch-ccl && \
+    patch -p1 < /tmp/oneccl-binding.patch && \
+    git submodule sync && \
+    git submodule update --init --recursive && \
+    USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py install sdist bdist_wheel && \
+    mv /tmp/torch-ccl/dist/oneccl_bind_pt-2.1.300+xpu-cp311-cp311-linux_x86_64.whl /tmp/ && \
+    cd /tmp/ && \
+    wget https://sourceforge.net/projects/oneccl-wks/files/oneccl_wks_installer_2024.0.0.2.sh && \
+    bash oneccl_wks_installer_2024.0.0.2.sh && \
+    pip uninstall -y oneccl_bind_pt && \
+    pip install /tmp/oneccl_bind_pt-2.1.300+xpu-cp311-cp311-linux_x86_64.whl && \
+    rm -rf /tmp/oneccl_bind_pt-2.1.300+xpu-cp311-cp311-linux_x86_64.whl /tmp/torch-ccl /tmp/oneccl_wks_installer_2024.0.0.2.sh && \
+    patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch && \
+    pip install -r /llm/vllm/requirements-common.txt && \
+    pip install ray
+
+# Ship the helper scripts and the wrk payload in /llm/, which is also the image's working directory
+COPY ./vllm_online_benchmark.py \
+     ./vllm_offline_inference.py \
+     ./payload-1024.lua \
+     ./start-vllm-service.sh \
+     ./benchmark_vllm_throughput.py \
+     ./start-fastchat-service.sh \
+     ./start-pp_serving-service.sh \
+     ./start-lightweight_serving-service.sh \
+     /llm/
+WORKDIR /llm/
diff --git a/docker/llm/vllm_sycl/docker/README.md b/docker/llm/vllm_sycl/docker/README.md
@@ -0,0 +1,207 @@
+## Build/Use IPEX-LLM-serving xpu image
+
+### Build Image
+```bash
+docker build \
+  --build-arg http_proxy=.. \
+  --build-arg https_proxy=.. \
+  --build-arg no_proxy=.. \
+  --rm --no-cache -t intelanalytics/ipex-llm-serving-xpu:2024.1.1 .
+```
+
+
+### Use the image for doing xpu serving
+
+
+To map the `xpu` into the container, you need to specify `--device=/dev/dri` when booting the container.
+
+An example could be:
+```bash
+#!/bin/bash
+export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:2.1.0-SNAPSHOT
+
+sudo docker run -itd \
+        --net=host \
+        --device=/dev/dri \
+        --name=CONTAINER_NAME \
+        --shm-size="16g" \
+        $DOCKER_IMAGE
+```
+
+
+After the container is booted, you could get into the container through `docker exec`.
+
+To verify that the device is successfully mapped into the container, run `sycl-ls` and check the result. On a machine with an Arc A770, sample output looks like this:
+
+```bash
+root@arda-arc12:/# sycl-ls
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device 1.2 [2023.16.7.0.21_160000]
+[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i9-13900K 3.0 [2023.16.7.0.21_160000]
+[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics 3.0 [23.17.26241.33]
+[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26241]
+```
+After the container is booted, you could get into the container through `docker exec`.
+
+Currently, we provide four different serving engines in the image: the Lightweight serving engine, the Pipeline parallel serving engine, the FastChat serving engine, and the vLLM serving engine.
+
+
+#### Lightweight serving engine
+
+To run Lightweight serving on one Intel GPU using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Lightweight-Serving).
+
+For convenience, we have included a file `/llm/start-lightweight_serving-service.sh` in the image.
+
+
+#### Pipeline parallel serving engine
+
+To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-FastAPI).
+
+For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image.
+
+
+#### FastChat serving engine
+
+To run model-serving using `IPEX-LLM` as backend using FastChat, you can refer to this [quickstart](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/fastchat_quickstart.html#).
+
+For convenience, we have included a file `/llm/start-fastchat-service.sh` in the image.
+
+You can modify this script to use FastChat with either `ipex_llm_worker` or `vllm_worker`.
+
+#### vLLM serving engine
+
+To run vLLM engine using `IPEX-LLM` as backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/example/GPU/vLLM-Serving/README.md).
+
+We have included multiple example files in `/llm/`:
+1. `vllm_offline_inference.py`: Used for vLLM offline inference example
+2. `benchmark_vllm_throughput.py`: Used for benchmarking throughput
+3. `payload-1024.lua`: Used for testing requests per second with 1k-128 requests
+4. `start-vllm-service.sh`: A template for starting the vLLM service
+
+##### Online benchmark through api_server
+
+We can benchmark the api_server to get an estimation about TPS (transactions per second).  To do so, you need to start the service first according to the instructions in this [section](https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/example/GPU/vLLM-Serving/README.md#service).
+
+###### Online benchmark through benchmark_util
+
+After starting the vLLM service, send requests with `vllm_online_benchmark.py`:
+```bash
+python vllm_online_benchmark.py $model_name $max_seqs
+```
+
+And it will output like this:
+```bash
+model_name: Qwen1.5-14B-Chat
+max_seq: 12
+Warm Up: 100%|█████████████████████████████████████████████████████| 24/24 [01:36<00:00,  4.03s/req]
+Benchmarking: 100%|████████████████████████████████████████████████| 60/60 [04:03<00:00,  4.05s/req]
+Total time for 60 requests with 12 concurrent requests: xxx seconds.
+Average responce time: xxx
+Token throughput: xxx
+
+Average first token latency: xxx milliseconds.
+P90 first token latency: xxx milliseconds.
+P95 first token latency: xxx milliseconds.
+
+Average next token latency: xxx milliseconds.
+P90 next token latency: xxx milliseconds.
+P95 next token latency: xxx milliseconds.
+```
+
+###### Online benchmark through wrk
+In container, do the following:
+1. Modify `/llm/payload-1024.lua` so that the "model" attribute is correct.  By default, we use a prompt that is roughly 1024 tokens long; you can change it if needed.
+2. Start the benchmark using `wrk` using the script below:
+
+```bash
+cd /llm
+# You can change -t and -c to control the concurrency.
+# By default, we use 12 connections to benchmark the service.
+wrk -t12 -c12 -d15m -s payload-1024.lua http://localhost:8000/v1/completions --timeout 1h
+```
+
+#### Offline benchmark through benchmark_vllm_throughput.py
+
+We have included the benchmark_throughput script provided by `vllm` in our image as `/llm/benchmark_vllm_throughput.py`.  To use the benchmark_throughput script, you will need to download the test dataset through:
+
+```bash
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+```
+
+The full example looks like this:
+```bash
+cd /llm/
+
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+export MODEL="YOUR_MODEL"
+
+# You can change load-in-low-bit from values in [sym_int4, fp8, fp16]
+
+python3 /llm/benchmark_vllm_throughput.py \
+    --backend vllm \
+    --dataset /llm/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --model $MODEL \
+    --num-prompts 1000 \
+    --seed 42 \
+    --trust-remote-code \
+    --enforce-eager \
+    --dtype float16 \
+    --device xpu \
+    --load-in-low-bit sym_int4 \
+    --gpu-memory-utilization 0.85
+```
+
+> Note: you can adjust --load-in-low-bit to use other formats of low-bit quantization.
+
+
+You can also vary the `--gpu-memory-utilization` rate to find the best-performing setting using the following script:
+
+```bash
+#!/bin/bash
+
+# Define the log directory
+LOG_DIR="YOUR_LOG_DIR"
+# Check if the log directory exists, if not, create it
+if [ ! -d "$LOG_DIR" ]; then
+    mkdir -p "$LOG_DIR"
+fi
+
+# Define an array of model paths
+MODELS=(
+    "YOUR TESTED MODELS"
+)
+
+# Define an array of utilization rates
+UTIL_RATES=(0.85 0.90 0.95)
+
+# Loop over each model
+for MODEL in "${MODELS[@]}"; do
+    # Loop over each utilization rate
+    for RATE in "${UTIL_RATES[@]}"; do
+        # Extract a simple model name from the path for easier identification
+        MODEL_NAME=$(basename "$MODEL")
+
+        # Define the log file name based on the model and rate
+        LOG_FILE="$LOG_DIR/${MODEL_NAME}_utilization_${RATE}.log"
+
+        # Execute the command and redirect output to the log file
+        # Sometimes you might need to set --max-model-len if memory is not enough
+        # load-in-low-bit accepts inputs [sym_int4, fp8, fp16]
+        python3 /llm/benchmark_vllm_throughput.py \
+            --backend vllm \
+            --dataset /llm/ShareGPT_V3_unfiltered_cleaned_split.json \
+            --model $MODEL \
+            --num-prompts 1000 \
+            --seed 42 \
+            --trust-remote-code \
+            --enforce-eager \
+            --dtype float16 \
+            --load-in-low-bit sym_int4 \
+            --device xpu \
+            --gpu-memory-utilization $RATE &> "$LOG_FILE"
+    done
+done
+
+# Inform the user that the script has completed its execution
+echo "All benchmarks have been executed and logged."
+```