Skip to content

Commit

Permalink
Merge pull request #213 from ROCm/upstream_merge_24_09_27_0.6.2
Browse files (browse the repository at this point in the history)
Upstream merge 24 09 27 0.6.2
  • Loading branch information
gshtras authored Oct 2, 2024
2 parents 2d7ab9e + f49394a commit 030374b
Show file tree
Hide file tree
Showing 178 changed files with 8,261 additions and 3,000 deletions.
24 changes: 21 additions & 3 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ steps:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test # 10min
mirror_hardwares: [amd]
fast_check: true
Expand All @@ -90,8 +90,11 @@ steps:
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
Expand Down Expand Up @@ -207,6 +210,21 @@ steps:
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4

- label: "PyTorch Fullgraph Smoke Test"
fast_check: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph_smoke.py

- label: "PyTorch Fullgraph Test"
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py

- label: Kernels Test %N # 30min each
mirror_hardwares: [amd]
source_file_dependencies:
Expand Down Expand Up @@ -352,7 +370,7 @@ steps:
- tests/distributed/
- vllm/compilation
commands:
- pytest -v -s ./compile/test_full_graph.py
- pytest -v -s ./compile/test_full_graph_multi_gpu.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
Expand Down
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# vllm commit id, generated by setup.py
vllm/commit_id.py
# version file generated by setuptools-scm
/vllm/_version.py

# vllm-flash-attn built from source
vllm/vllm_flash_attn/
Expand Down
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
Expand Down Expand Up @@ -333,6 +334,11 @@ set(VLLM_MOE_EXT_SRC

if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_moe_ops.cu")
endif()

Expand Down
7 changes: 3 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,13 @@ ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}

ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
Expand All @@ -107,6 +105,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
Expand Down Expand Up @@ -203,7 +202,7 @@ FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0'
pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10

ENV VLLM_USAGE_SOURCE production-docker-image

Expand Down
4 changes: 3 additions & 1 deletion Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,10 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl
pip install dist/*.whl && \
rm -rf dist

WORKDIR /workspace/

Expand Down
23 changes: 13 additions & 10 deletions Dockerfile.neuron
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@ FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update \
&& apt-get install python3 python3-pip -y \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1
RUN apt-get update && \
apt-get install -y \
git \
python3 \
python3-pip \
ffmpeg libsm6 libxext6 libgl1

### Mount Point ###
# When launching the container, mount the code directory to /app
Expand All @@ -22,17 +25,17 @@ RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
COPY . /app/vllm

RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt
&& python3 -m pip install -U \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-neuron.txt

ENV VLLM_TARGET_DEVICE neuron
RUN cd /app/vllm \
&& pip install -e . \
RUN --mount=type=bind,source=.git,target=.git \
cd /app/vllm \
&& pip install --no-build-isolation -v -e . \
&& cd ..

CMD ["/bin/bash"]
5 changes: 3 additions & 2 deletions Dockerfile.openvino
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git && \
apt-get install -y ffmpeg libsm6 libxext6 libgl1
apt-get install -y \
git python3-pip \
ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace

# copy requirements
Expand Down
12 changes: 9 additions & 3 deletions Dockerfile.ppc64le
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,15 @@ COPY ./ /workspace/vllm
WORKDIR /workspace/vllm

# These packages will be in rocketce eventually
RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing

RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
torch==2.3.1 \
-r requirements-cpu.txt \
xformers uvloop==0.20.0

RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install

WORKDIR /workspace/

Expand Down
17 changes: 13 additions & 4 deletions Dockerfile.tpu
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,25 @@ FROM $BASE_IMAGE
WORKDIR /workspace

# Install some basic utilities
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1
RUN apt-get update && apt-get install -y \
git \
ffmpeg libsm6 libxext6 libgl1

# Install the TPU and Pallas dependencies.
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html

# Build vLLM.
COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
cd /workspace/vllm && \
python3 -m pip install \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-tpu.txt
RUN cd /workspace/vllm && python3 setup.py develop

CMD ["/bin/bash"]
13 changes: 9 additions & 4 deletions Dockerfile.xpu
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,20 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update -y \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
RUN apt-get update -y && \
apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements-xpu.txt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-xpu.txt

RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=xpu python3 setup.py install

CMD ["/bin/bash"]
8 changes: 4 additions & 4 deletions benchmarks/benchmark_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
from vllm.inputs import PromptType
from vllm.inputs import PromptInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser

Expand Down Expand Up @@ -62,7 +62,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_prompts: List[PromptType] = [{
dummy_inputs: List[PromptInputs] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]

Expand All @@ -75,13 +75,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
llm.generate(dummy_prompts,
llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
else:
start_time = time.perf_counter()
llm.generate(dummy_prompts,
llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
Expand Down
Loading

0 comments on commit 030374b

Please sign in to comment.