From c8458fef2be25228a804ffc812ed72ac9ed625e7 Mon Sep 17 00:00:00 2001
From: alpayariyak
Date: Mon, 10 Jun 2024 20:51:47 +0000
Subject: [PATCH] preparation for 1.0.0 release

---
 Dockerfile                        |  2 +-
 README.md                         | 15 +++++++--------
 docker-bake.hcl                   |  2 +-
 vllm-base-image/vllm-metadata.yml |  3 +--
 4 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 020f14e..f48b832 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 ARG WORKER_CUDA_VERSION=11.8.0
-ARG BASE_IMAGE_VERSION=1.0.0preview
+ARG BASE_IMAGE_VERSION=1.0.0
 FROM runpod/worker-vllm:base-${BASE_IMAGE_VERSION}-cuda${WORKER_CUDA_VERSION} AS vllm-base

 RUN apt-get update -y \
diff --git a/README.md b/README.md
index 4939c58..a001e7f 100644
--- a/README.md
+++ b/README.md
@@ -2,11 +2,11 @@
 # OpenAI-Compatible vLLM Serverless Endpoint Worker

 Deploy OpenAI-Compatible Blazing-Fast LLM Endpoints powered by the [vLLM](https://github.com/vllm-project/vllm) Inference Engine on RunPod Serverless with just a few clicks.

-
-
+
+

@@ -18,16 +18,15 @@ Deploy OpenAI-Compatible Blazing-Fast LLM Endpoints powered by the [vLLM](https:
 ### 1. UI for Deploying vLLM Worker on RunPod console:
 ![Demo of Deploying vLLM Worker on RunPod console with new UI](media/ui_demo.gif)

-### 2. Worker vLLM `1.0.0preview` with vLLM `0.4.2` now available under `stable` tags
-Update 1.0.0preview is now available, use the image tag `runpod/worker-vllm:dev-cuda12.1.0` or `runpod/worker-vllm:dev-cuda11.8.0`.
+### 2. Worker vLLM `1.0.0` with vLLM `0.4.2` now available under `stable` tags
+Update 1.0.0 is now available, use the image tag `runpod/worker-vllm:stable-cuda12.1.0` or `runpod/worker-vllm:stable-cuda11.8.0`.

-**Main Changes:**
-- vLLM was updated from version `0.3.3` to `0.4.2`, adding compatibility for Llama 3 and other models, as well as increasing performance.
+### 3. OpenAI-Compatible [Embedding Worker](https://github.com/runpod-workers/worker-infinity-embedding) Released
+Deploy your own OpenAI-compatible Serverless Endpoint on RunPod with multiple embedding models and fast inference for RAG and more!

-We will soon be adding more features from the updates, such as multi-LoRA, multi-modality, and more.

-### 3. Caching Accross RunPod Machines
+### 4. Caching Across RunPod Machines
 Worker vLLM is now cached on all RunPod machines, resulting in near-instant deployment! Previously, downloading and extracting the image took 3-5 minutes on average.

diff --git a/docker-bake.hcl b/docker-bake.hcl
index fedfe81..1519a5f 100644
--- a/docker-bake.hcl
+++ b/docker-bake.hcl
@@ -7,7 +7,7 @@ variable "REPOSITORY" {
 }

 variable "BASE_IMAGE_VERSION" {
-  default = "1.0.0preview"
+  default = "1.0.0"
 }

 group "all" {
diff --git a/vllm-base-image/vllm-metadata.yml b/vllm-base-image/vllm-metadata.yml
index f1f8a0e..511290a 100644
--- a/vllm-base-image/vllm-metadata.yml
+++ b/vllm-base-image/vllm-metadata.yml
@@ -1,3 +1,2 @@
-version: '0.3.3'
+version: '0.4.2'
 dev_version: '0.4.2'
-worker_dev_version: '1.0.0preview'
\ No newline at end of file
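
The README hunk above points deployments at the new `stable` image tags. As a minimal sketch of how a downstream image might build on the released tag, assuming the worker still selects its model through the `MODEL_NAME` environment variable (the model id below is illustrative only, not taken from this patch):

    # Sketch: extend the 1.0.0 stable image (CUDA 12.1 variant) named in the README hunk.
    FROM runpod/worker-vllm:stable-cuda12.1.0
    # Assumption: MODEL_NAME is the worker's model selector; swap in whatever model your endpoint serves.
    ENV MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.2"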