abstracting model downloads and file-normalization to models dir
Signed-off-by: greg pereira <grpereir@redhat.com>
Gregory-Pereira committed Apr 12, 2024
1 parent 3734eae commit fc57821
Showing 16 changed files with 183 additions and 173 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/chatbot.yaml
@@ -59,8 +59,8 @@ jobs:
run: make install

- name: Download model
working-directory: ./model_servers/llamacpp_python
run: make mistral
working-directory: ./recipes/natural_language_processing/${{ env.IMAGE_NAME }}
run: make download-model-mistral

- name: Run Functional Tests
shell: bash
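The chatbot workflow now fetches the model through the recipe's own Makefile rather than through `model_servers/llamacpp_python`. A rough local equivalent of the updated step, assuming `IMAGE_NAME` resolves to the `chatbot` recipe directory (the value here is illustrative), would be:

```bash
# Rough local equivalent of the updated "Download model" step.
# IMAGE_NAME is whatever the workflow env block sets; "chatbot" is an example.
export IMAGE_NAME=chatbot
cd recipes/natural_language_processing/${IMAGE_NAME}
make download-model-mistral
```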
7 changes: 3 additions & 4 deletions .github/workflows/model_servers.yaml
@@ -18,6 +18,7 @@ on:

env:
REGISTRY: ghcr.io
REGISTRY_ORG: containers

jobs:
build-and-push-image:
@@ -82,7 +83,7 @@ jobs:

- name: Download model
working-directory: ./model_servers/${{ matrix.directory }}/
run: make ${{ matrix.model }}
run: make download-model-${{ matrix.model }}

- name: Set up Python
uses: actions/setup-python@v5.0.0
@@ -96,9 +97,7 @@
- name: Run non-gpu tests
working-directory: ./model_servers/${{ matrix.directory }}/
if: ${{ matrix.no_gpu }}
run: make test
env:
IMAGE_NAME: ${{ matrix.image_name }}
run: make test REGISTRY=${{ env.REGISTRY }} IMAGE_NAME=${{ env.REGISTRY_ORG }}/${{ matrix.image_name}}:latest

- name: Run cuda test
working-directory: ./model_servers/${{ matrix.directory }}/
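The non-GPU test step now passes the registry and image name directly on the `make test` command line instead of exporting `IMAGE_NAME` through the job environment. A sketch of running the same invocation locally, with placeholder values in place of the matrix/env expressions, might be:

```bash
# Mirrors the workflow's "Run non-gpu tests" step; values are placeholders.
cd model_servers/llamacpp_python
make test REGISTRY=ghcr.io IMAGE_NAME=containers/llamacpp_python:latest
```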
5 changes: 2 additions & 3 deletions .gitignore
@@ -4,8 +4,7 @@
port_check.lock
*build
models/*
model_servers/llamacpp_python/model.gguf
!models/convert_models/*
!models/Containerfile
!models/Makefile
!models/README.md
convert_models/converted_models
recipes/chromedriver
47 changes: 47 additions & 0 deletions model_servers/common/Makefile.common
@@ -0,0 +1,47 @@
REGISTRY ?= quay.io
REGISTRY_ORG ?= ai-lab
COMPONENT ?= model_servers

BIND_MOUNT_OPTIONS := ro
OS := $(shell uname -s)
ifeq ($(OS),Linux)
BIND_MOUNT_OPTIONS := Z,ro
endif

.PHONY: build
build:
podman build --squash-all --build-arg $(PORT) -t $(IMAGE) . -f base/Containerfile

.PHONY: install
install:
pip install -r tests/requirements.txt

.PHONY: test
test:
@if [ ! -f "../../models/$(MODEL_NAME)" ]; then \
echo "Model file -- $(MODEL_NAME) -- not present in the models directory."; \
exit 1; \
else \
if [ ! -f "./$(MODEL_NAME)" ]; then \
ln -s ../../models/$(MODEL_NAME) ./$(MODEL_NAME); \
fi; \
REGISTRY=$(REGISTRY) IMAGE_NAME=$(IMAGE_NAME) MODEL_NAME=$(MODEL_NAME) MODEL_PATH=$(MODEL_PATH) PORT=$(PORT) pytest -vvv -s ; \
fi;

.PHONY: clean
clean:
- rm ./$(MODEL_NAME) &> /dev/null

.PHONY: run
run:
cd ../../models && \
podman run -it -d -p $(PORT):$(PORT) -v ./$(MODEL_NAME):$(MODELS_PATH)/$(MODEL_NAME):$(BIND_MOUNT_OPTIONS) -e MODEL_PATH=$(MODELS_PATH)/$(MODEL_NAME) -e HOST=0.0.0.0 -e PORT=$(PORT) $(IMAGE)

.PHONY: podman-clean
podman-clean:
@container_ids=$$(podman ps --format "{{.ID}} {{.Image}}" | awk '$$2 == "$(IMAGE)" {print $$1}'); \
echo "removing all containers with IMAGE=$(IMAGE)"; \
for id in $$container_ids; do \
echo "Removing container: $$id,"; \
podman rm -f $$id; \
done
79 changes: 11 additions & 68 deletions model_servers/llamacpp_python/Makefile
@@ -1,42 +1,18 @@
APP := llamacpp_python
IMAGE_BASE := llamacpp-python
PORT := 8001
PORT ?= 8001

IMAGE := quay.io/ai-lab/$(IMAGE_BASE):latest
CUDA_IMAGE := quay.io/ai-lab/$(IMAGE_BASE)-cuda:latest
VULKAN_IMAGE := quay.io/ai-lab/$(IMAGE_BASE)-vulkan:latest
include ../common/Makefile.common

# ----- MODEL OPTIONS -----
IMAGE_NAME ?= $(REGISTRY_ORG)/$(COMPONENT)/$(APP):latest
IMAGE := $(REGISTRY)/$(IMAGE_NAME)
CUDA_IMAGE := $(REGISTRY)/$(REGISTRY_ORG)/$(COMPONENT)/$(APP)_cuda:latest
VULKAN_IMAGE := $(REGISTRY)/$(REGISTRY_ORG)/$(COMPONENT)/$(APP)_vulkan:latest

LLAMA_MODEL_NAME := llama-2-7b-chat.Q5_K_S.gguf
LLAMA_MODEL_URL := https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_S.gguf

TINY_LLAMA_MODEL_NAME := tinyllama-1.1b-chat-v1.0.Q4_K_S.gguf
TINY_LLAMA_MODEL_URL := https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/blob/main/tinyllama-1.1b-chat-v1.0.Q4_K_S.gguf

MISTRAL_MODEL_NAME := mistral-7b-instruct-v0.1.Q4_K_M.gguf
MISTRAL_MODEL_URL := https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf

# --- END MODEL OPTIONS ---

SELECTED_MODEL_NAME := $(or $(SELECTED_MODEL),$(MISTRAL_MODEL_NAME))
SELECTED_MODEL_URL := $(or $(SELECTED_MODEL_LINK),$(MISTRAL_MODEL_URL))

RELATIVE_MODELS_PATH := ../../models
MODELS_PATH := /locallm/models

BIND_MOUNT_OPTIONS := ro
OS := $(shell uname -s)
ifeq ($(OS),Linux)
BIND_MOUNT_OPTIONS := ro,Z
endif
MODEL_NAME ?= mistral-7b-instruct-v0.1.Q4_K_M.gguf

.Phony: all
all: build mistral run

.PHONY: build
build:
podman build --squash-all -t $(IMAGE) . -f base/Containerfile
all: build download-model-mistral run

.PHONY: build-cuda
build-cuda:
@@ -46,40 +22,7 @@ build-cuda:
build-vulkan:
podman build --squash-all -t $(VULKAN_IMAGE) . -f vulkan/Containerfile

.PHONY: download-model-tiny-llama
download-model-tiny-llama:
curl -H "Cache-Control: no-cache" -s -S -L -f $(TINY_LLAMA_MODEL_URL) -z $(RELATIVE_MODELS_PATH)/$(LLAMA_MODEL_NAME) -o $(RELATIVE_MODELS_PATH)/$(LLAMA_MODEL_NAME).tmp && mv -f $(RELATIVE_MODELS_PATH)/$(LLAMA_MODEL_NAME).tmp $(RELATIVE_MODELS_PATH)/$(LLAMA_MODEL_NAME) 2>/dev/null || rm -f $(RELATIVE_MODELS_PATH)/$(LLAMA_MODEL_NAME).tmp $(RELATIVE_MODELS_PATH)/$(LLAMA_MODEL_NAME)

.PHONY: download-model-llama
download-model-llama:
curl -H "Cache-Control: no-cache" -s -S -L -f $(LLAMA_MODEL_URL) -z $(RELATIVE_MODELS_PATH)/$(TINY_LLAMA_MODEL_NAME) -o $(RELATIVE_MODELS_PATH)/$(TINY_LLAMA_MODEL_NAME).tmp && mv -f $(RELATIVE_MODELS_PATH)/$(TINY_LLAMA_MODEL_NAME).tmp $(RELATIVE_MODELS_PATH)/$(TINY_LLAMA_MODEL_NAME) 2>/dev/null || rm -f $(RELATIVE_MODELS_PATH)/$(TINY_LLAMA_MODEL_NAME).tmp $(RELATIVE_MODELS_PATH)/$(TINY_LLAMA_MODEL_NAME)


.PHONY: mistral
mistral:
curl -H "Cache-Control: no-cache" -s -S -L -f $(MISTRAL_MODEL_URL) -z $(RELATIVE_MODELS_PATH)/$(MISTRAL_MODEL_NAME) -o $(RELATIVE_MODELS_PATH)/$(MISTRAL_MODEL_NAME).tmp && mv -f $(RELATIVE_MODELS_PATH)/$(MISTRAL_MODEL_NAME).tmp $(RELATIVE_MODELS_PATH)/$(MISTRAL_MODEL_NAME) 2>/dev/null || rm -f $(RELATIVE_MODELS_PATH)/$(MISTRAL_MODEL_NAME).tmp $(RELATIVE_MODELS_PATH)/$(MISTRAL_MODEL_NAME)

.PHONY: install
install:
pip install -r tests/requirements.txt

.PHONY: run
run:
.PHONY: download-model-mistral # default model
download-model-mistral:
cd ../../models && \
podman run -it -d -p $(PORT):$(PORT) -v ./$(SELECTED_MODEL_NAME):$(MODELS_PATH)/model.gguf:$(BIND_MOUNT_OPTIONS) -e MODEL_PATH=$(MODELS_PATH)/model.gguf -e HOST=0.0.0.0 -e PORT=$(PORT) $(IMAGE)

# TODO: Add tests for llamacpp-cuda
# This never fails, placeholder for future test
.PHONY: run-cuda
run-cuda:
cd ../../models && \
podman run -it -d -p $(PORT):$(PORT) -v ./$(SELECTED_MODEL_NAME):$(MODELS_PATH)/model.gguf:$(BIND_MOUNT_OPTIONS) -e MODEL_PATH=$(MODELS_PATH)/model.gguf -e HOST=0.0.0.0 -e PORT=$(PORT) --net=host --device nvidia.com/gpu=all $(IMAGE) || true

# TODO: Add tests for llamacpp-cuda
.PHONY: test-cuda
test-cuda: run-cuda

.PHONY: test
test:
curl -H "Cache-Control: no-cache" -s -S -L -f $(SELECTED_MODEL_URL) -z ./model.gguf -o ./model.gguf.tmp && mv -f ./model.gguf.tmp ./model.gguf 2>/dev/null || rm -f ./model.gguf.tmp ./model.gguf
pytest --log-cli-level NOTSET
make MODEL_NAME=mistral-7b-instruct-v0.1.Q4_K_M.gguf MODEL_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf -f Makefile download-model
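`download-model-mistral` now just delegates to a generic `download-model` target in the top-level `models` Makefile, passing `MODEL_NAME` and `MODEL_URL`. That generic target is not part of this diff; judging by the per-model curl recipes it replaces, its shell equivalent is presumably something like the sketch below (the URL and file name are the Mistral defaults from this Makefile):

```bash
# Assumed shell equivalent of the generic download-model target:
# fetch to a temp file, then move it into place only if the download succeeds.
MODEL_NAME=mistral-7b-instruct-v0.1.Q4_K_M.gguf
MODEL_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf
cd models && \
  curl -H "Cache-Control: no-cache" -s -S -L -f "$MODEL_URL" \
    -z "$MODEL_NAME" -o "$MODEL_NAME.tmp" && \
  mv -f "$MODEL_NAME.tmp" "$MODEL_NAME" || \
  rm -f "$MODEL_NAME.tmp"
```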
17 changes: 9 additions & 8 deletions model_servers/llamacpp_python/README.md
@@ -71,6 +71,7 @@ podman pull quay.io/ai-lab/llamacpp-python-vulkan
```



## Download Model(s)

There are many models to choose from these days, most of which can be found on [huggingface.co](https://huggingface.co). In order to use a model with the llamacpp_python model server, it must be in GGUF format. You can either download pre-converted GGUF models directly or convert them yourself with the [model converter utility](../../convert_models/) available in this repo.
@@ -81,26 +82,26 @@ Download URL: [https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/res

Place all models in the [models](../../models/) directory.

You can use this snippet below to download models.
You can use this snippet below to download the default model:

```bash
cd ../../models
curl -sLO <Download URL>
cd model_servers/llamacpp_python
make -f Makefile download-model-mistral
```

or:
Or you can use the generic `download-model` target from the `/models` directory to download any model file from huggingface:

```bash
make -f Makefile download-model-mistral
make -f Makefile download-model-llama
cd ../../models
make MODEL_NAME=<model_name> MODEL_URL=<model_url> -f Makefile download-model
# EX: make MODEL_NAME=mistral-7b-instruct-v0.1.Q4_K_M.gguf MODEL_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf -f Makefile download-model
```


## Deploy Model Service

### Single Model Service:

To deploy the LLM server you must specify a volume mount `-v` where your models are stored on the host machine and the `MODEL_PATH` for your model of choice. The model_server is most easily deploy from calling the make command: `make -f Makefile run`
To deploy the LLM server you must specify a volume mount `-v` where your models are stored on the host machine and the `MODEL_PATH` for your model of choice. The model_server is most easily deployed by calling the make command: `make -f Makefile run`. As with all of our make calls, you can pass any of the following variables: `REGISTRY`, `IMAGE_NAME`, `MODEL_NAME`, `MODEL_PATH`, and `PORT`.

```bash
podman run --rm -it \
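For the make-based deploy path described in the README hunk above (as opposed to the raw `podman run` invocation it shows), a minimal sketch would be the following; every value is an overridable default rather than a requirement:

```bash
# Deploy via the shared "run" target; override any variable as needed.
cd model_servers/llamacpp_python
make -f Makefile run \
  PORT=8001 \
  MODEL_NAME=mistral-7b-instruct-v0.1.Q4_K_M.gguf
```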
43 changes: 36 additions & 7 deletions model_servers/llamacpp_python/tests/conftest.py
@@ -2,24 +2,53 @@
import os

# For cuda, will add this to below Container: extra_launch_args=["--device", "nvidia.com/gpu=all"],
if not 'REGISTRY' in os.environ:
REGISTRY = 'ghcr.io'
else:
REGISTRY = os.environ['REGISTRY']

if not 'IMAGE_NAME' in os.environ:
IMAGE_NAME = 'containers/llamacpp_python:latest'
else:
IMAGE_NAME = os.environ['IMAGE_NAME']

if not 'MODEL_NAME' in os.environ:
MODEL_NAME = 'mistral-7b-instruct-v0.1.Q4_K_M.gguf'
else:
MODEL_NAME = os.environ['MODEL_NAME']

if not 'MODEL_PATH' in os.environ:
MODEL_PATH = "/locallm/models"
else:
MODEL_PATH = os.environ['MODEL_PATH']

if not 'PORT' in os.environ:
PORT = 8001
else:
PORT = os.environ['PORT']
try:
PORT = int(PORT)
except:
PORT = 8001

MS = pytest_container.Container(
url=f"containers-storage:{os.environ['REGISTRY']}/containers/{os.environ['IMAGE_NAME']}",
url=f"containers-storage:{REGISTRY}/{IMAGE_NAME}",
volume_mounts=[
pytest_container.container.BindMount(
container_path="/locallm/models/model.gguf",
host_path=f"./model.gguf",
container_path="{MODEL_PATH}/{MODEL_NAME}".format(MODEL_PATH=MODEL_PATH, MODEL_NAME=MODEL_NAME),
host_path=f"./{MODEL_NAME}",
flags=["ro"]
)
],
extra_environment_variables={
"MODEL_PATH": "/locallm/models/model.gguf",
"MODEL_PATH": "{MODEL_PATH}/{MODEL_NAME}".format(MODEL_PATH=MODEL_PATH, MODEL_NAME=MODEL_NAME),
"HOST": "0.0.0.0",
"PORT": "8001"
"PORT": f"{PORT}"
},
forwarded_ports=[
pytest_container.PortForwarding(
container_port=8001,
host_port=8001
container_port=PORT,
host_port=PORT
)
],
)
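The test fixture now reads its configuration from environment variables with fallbacks, which is what lets the Makefile's `test` target drive it. A sketch of invoking the suite by hand with explicit values (all of them the defaults from this conftest, shown for illustration) could be:

```bash
# Run the llamacpp_python tests directly, mirroring the Makefile's test target.
# The model file must already exist (or be symlinked) in this directory,
# as the common test target arranges before calling pytest.
cd model_servers/llamacpp_python
REGISTRY=ghcr.io \
IMAGE_NAME=containers/llamacpp_python:latest \
MODEL_NAME=mistral-7b-instruct-v0.1.Q4_K_M.gguf \
MODEL_PATH=/locallm/models \
PORT=8001 \
pytest -vvv -s
```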
1 change: 1 addition & 0 deletions model_servers/llamacpp_python/tests/test_alive.py
@@ -1,6 +1,7 @@
import pytest_container
from .conftest import MS
import tenacity
import os

CONTAINER_IMAGES = [MS]

59 changes: 11 additions & 48 deletions model_servers/whispercpp/Makefile
@@ -1,57 +1,20 @@
PORT := 8001
APP := whispercpp
IMAGE := quay.io/ai-lab/model_servers/$(APP):latest
CUDA_IMAGE := quay.io/ai-lab/model_servers/$(APP)_cuda:latest
VULKAN_IMAGE :=quay.io/ai-lab/model_servers/$(APP)_vulkan:latest
PORT ?= 8001

# ----- MODEL OPTIONS -----
include ../common/Makefile.common

WHISPER_SMALL_MODEL_NAME := ggml-small.bin
WHISPER_SMALL_MODEL_URL := https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin

WHISPER_BASE_MODEL_NAME := ggml-base.en.bin
WHISPER_BASE_MODEL_URL := https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin

SELECTED_MODEL_NAME := $(or $(SELECTED_MODEL),$(WHISPER_SMALL_MODEL_NAME))
SELECTED_MODEL_URL := $(or $(SELECTED_MODEL_LINK),$(WHISPER_SMALL_MODEL_URL))

# --- END MODEL OPTIONS ---
IMAGE_NAME ?= $(REGISTRY_ORG)/$(COMPONENT)/$(APP):latest
IMAGE ?= $(REGISTRY)/$(IMAGE_NAME)
# CUDA_IMAGE_NAME := $(REGISTRY)/$(BASE_IMAGE_NAME)/$(APP)_cuda:latest
# VULKAN_IMAGE := $(REGISTRY)/$(BASE_IMAGE_NAME)/$(APP)_vulkan:latest

MODELS_PATH := /app/models

BIND_MOUNT_OPTIONS := ro
OS := $(shell uname -s)
ifeq ($(OS),Linux)
BIND_MOUNT_OPTIONS := Z,ro
endif
MODEL_NAME ?= ggml-small.bin

.PHONY: all
all: build whisper-small run

.PHONY: build
build:
podman build -t $(IMAGE) . -f Containerfile
all: build download-model-whisper-small run

.PHONY: whisper-small
whisper-small:
.PHONY: download-model-whisper-small # small .bin model type testing
download-model-whisper-small:
cd ../../models && \
curl -s -S -L -f $(WHISPER_SMALL_MODEL_URL) -z $(WHISPER_SMALL_MODEL_NAME) -o $(WHISPER_SMALL_MODEL_NAME).tmp && mv -f $(WHISPER_SMALL_MODEL_NAME).tmp $(WHISPER_SMALL_MODEL_NAME) 2>/dev/null || rm -f $(WHISPER_SMALL_MODEL_NAME).tmp $(WHISPER_SMALL_MODEL_NAME)

.PHONY: install
install:
pip install -r tests/requirements.txt

.PHONY: download-model-whisper-base
download-model-whisper-base:
cd ../../models && \
curl -s -S -L -f $(WHISPER_BASE_MODEL_URL) -z $(WHISPER_BASE_MODEL_NAME) -o $(WHISPER_BASE_MODEL_NAME).tmp && mv -f $(WHISPER_BASE_MODEL_NAME).tmp $(WHISPER_BASE_MODEL_NAME) 2>/dev/null || rm -f $(WHISPER_BASE_MODEL_NAME).tmp $(WHISPER_BASE_MODEL_NAME)

.PHONY: run
run:
cd ../../models && \
podman run -d --rm -it -p $(PORT):$(PORT) -v ./$(SELECTED_MODEL_NAME):$(MODELS_PATH)/$(SELECTED_MODEL_NAME):$(BIND_MOUNT_OPTIONS) -e HOST=0.0.0.0 -e MODEL_PATH=$(MODELS_PATH)/$(SELECTED_MODEL_NAME) -e PORT=$(PORT) $(IMAGE)

.PHONY: test
test:
curl -H "Cache-Control: no-cache" -s -S -L -f $(SELECTED_MODEL_URL) -z ./model.gguf -o ./model.gguf.tmp && mv -f ./model.gguf.tmp ./model.gguf 2>/dev/null || rm -f ./model.gguf.tmp ./model.gguf
pytest --log-cli-level NOTSET
make MODEL_NAME=ggml-small.bin MODEL_URL=https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin -f Makefile download-model
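The whispercpp server follows the same pattern with its own defaults (`ggml-small.bin`, models mounted under `/app/models`). As an illustrative sketch, the download-plus-serve flow might be:

```bash
# Fetch the small whisper model into ../../models, then serve it;
# the values mirror this Makefile's defaults and can be overridden.
cd model_servers/whispercpp
make download-model-whisper-small
make run PORT=8001 MODEL_NAME=ggml-small.bin
```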