
Commit

Merge pull request #339 from MichaelClifford/new_default
make granite default for models/ and llamacpp/
rhatdan authored Apr 25, 2024
2 parents a680d5a + 273ab06 commit df755ca
Showing 4 changed files with 25 additions and 21 deletions.
11 changes: 6 additions & 5 deletions model_servers/llamacpp_python/Makefile
@@ -1,5 +1,6 @@
APP := llamacpp_python
PORT ?= 8001
+CHAT_FORMAT ?= openchat

include ../common/Makefile.common

@@ -9,10 +10,10 @@ CUDA_IMAGE := $(REGISTRY)/$(REGISTRY_ORG)/$(COMPONENT)/$(APP)_cuda:latest
VULKAN_IMAGE := $(REGISTRY)/$(REGISTRY_ORG)/$(COMPONENT)/$(APP)_vulkan:latest

MODELS_PATH := /locallm/models
-MODEL_NAME ?= mistral-7b-instruct-v0.1.Q4_K_M.gguf
+MODEL_NAME ?= granite-7b-lab-Q4_K_M.gguf

.Phony: all
-all: build download-model-mistral run
+all: build download-model-granite run

.PHONY: build-cuda
build-cuda:
@@ -22,7 +23,7 @@ build-cuda:
build-vulkan:
podman build --squash-all -t $(VULKAN_IMAGE) . -f vulkan/Containerfile

-.PHONY: download-model-mistral # default model
-download-model-mistral:
+.PHONY: download-model-granite # default model
+download-model-granite:
cd ../../models && \
-make MODEL_NAME=mistral-7b-instruct-v0.1.Q4_K_M.gguf MODEL_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf -f Makefile download-model
+make MODEL_NAME=granite-7b-lab-Q4_K_M.gguf MODEL_URL=https://huggingface.co/instructlab/granite-7b-lab-GGUF/resolve/main/granite-7b-lab-Q4_K_M.gguf -f Makefile download-model
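
With these defaults, the quick-start flow for the llamacpp_python model server becomes the following (a sketch: the `build` and `run` targets come from `../common/Makefile.common`, and exactly how that file consumes `PORT`, `CHAT_FORMAT`, and `MODEL_NAME` is assumed rather than shown in this diff):

```bash
# Fetch the new default model (granite-7b-lab) into ../../models
make -f Makefile download-model-granite

# Build the server image and run it with the new defaults; override the
# variables on the command line to serve a different GGUF model instead
make -f Makefile build
make MODEL_NAME=granite-7b-lab-Q4_K_M.gguf CHAT_FORMAT=openchat -f Makefile run
```
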
29 changes: 16 additions & 13 deletions model_servers/llamacpp_python/README.md
@@ -76,24 +76,25 @@ podman pull quay.io/ai-lab/llamacpp_python_vulkan

There are many models to choose from these days, most of which can be found on [huggingface.co](https://huggingface.co). In order to use a model with the llamacpp_python model server, it must be in GGUF format. You can either download pre-converted GGUF models directly or convert them yourself with the [model converter utility](../../convert_models/) available in this repo.

-One of the more popular Apache-2.0 Licenesed models that we recommend using if you are just getting started is `mistral-7b-instruct-v0.1`. You can use the link below to quickly download a quantized (smaller) GGUF version of this model for use with the llamacpp_python model server.
+A performant Apache-2.0 licensed model that we recommend using if you are just getting started is
+`granite-7b-lab`. You can use the link below to quickly download a quantized (smaller) GGUF version of this model for use with the llamacpp_python model server.

-Download URL: [https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf)
+Download URL: [https://huggingface.co/instructlab/granite-7b-lab-GGUF/resolve/main/granite-7b-lab-Q4_K_M.gguf](https://huggingface.co/instructlab/granite-7b-lab-GGUF/resolve/main/granite-7b-lab-Q4_K_M.gguf)

Place all models in the [models](../../models/) directory.

You can use this snippet below to download the default model:

```bash
-make -f Makefile download-model-mistral
+make -f Makefile download-model-granite
```

Or you can use the generic `download-model` target from the `/models` directory to download any model file from huggingface:

```bash
cd ../../models
make MODEL_NAME=<model_name> MODEL_URL=<model_url> -f Makefile download-model
-# EX: make MODEL_NAME=mistral-7b-instruct-v0.1.Q4_K_M.gguf MODEL_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf -f Makefile download-model
+# EX: make MODEL_NAME=granite-7b-lab-Q4_K_M.gguf MODEL_URL=https://huggingface.co/instructlab/granite-7b-lab-GGUF/resolve/main/granite-7b-lab-Q4_K_M.gguf -f Makefile download-model
```


@@ -107,9 +108,10 @@ To deploy the LLM server you must specify a volume mount `-v` where your models
podman run --rm -it \
  -p 8001:8001 \
  -v Local/path/to/locallm/models:/locallm/models:ro \
-  -e MODEL_PATH=models/mistral-7b-instruct-v0.1.Q4_K_M.gguf \
+  -e MODEL_PATH=models/granite-7b-lab-Q4_K_M.gguf \
  -e HOST=0.0.0.0 \
  -e PORT=8001 \
+  -e MODEL_CHAT_FORMAT=openchat \
  llamacpp_python
```

@@ -120,31 +122,32 @@ podman run --rm -it \
  --device nvidia.com/gpu=all \
  -p 8001:8001 \
  -v Local/path/to/locallm/models:/locallm/models:ro \
-  -e MODEL_PATH=models/mistral-7b-instruct-v0.1.Q4_K_M.gguf \
+  -e MODEL_PATH=models/granite-7b-lab-Q4_K_M.gguf \
  -e HOST=0.0.0.0 \
  -e PORT=8001 \
+  -e MODEL_CHAT_FORMAT=openchat \
  llamacpp_python
```
### Multiple Model Service:

To enable dynamic loading and unloading of different models present on your machine, you can start the model service with a `CONFIG_PATH` instead of a `MODEL_PATH`.

-Here is an example `models_config.json` with two quantization variants of mistral-7B.
+Here is an example `models_config.json` with two model options.

```json
{
  "host": "0.0.0.0",
  "port": 8001,
  "models": [
    {
-      "model": "models/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
-      "model_alias": "mistral_Q4",
-      "chat_format": "mistral",
+      "model": "models/granite-7b-lab-Q4_K_M.gguf",
+      "model_alias": "granite",
+      "chat_format": "openchat",
    },
    {
-      "model": "models/mistral-7b-instruct-v0.1.Q5_K_M.gguf",
-      "model_alias": "mistral_Q5",
-      "chat_format": "mistral",
+      "model": "models/merlinite-7b-lab-Q4_K_M.gguf",
+      "model_alias": "merlinite",
+      "chat_format": "openchat",
    },

  ]
}
```
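
Launching the service in this multi-model mode then looks much like the single-model examples above, with `CONFIG_PATH` taking the place of `MODEL_PATH` (a sketch: the config file name and its location under the mounted models directory are assumptions):

```bash
# host and port are read from models_config.json here, so only the config path is passed
podman run --rm -it \
  -p 8001:8001 \
  -v Local/path/to/locallm/models:/locallm/models:ro \
  -e CONFIG_PATH=models/models_config.json \
  llamacpp_python
```
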
2 changes: 1 addition & 1 deletion models/Containerfile
@@ -10,7 +10,7 @@
FROM registry.access.redhat.com/ubi9/ubi-micro:9.3-15

# Can be substituted using the --build-arg defined above
-ARG MODEL_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf
+ARG MODEL_URL=https://huggingface.co/instructlab/granite-7b-lab-GGUF/resolve/main/granite-7b-lab-Q4_K_M.gguf

# By default the Model Server container image uses the AI Model stored in the model/model.file file.
WORKDIR /model
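
Since the model location is declared as an `ARG`, the new granite default can still be overridden at image build time without editing the Containerfile (a sketch: the image tag and build context are illustrative, only the `MODEL_URL` build argument comes from the file itself):

```bash
# Hypothetical tag; MODEL_URL is the build argument declared in models/Containerfile
podman build \
  --build-arg MODEL_URL=https://huggingface.co/instructlab/granite-7b-lab-GGUF/resolve/main/granite-7b-lab-Q4_K_M.gguf \
  -t ai-lab/models/granite-7b-lab:latest \
  -f models/Containerfile models/
```
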
4 changes: 2 additions & 2 deletions models/Makefile
@@ -1,5 +1,5 @@
-MODEL_URL ?= https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf
-MODEL_NAME ?= mistral-7b-instruct
+MODEL_URL ?= https://huggingface.co/instructlab/granite-7b-lab-GGUF/resolve/main/granite-7b-lab-Q4_K_M.gguf
+MODEL_NAME ?= granite-7b-lab-Q4_K_M.gguf

REGISTRY ?= quay.io
REGISTRY_ORG ?= ai-lab
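
With these defaults in place, fetching the model used throughout the rest of this change needs no overrides (a sketch, assuming the `download-model` target referenced in the README is defined further down in this Makefile):

```bash
# Downloads granite-7b-lab-Q4_K_M.gguf using the new MODEL_URL/MODEL_NAME defaults;
# pass MODEL_NAME=... MODEL_URL=... to fetch a different GGUF file instead
cd models
make -f Makefile download-model
```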
