This repository has been archived by the owner on Dec 6, 2023. It is now read-only.

Commit

llama improvement and stable diff release
filopedraz committed Jun 28, 2023
1 parent 612bd1c commit 0e9883f
Showing 6 changed files with 13 additions and 5 deletions.
3 changes: 3 additions & 0 deletions cht-llama-cpp/models.py
@@ -43,9 +43,12 @@ def generate(
         stream: bool = False,
         max_tokens: int = 256,
         stop: list = [],
+        n_threads: int = None,
         **kwargs,
     ):
         messages = cls.reduce_number_of_messages(messages[::-1], max_tokens)[::-1]
+        if n_threads is not None:
+            cls.model.n_threads = n_threads
         return cls.model.create_chat_completion(
             messages,
             temperature=temperature,
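For reference, a minimal sketch of what the new override amounts to against llama-cpp-python directly (assumptions: cls.model is a llama_cpp.Llama instance, and the model path and thread count below are illustrative, not taken from this commit):

# Sketch only; values are illustrative, not from this commit.
from llama_cpp import Llama

llm = Llama(model_path="./models/vicuna-7b-q4.bin", n_ctx=2048)

# Equivalent of the per-request override added in generate():
# change the thread count on the already-loaded model before completing.
llm.n_threads = 10

result = llm.create_chat_completion(
    [{"role": "user", "content": "Hello!"}],
    temperature=0.2,
    max_tokens=256,
)
print(result["choices"][0]["message"]["content"])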
8 changes: 6 additions & 2 deletions cht-llama-cpp/routes.py
@@ -1,6 +1,7 @@
 import json
 import uuid
 from datetime import datetime as dt
+from typing import List, Optional, Union

 from fastapi import APIRouter, HTTPException
 from fastapi.responses import StreamingResponse
@@ -15,12 +16,13 @@ class ChatCompletionInput(BaseModel):
     top_p: float = 0.95
     n: int = 1
     stream: bool = False
-    stop: str | list | None = []
+    stop: Optional[Union[str, List[str]]] = []
     max_tokens: int = 256
     presence_penalty: float = 0.0
     frequence_penalty: float = 0.0
-    logit_bias: dict | None = {}
+    logit_bias: Optional[dict] = {}
     user: str = ""
+    n_threads: int = None


 class ChatCompletionResponse(BaseModel):
@@ -80,6 +82,7 @@ async def generate_chunk_based_response(body):
         presence_penalty=body.presence_penalty,
         frequence_penalty=body.frequence_penalty,
         logit_bias=body.logit_bias,
+        n_threads=body.n_threads,
     )
     for chunk in chunks:
         yield f"event: completion\ndata: {json.dumps(chunk)}\n\n"
@@ -104,6 +107,7 @@ async def chat_completions(body: ChatCompletionInput):
             presence_penalty=body.presence_penalty,
             frequence_penalty=body.frequence_penalty,
             logit_bias=body.logit_bias,
+            n_threads=body.n_threads,
         )
     except ValueError as error:
         raise HTTPException(
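From the client side, a hedged usage sketch of the new field (assumptions: the service listens on http://localhost:8000, exposes an OpenAI-style /v1/chat/completions route, and returns an OpenAI-shaped response body; only n_threads is new in this commit):

# Sketch; base URL, route path, and response shape are assumptions, not shown in this diff.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "vicuna-7b-q4",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 256,
        "n_threads": 10,  # new per-request override added by this commit
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])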
1 change: 1 addition & 0 deletions cht-llama-cpp/tests/test_views.py
@@ -10,6 +10,7 @@ def test_chat_llama_cpp() -> None:
         json={
             "model": "vicuna-7b-q4",
             "messages": [{"role": "user", "content": "Hello!"}],
+            "n_threads": 10,
         },
     )
     assert response.status_code == 200
2 changes: 1 addition & 1 deletion dfs-diffusers/docker/gpu/Dockerfile
@@ -15,6 +15,6 @@ RUN python3 download.py --model $MODEL_ID
 COPY . .

 ENV MODEL_ID=$MODEL_ID
-ENV DEVICE=gpu
+ENV DEVICE=cuda

 CMD python3 main.py
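The DEVICE change matters because PyTorch accepts "cuda" or "cpu" as device strings, not "gpu". A minimal sketch of how main.py presumably consumes the variable (the getenv defaults and pipeline wiring are assumptions, not part of this diff):

# Sketch; environment defaults and pipeline setup are illustrative assumptions.
import os

import torch
from diffusers import StableDiffusionPipeline

device = os.getenv("DEVICE", "cpu")  # "cuda" in the GPU image after this change
pipe = StableDiffusionPipeline.from_pretrained(
    os.getenv("MODEL_ID", "stabilityai/stable-diffusion-2-1-base"),
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
pipe = pipe.to(device)  # torch.device("gpu") would raise; "cuda" is valid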
2 changes: 1 addition & 1 deletion scripts/cht_llama_cpp.sh
@@ -2,7 +2,7 @@

 set -e

-export VERSION=1.0.2
+export VERSION=1.0.3

 docker buildx build --push \
     --cache-from ghcr.io/premai-io/chat-gpt4all-lora-q4-cpu:latest \
2 changes: 1 addition & 1 deletion scripts/dfs_diffusers.sh
@@ -2,7 +2,7 @@

 set -e

-export VERSION=1.0.0
+export VERSION=1.0.1

 docker buildx build --push \
     --cache-from=ghcr.io/premai-io/diffuser-stable-diffusion-2-1-base-gpu:latest \
