Improvements #23

Merged
3 commits merged on Aug 22, 2024
53 changes: 53 additions & 0 deletions .github/workflows/docker-build-publish.yml
@@ -0,0 +1,53 @@
# This workflow is provided via the organization template repository
#
# https://github.com/nextcloud/.github
# https://docs.github.com/en/actions/learn-github-actions/sharing-workflows-with-your-organization

name: Docker build and publish

on:
push:
tags:
- '**'

env:
APP_ID: translate2

jobs:
build_and_publish:
runs-on: ubuntu-latest

# Only allowed to be run on nextcloud repositories
if: ${{ github.repository_owner == 'nextcloud' }}

steps:
- name: Checkout app
uses: actions/checkout@v4

- name: Get app version
id: appinfo
uses: skjnldsv/xpath-action@7e6a7c379d0e9abc8acaef43df403ab4fc4f770c # master
with:
filename: appinfo/info.xml
expression: "/info/version/text()"

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Build and push
uses: docker/build-push-action@v6
with:
context: .
push: true
tags: |
ghcr.io/nextcloud/${{ env.APP_ID }}:latest
ghcr.io/nextcloud/${{ env.APP_ID }}:${{ fromJson(steps.appinfo.outputs.result).version }}
cache-from: type=gha
cache-to: type=gha,mode=max
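
The tag list above combines a fixed latest tag with the app version that the skjnldsv/xpath-action step extracts from appinfo/info.xml; judging by fromJson(steps.appinfo.outputs.result).version, the action's result is presumably a JSON object with a version field. A minimal Python sketch of the same lookup, assuming the usual Nextcloud appinfo/info.xml layout with <version> as a direct child of <info>:

import os
import xml.etree.ElementTree as ET

# Read the app version the way the workflow's XPath step does (/info/version/text()).
version = ET.parse("appinfo/info.xml").getroot().findtext("version")
app_id = os.environ.get("APP_ID", "translate2")  # mirrors env.APP_ID in the workflow
print(f"ghcr.io/nextcloud/{app_id}:latest")
print(f"ghcr.io/nextcloud/{app_id}:{version}")
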
1 change: 1 addition & 0 deletions .github/workflows/integration_test.yml
@@ -41,6 +41,7 @@ jobs:
APP_ID: llm2
APP_PORT: 9080
APP_SECRET: 12345
COMPUTE_DEVICE: cpu
NEXTCLOUD_URL: http://localhost:8080

services:
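
The added COMPUTE_DEVICE: cpu entry matches the environment variable that the reworked lib/chains.py (further down in this diff) reads to decide whether model layers are offloaded to a GPU; the CI runner has no GPU, so the integration test pins it to cpu. Roughly, on the app side:

import os

# Mirrors the lookup added in lib/chains.py: unset means "cuda" (GPU offloading),
# while the integration test above exports COMPUTE_DEVICE=cpu to stay CPU-only.
compute_device = os.getenv("COMPUTE_DEVICE", "cuda")
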
26 changes: 16 additions & 10 deletions Dockerfile
@@ -3,23 +3,29 @@ FROM nvidia/cuda:12.2.2-cudnn8-devel-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive

RUN apt update
RUN apt install -y pipx build-essential
RUN apt install -y pipx build-essential git vim
RUN pipx install poetry

ENV DEBIAN_FRONTEND=dialog
ENV PATH="/root/.local/bin:${PATH}"
ENV CMAKE_ARGS="-DGGML_CUDA=on"

WORKDIR /app

# Install requirements
COPY pyproject.toml .
COPY poetry.lock .
COPY healthcheck.sh .

RUN poetry install
RUN ln -s /usr/local/cuda/compat/libcuda.so.1 /usr/lib/x86_64-linux-gnu/

ADD li[b] /app/lib
ADD model[s] /app/models
ADD default_confi[g] /app/default_config

# Install requirements
COPY pyproject.toml /app
COPY poetry.lock /app
COPY init.sh /app
COPY healthcheck.sh /app

WORKDIR /app
ENTRYPOINT ["bash", "init.sh"]
WORKDIR /app/lib
ENTRYPOINT ["poetry", "run", "python3", "main.py"]

LABEL org.opencontainers.image.source=https://github.com/nextcloud/llm2
HEALTHCHECK --interval=2s --timeout=2s --retries=300 CMD /app/healthcheck.sh
HEALTHCHECK --interval=2s --timeout=2s --retries=300 CMD /app/healthcheck.sh
53 changes: 28 additions & 25 deletions default_config/config.json
@@ -1,58 +1,61 @@
{
"llama-2-7b-chat.Q4_K_M": {
"prompt": "<|im_start|> system\n{system_prompt}\n<|im_end|>\n<|im_start|> user\n{user_prompt}\n<|im_end|>\n<|im_start|> assistant\n",
"gpt4all_config": {
"max_tokens": 4096,
"n_predict": 2048,
"loader_config": {
"n_ctx": 4096,
"max_tokens": 2048,
"stop": ["<|im_end|>"]
}
},
"gpt4all-falcon-q4_0": {
"prompt": "### Instruction: {system_prompt}\n{user_prompt}\n### Response:",
"gpt4all_config": {
"max_tokens": 4096,
"n_predict": 2048,
"loader_config": {
"n_ctx": 4096,
"max_tokens": 2048,
"stop": ["### Instruction:"]
}
},
"leo-hessianai-13b-chat-bilingual.Q4_K_M": {
"prompt": "<|im_start|> system\n{system_prompt}\n<|im_end|>\n<|im_start|> user\n{user_prompt}\n<|im_end|>\n<|im_start|> assistant\n",
"gpt4all_config": {
"max_tokens": 4096,
"n_predict": 2048,
"loader_config": {
"n_ctx": 4096,
"max_tokens": 2048,
"stop": ["<|im_end|>"]
}
},
"neuralbeagle14-7b.Q4_K_M": {
"prompt": "<|im_start|> system\n{system_prompt}\n<|im_end|>\n<|im_start|> user\n{user_prompt}\n<|im_end|>\n<|im_start|> assistant\n",
"gpt4all_config": {
"max_tokens": 8000,
"n_predict": 4000,
"loader_config": {
"n_ctx": 8000,
"max_tokens": 4000,
"stop": ["<|im_end|>"]
}
},
"Meta-Llama-3-8B-Instruct.Q4_K_M": {
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n",
"gpt4all_config": {
"max_tokens": 8000,
"n_predict": 4000,
"stop": ["<|eot_id|>"]
"loader_config": {
"n_ctx": 8000,
"max_tokens": 4000,
"stop": ["<|eot_id|>"],
"temperature": 0.3
}
},
"Meta-Llama-3.1-8B-Instruct.Q4_K_M": {
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n",
"gpt4all_config": {
"max_tokens": 128000,
"n_predict": 60000,
"stop": ["<|eot_id|>"]
"loader_config": {
"n_ctx": 128000,
"max_tokens": 60000,
"stop": ["<|eot_id|>"],
"temperature": 0.3
}
},
"default": {
"prompt": "<|im_start|> system\n{system_prompt}\n<|im_end|>\n<|im_start|> user\n{user_prompt}\n<|im_end|>\n<|im_start|> assistant\n",
"gpt4all_config": {
"max_tokens": 4096,
"n_predict": 2048,
"stop": ["<|im_end|>"]
"loader_config": {
"n_ctx": 4096,
"max_tokens": 2048,
"stop": ["<|im_end|>"],
"temperature": 0.6
}
}
}
}
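
With this change every model entry in config.json pairs a prompt template with a loader_config block (n_ctx, max_tokens, stop, and optionally temperature) that the reworked chains.py unpacks straight into the LlamaCpp constructor, and the "default" entry presumably serves as the fallback for models without their own section. A small sketch of reading such a file, with the model name and the fallback behaviour as illustrative assumptions:

import json

with open("default_config/config.json") as f:
    config = json.load(f)

model_name = "Meta-Llama-3-8B-Instruct.Q4_K_M"            # example key from the file above
model_config = config.get(model_name, config["default"])  # assumed fallback to "default"

prompt_template = model_config["prompt"]       # chat template with {system_prompt}/{user_prompt}
loader_kwargs = model_config["loader_config"]  # passed as keyword arguments to the model loader
print(prompt_template.format(system_prompt="You are a helpful assistant.", user_prompt="Hello"))
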
7 changes: 0 additions & 7 deletions init.sh

This file was deleted.

46 changes: 10 additions & 36 deletions lib/chains.py
@@ -2,19 +2,15 @@
"""

import os
import json

from free_prompt import FreePromptChain
from headline import HeadlineChain
from topics import TopicsChain
from summarize import SummarizeChain
from contextwrite import ContextWriteChain
from reformulate import ReformulateChain
from simplify import SimplifyChain
from formalize import FormalizeChain
from langchain_community.llms import LlamaCpp
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import json
from nc_py_api.ex_app import persistent_storage

dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -42,18 +38,6 @@ def get_model_config(file_name):
return model_config


config = {
"llama": {
"n_batch": 10,
"n_ctx": 4096,
"n_gpu_layers": -1,
"model_kwargs": {
"device": "cuda"
}
}
}


def generate_llm_chain(file_name):
model_config = get_model_config(file_name)

@@ -62,35 +46,24 @@ def generate_llm_chain(file_name):
if not os.path.exists(path):
path = os.path.join(persistent_storage(), file_name)

compute_device = os.getenv("COMPUTE_DEVICE", "cuda")
try:
llm = LlamaCpp(
model_path=path,
model_kwargs={'device': config["llama"]["model_kwargs"]["device"]},
n_gpu_layers=config["llama"]["n_gpu_layers"],
n_ctx=model_config['gpt4all_config']["n_predict"],
max_tokens=model_config["gpt4all_config"]["max_tokens"],
stop=model_config["gpt4all_config"]["stop"],
echo=True
**{
"n_gpu_layers": (0, -1)[compute_device != "cpu"],
**model_config["loader_config"],
},
)
print(f'Using: {config["llama"]["model_kwargs"]["device"]}', flush=True)
except Exception as gpu_error:
try:
llm = LlamaCpp(model_path=path, device="cpu",
n_ctx=model_config['gpt4all_config']["n_predict"],
max_tokens=model_config["gpt4all_config"]["max_tokens"],
stop=model_config["gpt4all_config"]["stop"],
echo=True)
print("Using: CPU", flush=True)
except Exception as cpu_error:
raise RuntimeError(f"Error: Failed to initialize the LLM model on both GPU and CPU.", f"{cpu_error}") from cpu_error
except Exception as e:
print(f"Failed to load model '{path}' with compute device '{compute_device}'")
raise e

prompt = PromptTemplate.from_template(model_config['prompt'])

return LLMChain(llm=llm, prompt=prompt)




def generate_chains():
chains = {}
for file in os.scandir(models_folder_path):
@@ -104,6 +77,7 @@ def generate_chains():

return chains


def generate_chain_for_model(file_name, chains):
model_name = file_name.split('.gguf')[0]
