This repository has been archived by the owner on Dec 6, 2023. It is now read-only.

Fix petals on amd #108

Merged · 6 commits · Sep 21, 2023
Changes from all commits:
cht-petals/build.sh: 5 changes (1 addition, 4 deletions)

```diff
@@ -1,10 +1,7 @@
 #!/bin/bash
 set -e
-export VERSION=1.0.0
+export VERSION=1.0.1
 source "$(dirname "${BASH_SOURCE[0]}")/../utils.sh"
 
-# TODO: support linux/amd64
-BUILDX_PLATFORM=linux/arm64 TESTS_SKIP_CPU=1 \
 build_cpu ghcr.io/premai-io/chat-stable-beluga-2-cpu petals-team/StableBeluga2 ${@:1}
-BUILDX_PLATFORM=linux/arm64 TESTS_SKIP_CPU=1 \
 build_cpu ghcr.io/premai-io/chat-codellama-34b-cpu premai-io/CodeLlama-34b-Instruct-hf ${@:1}
```
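Besides bumping `VERSION` from 1.0.0 to 1.0.1, this drops the stale `# TODO: support linux/amd64` comment and the `BUILDX_PLATFORM=linux/arm64 TESTS_SKIP_CPU=1` prefixes, so `build_cpu` (defined in `utils.sh`, which is not part of this diff) is no longer pinned to arm64. Presumably it then falls back to its default platform handling, which is what makes these images buildable on amd64 hosts, in line with the PR title.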
cht-petals/docker/cpu/Dockerfile: 8 changes (2 additions, 6 deletions)

```diff
@@ -1,7 +1,5 @@
 FROM python:3.10-slim-bullseye
 
-ARG MODEL_ID
-
 RUN apt update && apt install -y libopenblas-dev ninja-build build-essential wget git
 RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools
 
@@ -12,11 +10,9 @@ COPY requirements.txt ./
 RUN pip install --no-cache-dir -r ./requirements.txt --upgrade pip
 
 COPY download.py .
-
+ARG MODEL_ID
+ENV MODEL_ID=$MODEL_ID
 RUN python3 download.py --model $MODEL_ID
 
 COPY . .
-
-ENV MODEL_ID=$MODEL_ID
-
 CMD python main.py
```
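The Dockerfile change is mostly layer ordering. `ARG MODEL_ID` now appears only after `requirements.txt` has been installed, so switching models no longer invalidates the cached dependency layers, and `ENV MODEL_ID=$MODEL_ID` is set next to the download step rather than at the very end, so the runtime default matches the model baked into the image at build time.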
cht-petals/download.py: 14 changes (9 additions, 5 deletions)

```diff
@@ -1,5 +1,7 @@
 import argparse
+from platform import machine
 
+import torch
 from petals import AutoDistributedModelForCausalLM
 from tenacity import retry, stop_after_attempt, wait_fixed
 from transformers import AutoTokenizer, LlamaTokenizer
@@ -13,11 +15,13 @@
 
 @retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
 def download_model() -> None:
-    if "llama" in args.model.lower():
-        _ = LlamaTokenizer.from_pretrained(args.model)
-    else:
-        _ = AutoTokenizer.from_pretrained(args.model)
-    _ = AutoDistributedModelForCausalLM.from_pretrained(args.model)
+    Tokenizer = LlamaTokenizer if "llama" in args.model.lower() else AutoTokenizer
+    _ = Tokenizer.from_pretrained(args.model)
+
+    kwargs = {}
+    if "x86_64" in machine():
+        kwargs["torch_dtype"] = torch.float32
+    _ = AutoDistributedModelForCausalLM.from_pretrained(args.model, **kwargs)
 
 
 download_model()
```
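Pieced together, the updated download.py should read roughly as follows. This is a sketch: the argparse block sits outside the diff hunks, so the parser lines below are an assumption inferred from the use of `args.model` and from the Dockerfile's `python3 download.py --model $MODEL_ID` invocation; everything else mirrors the diff.

```python
import argparse
from platform import machine

import torch
from petals import AutoDistributedModelForCausalLM
from tenacity import retry, stop_after_attempt, wait_fixed
from transformers import AutoTokenizer, LlamaTokenizer

# Assumed: not visible in the diff, reconstructed from `args.model` usage
# and the Dockerfile's `python3 download.py --model $MODEL_ID` call.
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True)
args = parser.parse_args()


@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def download_model() -> None:
    # Llama-family checkpoints get the dedicated slow tokenizer.
    Tokenizer = LlamaTokenizer if "llama" in args.model.lower() else AutoTokenizer
    _ = Tokenizer.from_pretrained(args.model)

    # The AMD fix: on x86_64 hosts, explicitly load weights as float32.
    kwargs = {}
    if "x86_64" in machine():
        kwargs["torch_dtype"] = torch.float32
    _ = AutoDistributedModelForCausalLM.from_pretrained(args.model, **kwargs)


download_model()
```

Running this once at image build time leaves the weights cached inside the image, so containers start without re-downloading the model.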
cht-petals/models.py: 21 changes (14 additions, 7 deletions)

```diff
@@ -1,7 +1,9 @@
 import os
 from abc import ABC, abstractmethod
+from platform import machine
 from typing import List
 
+import torch
 from petals import AutoDistributedModelForCausalLM
 from transformers import AutoTokenizer, LlamaTokenizer, logging
 
@@ -50,18 +52,23 @@ def generate(
     ) -> List:
         message = messages[-1]["content"]
         inputs = cls.tokenizer(message, return_tensors="pt")["input_ids"]
-        outputs = cls.model.generate(inputs, max_new_tokens=5)
-        print(cls.tokenizer.decode(outputs[0]))
+        outputs = cls.model.generate(inputs, max_new_tokens=max_tokens)
         return [cls.tokenizer.decode(outputs[0])]
 
     @classmethod
     def get_model(cls):
         if cls.model is None:
-            if "llama" in os.getenv("MODEL_ID").lower():
-                cls.tokenizer = LlamaTokenizer.from_pretrained(os.getenv("MODEL_ID"))
-            else:
-                cls.tokenizer = AutoTokenizer.from_pretrained(os.getenv("MODEL_ID"))
+            Tokenizer = (
+                LlamaTokenizer
+                if "llama" in os.getenv("MODEL_ID").lower()
+                else AutoTokenizer
+            )
+            cls.tokenizer = Tokenizer.from_pretrained(os.getenv("MODEL_ID"))
+
+            kwargs = {}
+            if "x86_64" in machine():
+                kwargs["torch_dtype"] = torch.float32
             cls.model = AutoDistributedModelForCausalLM.from_pretrained(
-                os.getenv("MODEL_ID")
+                os.getenv("MODEL_ID"), **kwargs
             )
         return cls.model
```
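For a self-contained view of the same pattern, here is a minimal sketch of the patched class. The class name and the default `max_tokens` are illustrative, since the diff only shows the changed method bodies:

```python
import os
from platform import machine
from typing import List

import torch
from petals import AutoDistributedModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer


class PetalsModel:  # illustrative name; the surrounding class is not shown in the diff
    model = None
    tokenizer = None

    @classmethod
    def generate(cls, messages: List[dict], max_tokens: int = 128) -> List:
        message = messages[-1]["content"]
        inputs = cls.tokenizer(message, return_tensors="pt")["input_ids"]
        # max_new_tokens now honours the caller's max_tokens; the old code
        # hard-coded 5 tokens and leaked a debug print.
        outputs = cls.model.generate(inputs, max_new_tokens=max_tokens)
        return [cls.tokenizer.decode(outputs[0])]

    @classmethod
    def get_model(cls):
        if cls.model is None:  # lazy, one-time load cached on the class
            model_id = os.getenv("MODEL_ID")
            Tokenizer = LlamaTokenizer if "llama" in model_id.lower() else AutoTokenizer
            cls.tokenizer = Tokenizer.from_pretrained(model_id)

            kwargs = {}
            if "x86_64" in machine():
                kwargs["torch_dtype"] = torch.float32  # the core of the AMD fix
            cls.model = AutoDistributedModelForCausalLM.from_pretrained(model_id, **kwargs)
        return cls.model
```

Caching the tokenizer and model at class level keeps `from_pretrained` off the request path; only the first call pays the load cost.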