Reverting the TGI image version for LLAMA multiple GPUs in GKE samples
raushan2016 authored Jan 15, 2025
1 parent 434a149 commit c985e95
Showing 7 changed files with 23 additions and 9 deletions.
6 changes: 4 additions & 2 deletions modules/inference-service/main.tf
@@ -89,7 +89,7 @@ resource "kubernetes_deployment" "inference_deployment" {
}
}
container {
image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
image = "ghcr.io/huggingface/text-generation-inference:1.4.3"
name = "mistral-7b-instruct"

port {
@@ -130,7 +130,9 @@ resource "kubernetes_deployment" "inference_deployment" {
mount_path = "/dev/shm"
name = "dshm"
}

+# mountPath is set to /data as it's the path where the HF_HOME environment
+# variable points to in the TGI container image i.e. where the downloaded model from the Hub will be
+# stored
volume_mount {
mount_path = "/data"
name = "data"
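The comment added above (and repeated in the manifests below) relies on the TGI image already pointing its HF_HOME cache at /data. A minimal, hypothetical pod-spec excerpt showing how that cache location could be made explicit instead of relying on the image default; this is a sketch, not part of this commit:

```yaml
# Hypothetical excerpt (not part of this commit): pin the cache path the
# commit's comments describe, rather than relying on the image default.
containers:
  - name: mistral-7b
    image: ghcr.io/huggingface/text-generation-inference:1.4.3
    env:
      - name: HF_HOME        # the TGI image defaults this to /data
        value: /data
    volumeMounts:
      - mountPath: /data     # models downloaded from the Hub land here
        name: data
```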
@@ -104,7 +104,7 @@ Pod Template:
Labels: app=mistral-7b
Containers:
mistral-7b:
-Image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+Image: ghcr.io/huggingface/text-generation-inference:1.4.3
Port: 8080/TCP
Host Port: 0/TCP
Limits:
@@ -28,7 +28,7 @@ spec:
spec:
containers:
- name: mistral-7b
-image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+image: ghcr.io/huggingface/text-generation-inference:1.4.3
resources:
limits:
nvidia.com/gpu: 1
@@ -47,6 +47,9 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
+# mountPath is set to /data as it's the path where the HF_HOME environment
+# variable points to in the TGI container image i.e. where the downloaded model from the Hub will be
+# stored
- mountPath: /data
name: data
volumes:
@@ -77,7 +77,7 @@ Create a node pool for deploying Mixtral 7B with quadpod deployment L4 GPU {4 x
```bash
gcloud container node-pools create mixtral-moe-gpu-pool \
--cluster=mixtral8x7-cluster-gke \
-    --project=gke-aishared-dev \
+    --project=${PROJECT_ID} \
--machine-type=g2-standard-48 \
--ephemeral-storage-local-ssd=count=4 \
--accelerator=type=nvidia-l4,count=4 \
@@ -127,7 +127,7 @@ Pod Template:
Labels: app=mixtral8x7b
Containers:
mixtral8x7b:
-Image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+Image: ghcr.io/huggingface/text-generation-inference:1.4.3
Port: 8080/TCP
Host Port: 0/TCP
Limits:
@@ -30,7 +30,7 @@ spec:
cloud.google.com/gke-accelerator: "nvidia-l4"
containers:
- name: mixtral8x7b
-image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+image: ghcr.io/huggingface/text-generation-inference:1.4.3
ports:
- name: server-port
containerPort: 8080
@@ -53,6 +53,9 @@ spec:
memory: "42Gi"
nvidia.com/gpu: "2"
volumeMounts:
+# mountPath is set to /data as it's the path where the HF_HOME environment
+# variable points to in the TGI container image i.e. where the downloaded model from the Hub will be
+# stored
- mountPath: /data
name: ephemeral-volume
- mountPath: /dev/shm
@@ -76,7 +76,7 @@ spec:
spec:
containers:
- name: llama-2-70b
-image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+image: ghcr.io/huggingface/text-generation-inference:1.4.3
resources:
limits:
nvidia.com/gpu: 2
@@ -97,6 +97,9 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
+# mountPath is set to /data as it's the path where the HF_HOME environment
+# variable points to in the TGI container image i.e. where the downloaded model from the Hub will be
+# stored
- mountPath: /data
name: data
volumes:
@@ -28,7 +28,7 @@ spec:
spec:
containers:
- name: llama-2-70b
-image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+image: ghcr.io/huggingface/text-generation-inference:1.4.3
resources:
limits:
nvidia.com/gpu: 2
@@ -49,6 +49,9 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
+# mountPath is set to /data as it's the path where the HF_HOME environment
+# variable points to in the TGI container image i.e. where the downloaded model from the Hub will be
+# stored
- mountPath: /data
name: data
volumes:
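After applying these manifests, the rolled-back tag can be confirmed by reading the image straight off the Deployment. A minimal sketch; the deployment name mistral-7b is taken from the manifests above, and the default namespace is an assumption:

```bash
# Prints the container image currently set on the Deployment.
# "mistral-7b" comes from the manifests in this commit; add -n <namespace>
# if the workload was not deployed to the default namespace.
kubectl get deployment mistral-7b \
  -o jsonpath='{.spec.template.spec.containers[0].image}'
# Expected output: ghcr.io/huggingface/text-generation-inference:1.4.3
```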
