Reverting the TGI image version for LLAMA multiple GPUs in GKE samples
raushan2016 authored Jan 15, 2025
1 parent 434a149 commit c985e95
Showing 7 changed files with 23 additions and 9 deletions.
6 changes: 4 additions & 2 deletions modules/inference-service/main.tf
@@ -89,7 +89,7 @@ resource "kubernetes_deployment" "inference_deployment" {
}
}
container {
image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
image = "ghcr.io/huggingface/text-generation-inference:1.4.3"
name = "mistral-7b-instruct"

port {
@@ -130,7 +130,9 @@ resource "kubernetes_deployment" "inference_deployment" {
mount_path = "/dev/shm"
name = "dshm"
}

+# mountPath is set to /data as it's the path where the HF_HOME environment
+# variable points to in the TGI container image i.e. where the downloaded model from the Hub will be
+# stored
volume_mount {
mount_path = "/data"
name = "data"
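The comment added above (and repeated in the manifests below) relies on the TGI image already pointing its HF_HOME cache at /data. A minimal, hypothetical pod-spec excerpt showing how that cache location could be made explicit instead of relying on the image default; this is a sketch, not part of this commit:

```yaml
# Hypothetical excerpt (not part of this commit): pin the cache path the
# commit's comments describe, rather than relying on the image default.
containers:
  - name: mistral-7b
    image: ghcr.io/huggingface/text-generation-inference:1.4.3
    env:
      - name: HF_HOME        # the TGI image defaults this to /data
        value: /data
    volumeMounts:
      - mountPath: /data     # models downloaded from the Hub land here
        name: data
```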
@@ -104,7 +104,7 @@ Pod Template:
Labels: app=mistral-7b
Containers:
mistral-7b:
-Image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+Image: ghcr.io/huggingface/text-generation-inference:1.4.3
Port: 8080/TCP
Host Port: 0/TCP
Limits:
@@ -28,7 +28,7 @@ spec:
spec:
containers:
- name: mistral-7b
-image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+image: ghcr.io/huggingface/text-generation-inference:1.4.3
resources:
limits:
nvidia.com/gpu: 1
@@ -47,6 +47,9 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
+# mountPath is set to /data as it's the path where the HF_HOME environment
+# variable points to in the TGI container image i.e. where the downloaded model from the Hub will be
+# stored
- mountPath: /data
name: data
volumes:
@@ -77,7 +77,7 @@ Create a node pool for deploying Mixtral 7B with quadpod deployment L4 GPU {4 x
```bash
gcloud container node-pools create mixtral-moe-gpu-pool \
--cluster=mixtral8x7-cluster-gke \
-    --project=gke-aishared-dev \
+    --project=${PROJECT_ID} \
--machine-type=g2-standard-48 \
--ephemeral-storage-local-ssd=count=4 \
--accelerator=type=nvidia-l4,count=4 \
@@ -127,7 +127,7 @@ Pod Template:
Labels: app=mixtral8x7b
Containers:
mixtral8x7b:
-Image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+Image: ghcr.io/huggingface/text-generation-inference:1.4.3
Port: 8080/TCP
Host Port: 0/TCP
Limits:
@@ -30,7 +30,7 @@ spec:
cloud.google.com/gke-accelerator: "nvidia-l4"
containers:
- name: mixtral8x7b
-image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+image: ghcr.io/huggingface/text-generation-inference:1.4.3
ports:
- name: server-port
containerPort: 8080
@@ -53,6 +53,9 @@ spec:
memory: "42Gi"
nvidia.com/gpu: "2"
volumeMounts:
+# mountPath is set to /data as it's the path where the HF_HOME environment
+# variable points to in the TGI container image i.e. where the downloaded model from the Hub will be
+# stored
- mountPath: /data
name: ephemeral-volume
- mountPath: /dev/shm
@@ -76,7 +76,7 @@ spec:
spec:
containers:
- name: llama-2-70b
-image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+image: ghcr.io/huggingface/text-generation-inference:1.4.3
resources:
limits:
nvidia.com/gpu: 2
@@ -97,6 +97,9 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
+# mountPath is set to /data as it's the path where the HF_HOME environment
+# variable points to in the TGI container image i.e. where the downloaded model from the Hub will be
+# stored
- mountPath: /data
name: data
volumes:
@@ -28,7 +28,7 @@ spec:
spec:
containers:
- name: llama-2-70b
-image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+image: ghcr.io/huggingface/text-generation-inference:1.4.3
resources:
limits:
nvidia.com/gpu: 2
@@ -49,6 +49,9 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
+# mountPath is set to /data as it's the path where the HF_HOME environment
+# variable points to in the TGI container image i.e. where the downloaded model from the Hub will be
+# stored
- mountPath: /data
name: data
volumes:
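After applying these manifests, the rolled-back tag can be confirmed by reading the image straight off the Deployment. A minimal sketch; the deployment name mistral-7b is taken from the manifests above, and the default namespace is an assumption:

```bash
# Prints the container image currently set on the Deployment.
# "mistral-7b" comes from the manifests in this commit; add -n <namespace>
# if the workload was not deployed to the default namespace.
kubectl get deployment mistral-7b \
  -o jsonpath='{.spec.template.spec.containers[0].image}'
# Expected output: ghcr.io/huggingface/text-generation-inference:1.4.3
```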
