Merge pull request #84 from anyscale/serverless-llm

Make llm serving template serverless

ericl authored Feb 23, 2024
2 parents 5768dfd + 254fa33 commit 5377a9a
Showing 3 changed files with 22 additions and 313 deletions.
144 changes: 3 additions & 141 deletions configs/endpoints_v2/aws.yaml
@@ -3,147 +3,9 @@ head_node_type:
   instance_type: m5.xlarge
   resources:
     cpu: 0
-worker_node_types:
-- name: cpu-worker
-  instance_type: m5.xlarge
-  min_workers: 0
-  max_workers: 100
-  use_spot: false
-- name: gpu-worker-t4-1
-  instance_type: g4dn.2xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:T4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-t4-4
-  instance_type: g4dn.12xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:T4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-a10g-1
-  instance_type: g5.4xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A10G": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-a10g-4
-  instance_type: g5.12xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A10G": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-a10g-8
-  instance_type: g5.48xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A10G": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-v100-1
-  instance_type: p3.2xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:V100": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-v100-4
-  instance_type: p3.8xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:V100": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-a100-40g-8
-  instance_type: p4d.24xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-40G": 1
-  aws_advanced_configurations_json:
-    BlockDeviceMappings:
-      - DeviceName: /dev/sda1
-        Ebs:
-          DeleteOnTermination: true
-          VolumeSize: 1000
-    TagSpecifications:
-      - ResourceType: instance
-        Tags:
-          - Key: as-feature-multi-zone
-            Value: "true"
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-80g-8
-  instance_type: p4de.24xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-80G": 1
-  aws_advanced_configurations_json:
-    BlockDeviceMappings:
-      - DeviceName: /dev/sda1
-        Ebs:
-          DeleteOnTermination: true
-          VolumeSize: 1000
-    TagSpecifications:
-      - ResourceType: instance
-        Tags:
-          - Key: as-feature-multi-zone
-            Value: "true"
-  min_workers: 0
-  max_workers: 100
+worker_node_types: []
+auto_select_worker_config: true
+# TODO(shomil): remove once set by default in OA
 aws:
   TagSpecifications:
   - ResourceType: instance
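Both compute configs collapse to the same serverless shape: the hard-coded worker fleet is deleted, and auto_select_worker_config: true lets the platform pick worker nodes on demand, per the PR title. A sketch of what configs/endpoints_v2/aws.yaml looks like after this commit, reconstructed only from the context and added lines in the diff above (the gcp.yaml diff below follows the same pattern):

# Sketch of configs/endpoints_v2/aws.yaml after this commit, pieced together
# from the diff's context lines; the trailing aws: TagSpecifications block is
# truncated in the diff view and omitted here.
head_node_type:
  instance_type: m5.xlarge
  resources:
    cpu: 0                        # keep Ray workloads off the head node
worker_node_types: []             # no hard-coded worker fleet
auto_select_worker_config: true   # workers chosen automatically ("serverless")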
175 changes: 3 additions & 172 deletions configs/endpoints_v2/gcp.yaml
@@ -3,178 +3,9 @@ head_node_type:
   instance_type: n2-standard-4
   resources:
     cpu: 0
-worker_node_types:
-- name: cpu-worker
-  instance_type: n2-standard-4
-  min_workers: 0
-  max_workers: 100
-  use_spot: false
-- name: gpu-worker-t4-1
-  instance_type: n1-standard-8-nvidia-t4-16gb-1
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:T4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-l4-1
-  instance_type: g2-standard-16-nvidia-l4-1
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:L4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-l4-2
-  instance_type: g2-standard-24-nvidia-l4-2
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:L4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-l4-4
-  instance_type: g2-standard-48-nvidia-l4-4
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:L4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-l4-8
-  instance_type: g2-standard-96-nvidia-l4-8
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:L4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-v100-1
-  instance_type: n1-standard-32-nvidia-v100-16gb-4
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:V100": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-a100-40g-1
-  instance_type: a2-highgpu-1g-nvidia-a100-40gb-1
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-40G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-40g-2
-  instance_type: a2-highgpu-2g-nvidia-a100-40gb-2
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-40G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-40g-4
-  instance_type: a2-highgpu-4g-nvidia-a100-40gb-4
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-40G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-40g-8
-  instance_type: a2-highgpu-8g-nvidia-a100-40gb-8
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-40G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-80g-1
-  instance_type: a2-ultragpu-1g-nvidia-a100-80gb-1
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-80G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-80g-2
-  instance_type: a2-ultragpu-2g-nvidia-a100-80gb-2
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-80G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-80g-4
-  instance_type: a2-ultragpu-4g-nvidia-a100-80gb-4
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-80G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-80g-8
-  instance_type: a2-ultragpu-8g-nvidia-a100-80gb-8
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-80G": 1
-  min_workers: 0
-  max_workers: 100
+worker_node_types: []
+auto_select_worker_config: true
+# TODO(shomil): remove once set by default in OA
 gcp_advanced_configurations_json:
   instance_properties:
     labels:
16 changes: 16 additions & 0 deletions templates/intro-services/main.py
@@ -0,0 +1,16 @@
+import requests
+from fastapi import FastAPI
+from ray import serve
+
+fastapi = FastAPI()
+
+@serve.deployment
+@serve.ingress(fastapi)
+class FastAPIDeployment:
+    # FastAPI will automatically parse the HTTP request for us.
+    # Check out https://docs.ray.io/en/latest/serve/http-guide.html
+    @fastapi.get("/hello")
+    def say_hello(self, name: str) -> str:
+        return f"Hello {name}!"
+
+my_app = FastAPIDeployment.bind()
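The new template file binds the deployment but never starts it, and the requests import hints at how it is meant to be exercised. A minimal, hypothetical smoke test (not part of the commit), assuming the app is running locally on Ray Serve's default port 8000, e.g. started with `serve run main:my_app`:

# Hypothetical smoke test for the new intro-services template (not in the
# commit). Assumes the app is already serving locally, e.g. via:
#   serve run main:my_app
import requests

# FastAPI parses the query string into say_hello's `name` parameter.
resp = requests.get("http://localhost:8000/hello", params={"name": "Ray"})
print(resp.json())  # "Hello Ray!"

Binding without running is what lets an external deployer, such as the serve CLI or an Anyscale Service, own the app's lifecycle.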