Merge pull request #84 from anyscale/serverless-llm

Make llm serving template serverless

ericl authored Feb 23, 2024
2 parents 5768dfd + 254fa33 commit 5377a9a
Showing 3 changed files with 22 additions and 313 deletions.
144 changes: 3 additions & 141 deletions configs/endpoints_v2/aws.yaml
@@ -3,147 +3,9 @@ head_node_type:
   instance_type: m5.xlarge
   resources:
     cpu: 0
-worker_node_types:
-- name: cpu-worker
-  instance_type: m5.xlarge
-  min_workers: 0
-  max_workers: 100
-  use_spot: false
-- name: gpu-worker-t4-1
-  instance_type: g4dn.2xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:T4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-t4-4
-  instance_type: g4dn.12xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:T4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-a10g-1
-  instance_type: g5.4xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A10G": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-a10g-4
-  instance_type: g5.12xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A10G": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-a10g-8
-  instance_type: g5.48xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A10G": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-v100-1
-  instance_type: p3.2xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:V100": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-v100-4
-  instance_type: p3.8xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:V100": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-a100-40g-8
-  instance_type: p4d.24xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-40G": 1
-  aws_advanced_configurations_json:
-    BlockDeviceMappings:
-      - DeviceName: /dev/sda1
-        Ebs:
-          DeleteOnTermination: true
-          VolumeSize: 1000
-    TagSpecifications:
-      - ResourceType: instance
-        Tags:
-          - Key: as-feature-multi-zone
-            Value: "true"
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-80g-8
-  instance_type: p4de.24xlarge
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-80G": 1
-  aws_advanced_configurations_json:
-    BlockDeviceMappings:
-      - DeviceName: /dev/sda1
-        Ebs:
-          DeleteOnTermination: true
-          VolumeSize: 1000
-    TagSpecifications:
-      - ResourceType: instance
-        Tags:
-          - Key: as-feature-multi-zone
-            Value: "true"
-  min_workers: 0
-  max_workers: 100
+worker_node_types: []
+auto_select_worker_config: true
+# TODO(shomil): remove once set by default in OA
 aws:
   TagSpecifications:
   - ResourceType: instance
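Both compute configs collapse to the same serverless shape: the hard-coded worker fleet is deleted, and auto_select_worker_config: true lets the platform pick worker nodes on demand, per the PR title. A sketch of what configs/endpoints_v2/aws.yaml looks like after this commit, reconstructed only from the context and added lines in the diff above (the gcp.yaml diff below follows the same pattern):

# Sketch of configs/endpoints_v2/aws.yaml after this commit, pieced together
# from the diff's context lines; the trailing aws: TagSpecifications block is
# truncated in the diff view and omitted here.
head_node_type:
  instance_type: m5.xlarge
  resources:
    cpu: 0                        # keep Ray workloads off the head node
worker_node_types: []             # no hard-coded worker fleet
auto_select_worker_config: true   # workers chosen automatically ("serverless")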
175 changes: 3 additions & 172 deletions configs/endpoints_v2/gcp.yaml
@@ -3,178 +3,9 @@ head_node_type:
   instance_type: n2-standard-4
   resources:
     cpu: 0
-worker_node_types:
-- name: cpu-worker
-  instance_type: n2-standard-4
-  min_workers: 0
-  max_workers: 100
-  use_spot: false
-- name: gpu-worker-t4-1
-  instance_type: n1-standard-8-nvidia-t4-16gb-1
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:T4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-l4-1
-  instance_type: g2-standard-16-nvidia-l4-1
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:L4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-l4-2
-  instance_type: g2-standard-24-nvidia-l4-2
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:L4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-l4-4
-  instance_type: g2-standard-48-nvidia-l4-4
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:L4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-l4-8
-  instance_type: g2-standard-96-nvidia-l4-8
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:L4": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-v100-1
-  instance_type: n1-standard-32-nvidia-v100-16gb-4
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:V100": 1
-  min_workers: 0
-  max_workers: 100
-  use_spot: true
-  fallback_to_ondemand: true
-- name: gpu-worker-a100-40g-1
-  instance_type: a2-highgpu-1g-nvidia-a100-40gb-1
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-40G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-40g-2
-  instance_type: a2-highgpu-2g-nvidia-a100-40gb-2
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-40G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-40g-4
-  instance_type: a2-highgpu-4g-nvidia-a100-40gb-4
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-40G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-40g-8
-  instance_type: a2-highgpu-8g-nvidia-a100-40gb-8
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-40G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-80g-1
-  instance_type: a2-ultragpu-1g-nvidia-a100-80gb-1
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-80G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-80g-2
-  instance_type: a2-ultragpu-2g-nvidia-a100-80gb-2
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-80G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-80g-4
-  instance_type: a2-ultragpu-4g-nvidia-a100-80gb-4
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-80G": 1
-  min_workers: 0
-  max_workers: 100
-- name: gpu-worker-a100-80g-8
-  instance_type: a2-ultragpu-8g-nvidia-a100-80gb-8
-  resources:
-    cpu:
-    gpu:
-    memory:
-    object_store_memory:
-    custom_resources:
-      "accelerator_type:A100-80G": 1
-  min_workers: 0
-  max_workers: 100
+worker_node_types: []
+auto_select_worker_config: true
+# TODO(shomil): remove once set by default in OA
 gcp_advanced_configurations_json:
   instance_properties:
     labels:
16 changes: 16 additions & 0 deletions templates/intro-services/main.py
@@ -0,0 +1,16 @@
+import requests
+from fastapi import FastAPI
+from ray import serve
+
+fastapi = FastAPI()
+
+@serve.deployment
+@serve.ingress(fastapi)
+class FastAPIDeployment:
+    # FastAPI will automatically parse the HTTP request for us.
+    # Check out https://docs.ray.io/en/latest/serve/http-guide.html
+    @fastapi.get("/hello")
+    def say_hello(self, name: str) -> str:
+        return f"Hello {name}!"
+
+my_app = FastAPIDeployment.bind()
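The new template file binds the deployment but never starts it, and the requests import hints at how it is meant to be exercised. A minimal, hypothetical smoke test (not part of the commit), assuming the app is running locally on Ray Serve's default port 8000, e.g. started with `serve run main:my_app`:

# Hypothetical smoke test for the new intro-services template (not in the
# commit). Assumes the app is already serving locally, e.g. via:
#   serve run main:my_app
import requests

# FastAPI parses the query string into say_hello's `name` parameter.
resp = requests.get("http://localhost:8000/hello", params={"name": "Ray"})
print(resp.json())  # "Hello Ray!"

Binding without running is what lets an external deployer, such as the serve CLI or an Anyscale Service, own the app's lifecycle.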