# Run LLMs on CPUs with Ollama
#
# Usage:
#
#   sky launch ollama.yaml -c ollama --env MODEL_NAME=llama2
#
# curl /v1/chat/completions:
#
#   ENDPOINT=$(sky status --endpoint 8888 ollama)
#   curl $ENDPOINT/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#       "model": "llama2",
#       "messages": [
#         {
#           "role": "system",
#           "content": "You are a helpful assistant."
#         },
#         {
#           "role": "user",
#           "content": "Who are you?"
#         }
#       ]
#     }'
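#
# To run as an autoscaled service using the `service` section below (a sketch;
# verify the SkyServe CLI flags against your SkyPilot version):
#
#   sky serve up ollama.yaml -n ollama
#   sky serve status ollama   # shows the service endpoint once replicas are ready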

envs:
  MODEL_NAME: llama2  # mistral, phi, and other Ollama-supported models
  OLLAMA_HOST: 0.0.0.0:8888  # Host and port for Ollama to listen on

resources:
  cpus: 4+
  memory: 8+  # 8 GB+ for 7B models, 16 GB+ for 13B models, 32 GB+ for 33B models
  # accelerators: L4:1  # No GPUs necessary for Ollama, but you can use them to run inference faster
  ports: 8888
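
# To launch with a GPU instead of CPUs only (a sketch; pick an accelerator
# available on your cloud):
#
#   sky launch ollama.yaml -c ollama --gpus L4:1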

service:
  replicas: 2
  # An actual request for the readiness probe.
  readiness_probe:
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1
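  # The probe above is roughly this request (a sketch, with MODEL_NAME at its
  # default):
  #
  #   curl $ENDPOINT/v1/chat/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "llama2", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 1}'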

setup: |
  # Install Ollama
  if [ "$(uname -m)" = "aarch64" ]; then
    # ARM64 binary (e.g., Apple silicon VMs or ARM cloud instances)
    sudo curl -L https://ollama.com/download/ollama-linux-arm64 -o /usr/bin/ollama
  else
    sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
  fi
  sudo chmod +x /usr/bin/ollama
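  # Optionally confirm the binary runs before serving (uncomment to check):
  # ollama --version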
  # Start `ollama serve` in the background and capture its PID to kill it
  # after the pull is done
  ollama serve &
  OLLAMA_PID=$!
  # Wait for the Ollama server to be ready (up to 20 tries x 5 s = 100 s)
  IS_READY=false
  for i in {1..20}; do
    ollama list && IS_READY=true && break
    sleep 5
  done
  if [ "$IS_READY" = false ]; then
    echo "Ollama was not ready after 100 seconds. Exiting."
    exit 1
  fi
  # Pull the model
  ollama pull $MODEL_NAME
  echo "Model $MODEL_NAME pulled successfully."
  # Kill the background `ollama serve` now that the pull is done
  kill $OLLAMA_PID
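  # Note: pulled models are cached on disk (by default under ~/.ollama/models),
  # so the `ollama serve` in the run section can load $MODEL_NAME without
  # re-downloading it.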

run: |
  # Run `ollama serve` in the foreground; OLLAMA_HOST (set in envs above)
  # makes it listen on 0.0.0.0:8888
  echo "Serving model $MODEL_NAME"
  ollama serve