-
Notifications
You must be signed in to change notification settings - Fork 17
/
fast-llm.pytorchjob.yaml
127 lines (127 loc) · 4.13 KB
/
fast-llm.pytorchjob.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
---
# Kubeflow PyTorchJob launching a 4-node (1 Master + 3 Worker) Fast-LLM
# training run: 8 GPUs per node, RDMA-enabled, torchrun static rendezvous.
apiVersion: "kubeflow.org/v1"
kind: "PyTorchJob"
metadata:
  name: "fast-llm"
spec:
  # Processes launched per node; exposed to the container as PET_NPROC_PER_NODE.
  nprocPerNode: "8"
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          # Allow scheduling onto GPU-tainted nodes.
          tolerations:
            - key: nvidia.com/gpu
              value: "true"
              operator: Equal
              effect: NoSchedule
          containers:
            - name: pytorch
              image: ghcr.io/servicenow/fast-llm:latest
              resources:
                # requests == limits on every resource → Guaranteed QoS class.
                limits:
                  nvidia.com/gpu: 8
                  rdma/rdma_shared_device_a: 1
                  memory: "1024Gi"
                  # Was a bare `cpu:` (parses as null — an invalid quantity);
                  # mirror the request so the pod stays Guaranteed QoS.
                  cpu: 128
                requests:
                  nvidia.com/gpu: 8
                  rdma/rdma_shared_device_a: 1
                  memory: "1024Gi"
                  cpu: 128
              command:
                - /bin/bash
                - -c
                - |
                  torchrun --rdzv_backend=static \
                    --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
                    --node_rank=${RANK} \
                    --nproc_per_node=${PET_NPROC_PER_NODE} \
                    --nnodes=${PET_NNODES} \
                    --max_restarts=0 \
                    --rdzv_conf=timeout=3600 \
                    --no_python \
                    fast-llm train gpt \
                    --config examples/mistral-4-node-benchmark.yaml
              env:
                - name: NCCL_DEBUG
                  value: "INFO"
                # Fixed hash seed for reproducibility across ranks.
                - name: PYTHONHASHSEED
                  value: "0"
              securityContext:
                capabilities:
                  add:
                    # Permits locking memory pages — presumably required by the
                    # RDMA device for pinned buffers; confirm against the driver.
                    - IPC_LOCK
              volumeMounts:
                - mountPath: /home/fast-llm
                  name: fast-llm-home
                - mountPath: /dev/shm
                  name: dshm
          volumes:
            - name: fast-llm-home
              persistentVolumeClaim:
                claimName: pvc-fast-llm-home
            # Large /dev/shm backed by RAM (default 64Mi is too small for NCCL).
            - name: dshm
              emptyDir:
                medium: Memory
                sizeLimit: "1024Gi"
    Worker:
      replicas: 3
      restartPolicy: Never
      template:
        spec:
          # Allow scheduling onto GPU-tainted nodes.
          tolerations:
            - key: nvidia.com/gpu
              value: "true"
              operator: Equal
              effect: NoSchedule
          containers:
            - name: pytorch
              image: ghcr.io/servicenow/fast-llm:latest
              resources:
                # requests == limits on every resource → Guaranteed QoS class.
                limits:
                  nvidia.com/gpu: 8
                  rdma/rdma_shared_device_a: 1
                  memory: "1024Gi"
                  # Was a bare `cpu:` (parses as null — an invalid quantity);
                  # mirror the request so the pod stays Guaranteed QoS.
                  cpu: 128
                requests:
                  nvidia.com/gpu: 8
                  rdma/rdma_shared_device_a: 1
                  memory: "1024Gi"
                  cpu: 128
              command:
                - /bin/bash
                - -c
                - |
                  torchrun --rdzv_backend=static \
                    --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
                    --node_rank=${RANK} \
                    --nproc_per_node=${PET_NPROC_PER_NODE} \
                    --nnodes=${PET_NNODES} \
                    --max_restarts=0 \
                    --rdzv_conf=timeout=3600 \
                    --no_python \
                    fast-llm train gpt \
                    --config examples/mistral-4-node-benchmark.yaml
              env:
                - name: NCCL_DEBUG
                  value: "INFO"
                # Fixed hash seed for reproducibility across ranks.
                - name: PYTHONHASHSEED
                  value: "0"
              securityContext:
                capabilities:
                  add:
                    # Permits locking memory pages — presumably required by the
                    # RDMA device for pinned buffers; confirm against the driver.
                    - IPC_LOCK
              volumeMounts:
                - mountPath: /home/fast-llm
                  name: fast-llm-home
                - mountPath: /dev/shm
                  name: dshm
          volumes:
            - name: fast-llm-home
              persistentVolumeClaim:
                claimName: pvc-fast-llm-home
            # Large /dev/shm backed by RAM (default 64Mi is too small for NCCL).
            - name: dshm
              emptyDir:
                medium: Memory
                sizeLimit: "1024Gi"