-
Notifications
You must be signed in to change notification settings - Fork 46
/
kfto-kueue-sft-trainer.yaml
87 lines (87 loc) · 2.89 KB
/
kfto-kueue-sft-trainer.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Example ConfigMap running fine tuning on bloom model
# with twitter complaints dataset on 2 GPUs
#
# The single key `config.json` holds the trainer configuration as a JSON
# document. The PyTorchJob in this file mounts it at /etc/config/config.json.
# The /data/input and /data/output paths used below correspond to the
# `input-data` and `output-data` volume mounts of that job; keep them in sync
# if the mount paths change.
apiVersion: v1
kind: ConfigMap
metadata:
  name: my-config
data:
  # Literal block scalar: the JSON is passed through verbatim, so no YAML
  # comments may be placed inside it.
  config.json: |
    {
      "model_name_or_path": "bigscience/bloom-560m",
      "training_data_path": "/data/input/twitter_complaints.json",
      "output_dir": "/data/output/tuning/bloom-twitter",
      "num_train_epochs": 10.0,
      "per_device_train_batch_size": 4,
      "gradient_accumulation_steps": 4,
      "learning_rate": 1e-05,
      "response_template": "\n### Label:",
      "dataset_text_field": "output",
      "use_flash_attn": false
    }
---
# PyTorchJob that runs the SFT trainer image as a single Master replica.
# The trainer configuration comes from the `my-config` ConfigMap, which is
# mounted at /etc/config and located via SFT_TRAINER_CONFIG_JSON_PATH.
# Placeholders to replace before applying: $SFT_IMAGE and $MY_SECRET.
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: kfto-sft
  # This is using Kueue (https://github.com/kubernetes-sigs/kueue) for queue management
  # To enable, uncomment label below
  # labels:
  #   kueue.x-k8s.io/queue-name: lq-trainer
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      # Do not restart the pod on failure. If you do set it to OnFailure,
      # be sure to also set backoffLimit.
      restartPolicy: Never
      template:
        spec:
          # Hard scheduling requirement: only nodes labelled with this GPU
          # product are eligible.
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: nvidia.com/gpu.product
                        operator: In
                        values:
                          - NVIDIA-A100-SXM4-80GB  # a100 needed for flash-attn
          containers:
            - name: pytorch
              # Be sure to replace the image below
              image: $SFT_IMAGE
              imagePullPolicy: IfNotPresent
              env:
                # Must match the config-volume mountPath plus the item path
                # projected from the ConfigMap (config.json).
                - name: SFT_TRAINER_CONFIG_JSON_PATH
                  value: /etc/config/config.json
              volumeMounts:
                - name: config-volume
                  mountPath: /etc/config
                - name: models
                  mountPath: /data/models
                - name: input-data
                  mountPath: /data/input
                - name: output-data
                  mountPath: /data/output
              resources:
                # Only limits are given; Kubernetes uses the limit as the
                # request when no request is specified.
                limits:
                  # Number of GPUs specified will be number of processes set
                  nvidia.com/gpu: 2
                  memory: 50Gi
                  ephemeral-storage: 100Gi
          # Be sure to replace the pull secret name below
          imagePullSecrets:
            - name: $MY_SECRET
          volumes:
            # PVC mounts will need to be updated depending on your cluster environment
            - name: models
              persistentVolumeClaim:
                claimName: model-pvc
            - name: input-data
              persistentVolumeClaim:
                claimName: cos-input
            - name: output-data
              persistentVolumeClaim:
                claimName: cos-output
            # Projects only the config.json key of the ConfigMap into the volume.
            - name: config-volume
              configMap:
                name: my-config
                items:
                  - key: config.json
                    path: config.json