-
Notifications
You must be signed in to change notification settings - Fork 583
157 lines (154 loc) · 5.03 KB
/
integration-aws-nvidia-oss-cron.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2024-09-09T13:58:35Z by kres 8be5fa7.
name: integration-aws-nvidia-oss-cron

# `github.head_ref` is only populated for pull-request events, so a scheduled
# run falls back to its own `run_id` as the concurrency group.
concurrency:
  group: ${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

# The key is quoted so generic YAML 1.1 parsers do not read `on` as a boolean.
"on":
  schedule:
    # Minute 30, hour 7, every day.
    - cron: '30 7 * * *'

jobs:
  default:
    runs-on:
      - self-hosted
      - generic
    steps:
      # --- Runner diagnostics (best effort; never fails the job) -------------
      - name: gather-system-info
        id: system-info
        uses: kenchan0130/actions-system-info@v1.3.0
        continue-on-error: true
      - name: print-system-info
        run: |
          # gather-system-info may have failed (it is continue-on-error), leaving
          # totalmem empty; default to 0 so the arithmetic below stays valid.
          TOTAL_MEM="${{ steps.system-info.outputs.totalmem }}"
          MEMORY_GB=$((${TOTAL_MEM:-0}/1024/1024/1024))
          OUTPUTS=(
            "CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
            "CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
            "Hostname: ${{ steps.system-info.outputs.hostname }}"
            "NodeName: ${NODE_NAME}"
            "Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
            "Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
            "Name: ${{ steps.system-info.outputs.name }}"
            "Platform: ${{ steps.system-info.outputs.platform }}"
            "Release: ${{ steps.system-info.outputs.release }}"
            "Total memory: ${MEMORY_GB} GB"
          )
          for OUTPUT in "${OUTPUTS[@]}";do
            echo "${OUTPUT}"
          done
        continue-on-error: true

      # --- Source checkout and build tooling ---------------------------------
      - name: checkout
        uses: actions/checkout@v4
      - name: Unshallow
        run: |
          git fetch --prune --unshallow
      - name: Set up Docker Buildx
        id: setup-buildx
        uses: docker/setup-buildx-action@v3
        with:
          driver: remote
          endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
        timeout-minutes: 10

      # --- Secrets -----------------------------------------------------------
      # Emit one `::add-mask::<value>` workflow command per entry under
      # `.secrets`. The pipeline runs directly (not inside `echo "$(...)"`) and
      # with pipefail, so a sops or yq failure fails the step instead of
      # silently leaving the values unmasked.
      - name: Mask secrets
        run: |
          set -o pipefail
          sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value'
      # Append each entry under `.secrets` to $GITHUB_ENV as KEY=value so later
      # steps see it as an environment variable.
      - name: Set secrets for job
        run: |
          set -o pipefail
          sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"

      # --- Pre-built artifact path -------------------------------------------
      # These three steps are guarded by `event_name != 'schedule'`; the only
      # trigger declared in this workflow is `schedule`, so they are skipped
      # here.
      - name: Download artifacts
        if: github.event_name != 'schedule'
        uses: actions/download-artifact@v4
        with:
          name: talos-artifacts
          path: _out
      - name: Fix artifact permissions
        if: github.event_name != 'schedule'
        run: |
          xargs -a _out/executable-artifacts -I {} chmod +x {}
      - name: ci-temp-release-tag
        if: github.event_name != 'schedule'
        run: |
          make ci-temp-release-tag

      # --- Build-from-source path (scheduled runs) ---------------------------
      - name: generate
        if: github.event_name == 'schedule'
        run: |
          make generate
      - name: uki-certs
        if: github.event_name == 'schedule'
        env:
          PLATFORM: linux/amd64
        run: |
          make uki-certs
      - name: build
        if: github.event_name == 'schedule'
        env:
          IMAGE_REGISTRY: registry.dev.siderolabs.io
          PLATFORM: linux/amd64
          PUSH: "true"
        run: |
          make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer imager talos _out/integration-test-linux-amd64
      - name: talosctl-cni-bundle
        if: github.event_name == 'schedule'
        run: |
          make talosctl-cni-bundle
      - name: images-essential
        if: github.event_name == 'schedule'
        env:
          IMAGE_REGISTRY: registry.dev.siderolabs.io
          PLATFORM: linux/amd64
        run: |
          make images-essential

      # --- System extensions (NVIDIA OSS modules, container toolkit, zfs) ----
      - name: checkout extensions
        uses: actions/checkout@v4
        with:
          path: _out/extensions
          ref: main
          repository: siderolabs/extensions
      # Export the contents of _out/talos-metadata to subsequent steps via
      # $GITHUB_ENV.
      - name: set variables
        run: |
          cat _out/talos-metadata >> "$GITHUB_ENV"
      - name: build extensions
        env:
          PLATFORM: linux/amd64
          PUSH: "true"
          REGISTRY: registry.dev.siderolabs.io
        run: |
          make nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata -C _out/extensions

      # --- AWS end-to-end test -----------------------------------------------
      - name: e2e-aws-prepare
        env:
          E2E_AWS_TARGET: nvidia-oss
          EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
          IMAGE_REGISTRY: registry.dev.siderolabs.io
        run: |
          make e2e-aws-prepare
      - name: checkout contrib
        uses: actions/checkout@v4
        with:
          path: _out/contrib
          ref: main
          repository: siderolabs/contrib
      - name: setup tf
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_wrapper: "false"
      - name: tf apply
        env:
          TF_E2E_ACTION: apply
          TF_E2E_TEST_TYPE: aws
          TF_SCRIPT_DIR: _out/contrib
        run: |
          make e2e-cloud-tf
      - name: e2e-aws-nvidia-oss
        env:
          EXTRA_TEST_ARGS: -talos.extensions.nvidia
          TEST_NUM_NODES: "4"
        run: |
          make e2e-aws
      # `always()` makes the destroy action run even when an earlier step
      # failed, so whatever `tf apply` created is not left behind.
      - name: tf destroy
        if: always()
        env:
          TF_E2E_ACTION: destroy
          TF_E2E_TEST_TYPE: aws
          TF_SCRIPT_DIR: _out/contrib
        run: |
          make e2e-cloud-tf