forked from NVIDIA/Megatron-LM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjet-tests.yml
111 lines (103 loc) · 2.94 KB
/
jet-tests.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Hidden template job: supplies the stage and the pipeline-trigger rules shared
# by the JET jobs that list it under `extends`.
.jet_common:
  stage: functional_tests
  rules:
    # Merge-request pipelines whose labels match /Run tests/.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
    # Merge-request pipelines whose labels match /Build only/.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Build only/'
    # Scheduled pipelines.
    - if: '$CI_PIPELINE_SOURCE == "schedule"'
    # Any other pipeline: leave the job out entirely.
    - when: never
# Job defaults: request an ID token, exposed to jobs as VAULT_JWT_TOKEN, issued
# for the audience below.
default:
  id_tokens:
    VAULT_JWT_TOKEN:
      aud: 'https://stg.vault.nvidia.com'
# External pipeline templates, taken from the `main` branch of the JET
# templates project.
# NOTE(review): presumably this is where the `.jet-configure` and
# `.jet-trigger` templates extended by the jobs in this file come from - confirm.
include:
  - project: 'dl/jet/gitlab-templates'
    ref: 'main'
    file: 'downstreams.yml'
# Resolves the workload filter and publishes it as `_JET_FILTER` through a
# dotenv report.
jet-setup:
  extends:
    - .jet_common
  tags:
    - os/linux
  script:
    # Trace every command in the job log.
    - set -x
    # Use the literal string "False" when JET_CUSTOM_FILTER is unset or empty.
    - JET_FILTER=${JET_CUSTOM_FILTER:-False}
    # Append the assignment to config.env (and echo it to the log via tee).
    - echo "_JET_FILTER=$JET_FILTER" | tee -a config.env
  artifacts:
    reports:
      # Variables in this file become available to jobs that depend on this one.
      dotenv: config.env
  interruptible: true
  retry:
    max: 2
# Rewrites the JET build recipe in place so that the "mcore-pyt" and
# "mcore-nemo" specs point at the images tagged with this pipeline's ID, then
# hands the recipe directory to later jobs as an artifact.
jet-configure:
  extends:
    - .jet_common
    - .jet-configure
  image:
    name: 'mikefarah/yq:4.35.2'
    # Empty entrypoint so the runner can start its own shell in the yq image.
    entrypoint: [""]
  tags:
    - os/linux
  script:
    # Both yq calls edit the same file; each one only touches the document
    # whose .spec.name matches its select().
    - |
      IMAGE=${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} yq '. |=
        (
          select(.spec.name == "mcore-pyt")
          | .spec.source.image = env(IMAGE)
        )
      ' -i tests/functional_tests/jet_recipes/build-pyt.yaml
      IMAGE=${CI_NEMO_IMAGE}:${CI_PIPELINE_ID} yq '. |=
        (
          select(.spec.name == "mcore-nemo")
          | .spec.source.image = env(IMAGE)
        )
      ' -i tests/functional_tests/jet_recipes/build-pyt.yaml
  artifacts:
    paths:
      - tests/functional_tests/jet_recipes
  interruptible: true
  retry:
    max: 2
# Launches the downstream JET CI pipeline and waits for it to finish.
jet-trigger:
  stage: functional_tests
  extends:
    - .jet_common
    - .jet-trigger
  needs:
    - metadata
    - jet-configure
    - jet-setup
  trigger:
    project: dl/jet/ci
    branch: $JET_CI_BRANCH
    # `depend`: this job's status follows the downstream pipeline's status.
    strategy: depend
  inherit:
    # Only these global variables are inherited by this job.
    variables:
      - JET_CUSTOM_FILTER
      - SLURM_CLUSTER
      - JET_CI_BRANCH
  variables:
    # _JET_FILTER is written to the dotenv report by jet-setup.
    JET_WORKLOADS_FILTER: '$_JET_FILTER'
    # NOTE(review): nesting below is launchers -> <cluster> -> additional_flags
    # -> deadline; confirm against the JET launcher config schema.
    JET_CUSTOM_CONFIG: |
      launchers:
        ${SLURM_CLUSTER}:
          additional_flags:
            deadline: now+24hours
  interruptible: true
# Runs after jet-trigger: logs in to JET secrets with the Vault ID token,
# runs restart_jet_log_jobs.sh for this pipeline, then runs
# jet_test_pipeline.py and exits with that script's status. The `scripts`
# directory is always kept as an artifact.
jet-results-summary:
  stage: functional_tests
  image: gitlab-master.nvidia.com:5005/dl/jet/api:latest
  tags:
    - os/linux
  needs:
    - jet-trigger
  before_script:
    - jet secrets jwt-login jwt/nvidia/gitlab-master adlr-megatron-lm-ci $VAULT_JWT_TOKEN
  script:
    # Do not dump the full environment (`env`) here: this job holds
    # VAULT_JWT_TOKEN and PROJECT_ACCESS_TOKEN, and printing every variable
    # would write them to the job log wherever masking does not apply.
    - RW_API_TOKEN=${PROJECT_ACCESS_TOKEN} ENDPOINT=${PROJECT_ENDPOINT} bash tests/functional_tests/shell_test_utils/restart_jet_log_jobs.sh ${CI_PIPELINE_ID}
    - python -m pip install -U --no-cache-dir prettytable
    # Capture the summary script's exit code and make it the job's exit code.
    - rc=0
    - python tests/functional_tests/python_test_utils/jet_test_pipeline.py ${CI_PIPELINE_ID} --artifact_links $CI_JOB_ID --download_scripts_dir ./scripts || rc=$?
    - exit $rc
  rules:
    # `when: always` runs the summary even when an earlier job has failed.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      when: always
    # Non-merge-request pipelines that were given a custom filter.
    - if: $JET_CUSTOM_FILTER != "" && $CI_PIPELINE_SOURCE != 'merge_request_event'
      when: always
    - when: never
  artifacts:
    # Upload the downloaded scripts whether the job passes or fails.
    when: always
    paths:
      - scripts
  interruptible: true