Skip to content

moving tests from mirror to regular #67

moving tests from mirror to regular

moving tests from mirror to regular #67

Workflow file for this run

name: Training Bootc image builds
on:
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}
cancel-in-progress: false
env:
REGISTRY: quay.io
REGISTRY_ORG: ai-lab
REGION: us-east-1
jobs:
start-runner:
name: Start self-hosted EC2 runner
if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
runs-on: ubuntu-latest
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.REGION }}
- name: Start EC2 runner
id: start-ec2-runner
uses: machulav/ec2-github-runner@v2
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-04cec38d48a5be576
ec2-instance-type: m7i.8xlarge
subnet-id: subnet-0b1e1d94240813658
security-group-id: sg-055105753f5e8bd83
nvidia-bootc-builder-image:
if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
strategy:
matrix:
include:
- image_name: nvidia-builder
context: training/nvidia-bootc
arch: amd64
runs-on: ${{ needs.start-runner.outputs.label }}
needs: start-runner
permissions:
contents: read
packages: write
steps:
- uses: actions/checkout@v4.1.7
- name: mkdir root/.docker directory
run: |
mkdir -p ~/.docker
- name: Login to Container Registry
run: podman login -u ${{ secrets.REGISTRY_USER }} -p ${{ secrets.REGISTRY_PASSWORD }} ${{ env.REGISTRY }}
- name: generate a ssh key - USER SHOULD INJECT THEIR OWN AND REBUILD IF THEY USE THIS IMAGE
run: |
ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N ""
- name: Build Image
id: build_image
run: make driver-toolkit ARCH=${{ matrix.arch }}
working-directory: ${{ matrix.context }}
- name: Push image
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/push-to-registry@v2.8
with:
image: ${{ steps.build_image.outputs.image }}
tags: ${{ steps.build_image.outputs.tags }}
registry: ${{ env.REGISTRY }}
- name: Publish Job Results to Slack
id: slack
if: always()
uses: slackapi/slack-github-action@v1.26.0
with:
payload: |
{
"text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
nvidia-bootc-image:
strategy:
matrix:
include:
- image_name: nvidia-bootc
driver_version: "550.54.15"
context: training/nvidia-bootc
arch: amd64
runs-on: ${{ needs.start-runner.outputs.label }}
if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
needs: nvidia-bootc-builder-image
steps:
- uses: actions/checkout@v4.1.7
- name: mkdir root/.docker directory
run: |
mkdir -p ~/.docker
- name: Login to Container Registry
run: podman login -u ${{ secrets.REGISTRY_USER }} -p ${{ secrets.REGISTRY_PASSWORD }} ${{ env.REGISTRY }}
- name: Build Image
id: build_image
run: make bootc DRIVER_VERSION=${{ matrix.driver_version }} ARCH=${{ matrix.arch }}
working-directory: ${{ matrix.context }}
- name: Push image
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/push-to-registry@v2.8
with:
image: ${{ steps.build_image.outputs.image }}
tags: ${{ steps.build_image.outputs.tags }}
registry: ${{ env.REGISTRY }}
- name: Publish Job Results to Slack
id: slack
if: always()
uses: slackapi/slack-github-action@v1.26.0
with:
payload: |
{
"text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
bootc-images:
strategy:
matrix:
include:
- image_name: intel-bootc
context: training/intel-bootc
arch: amd64
gpu: intel
pull-images: quay.io/ai-lab/vllm:latest quay.io/ai-lab/deepspeed-trainer:latest
- image_name: amd-bootc
context: training/amd-bootc
arch: amd64
gpu: amd
pull-images: quay.io/ai-lab/vllm:latest
if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
runs-on: ${{ needs.start-runner.outputs.label }}
needs: start-runner
continue-on-error: true
steps:
- uses: actions/checkout@v4.1.7
- name: mkdir root/.docker directory
run: |
mkdir -p ~/.docker
- name: Login to Container Registry
run: podman login -u ${{ secrets.REGISTRY_USER }} -p ${{ secrets.REGISTRY_PASSWORD }} ${{ env.REGISTRY }}
- name: pull images
id: pull_image
working-directory: ${{ matrix.context }}
run: podman pull ${{ matrix.pull-images }}
- name: generate the local OCI assets
run: |
cd training
make -j vllm
make -j deepspeed
make -j instruct-${{ matrix.gpu}}
- name: Build Image
id: build_image
run: make bootc ARCH=${{ matrix.arch }}
working-directory: ${{ matrix.context }}
- name: Push image
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/push-to-registry@v2.8
with:
image: ${{ steps.build_image.outputs.image }}
tags: ${{ steps.build_image.outputs.tags }}
registry: ${{ env.REGISTRY }}
- name: Publish Job Results to Slack
id: slack
if: always()
uses: slackapi/slack-github-action@v1.26.0
with:
payload: |
{
"text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner # required to get output from the start-runner job
- bootc-images # required to wait when the main job is done
runs-on: ubuntu-latest
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
- name: Stop EC2 runner
uses: machulav/ec2-github-runner@v2
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}