# Skip to content

# Training Bootc image builds #64

# Training Bootc image builds

# Training Bootc image builds #64

# Workflow header: display name, triggers, run serialization and the
# registry settings shared by the jobs defined below.
name: Training Bootc image builds

on:
  # Runs once a day at 12:00 (GitHub evaluates cron schedules in UTC).
  # Fields: minute hour day-of-month month day-of-week.
  schedule:
    - cron: "0 12 * * *"
  # The pull_request and push triggers are disabled for now:
  # pull_request:
  #   branches:
  #     - main
  #   paths:
  #     - .github/workflows/training_bootc.yaml
  #     - ./training/**
  # push:
  #   branches:
  #     - main
  #   paths:
  #     - .github/workflows/training_bootc.yaml
  #     - ./training/**
  # Manual runs from the Actions UI / API.
  workflow_dispatch:

# One concurrency group for the whole workflow; a newer run never cancels
# the run that is already in progress.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

# Container registry host and organization.
env:
  REGISTRY: quay.io
  REGISTRY_ORG: ai-lab

jobs:
# Compiles podman from the containers/podman sources and uploads ./bin as
# the `podman-bins` artifact that the image-building jobs download.
# NOTE(review): the ubuntu-20.04 hosted-runner label and the v3 artifact
# actions have been deprecated by GitHub - confirm and migrate. The
# artifact actions must be upgraded together with every job that downloads
# `podman-bins`, since artifact major versions are not interchangeable.
build-podman-v5:
  if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests') && github.repository == 'containers-mirror/ai-lab-recipes'"
  env:
    CGO_ENABLED: 1 # CGO is required for podman
    # Tag of containers/podman to build. It is also part of the binary
    # cache key, so bumping it invalidates the cached ./bin.
    PODMAN_VER: v5.1.1
    # Go toolchain for the build; quoted so YAML keeps it a string.
    # NOTE(review): confirm against the `go` directive in podman's go.mod.
    GOVER: "1.22"
  runs-on: ubuntu-20.04
  steps:
    - name: Cache podman bin
      id: cache-podman-bin
      uses: actions/cache@v3
      with:
        path: |
          ./bin
        key: ${{ runner.os }}-podman-${{ env.PODMAN_VER }}
        restore-keys: |
          ${{ runner.os }}-podman
    # Every build step below is skipped when the cache key matched exactly.
    - uses: actions/checkout@v3
      if: steps.cache-podman-bin.outputs.cache-hit != 'true'
      with:
        repository: containers/podman
        ref: ${{ env.PODMAN_VER }}
    - uses: actions/setup-go@v2
      if: steps.cache-podman-bin.outputs.cache-hit != 'true'
      with:
        go-version: ${{ env.GOVER }}
    - name: Cache go modules
      if: steps.cache-podman-bin.outputs.cache-hit != 'true'
      uses: actions/cache@v3
      with:
        # In order:
        # * Module download cache
        # * Build cache (Linux)
        path: |
          ~/go/pkg/mod
          ~/.cache/go-build
        key: ${{ runner.os }}-go-podman-${{ hashFiles('**/go.sum') }}
        restore-keys: |
          ${{ runner.os }}-go-podman
    - name: Add build packages
      if: steps.cache-podman-bin.outputs.cache-hit != 'true'
      # Refresh the package index first so the install does not fail on
      # stale package lists.
      run: |
        sudo apt-get update
        sudo apt-get install -y libsystemd-dev libseccomp-dev pkg-config golang-github-proglottis-gpgme-dev
    - name: Build podman v5
      if: steps.cache-podman-bin.outputs.cache-hit != 'true'
      run: make binaries
    # store podman binary as artifact
    - uses: actions/upload-artifact@v3
      with:
        name: podman-bins
        path: bin
# Builds the NVIDIA driver-toolkit (builder) image with the podman binary
# produced by build-podman-v5, then reports the job status to Slack.
nvidia-bootc-builder-image:
  if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests') && github.repository == 'containers-mirror/ai-lab-recipes'"
  strategy:
    matrix:
      include:
        - image_name: nvidia-builder
          context: training/nvidia-bootc
          arch: amd64
  runs-on: ubuntu-24.04
  needs: build-podman-v5
  permissions:
    contents: read
    packages: write
  steps:
    # Free disk space on the runner before building.
    - name: Remove unnecessary files
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
    - uses: actions/checkout@v4.1.7
    - name: Install qemu dependency
      run: |
        sudo apt-get update
        sudo apt-get install -y qemu-user-static netavark containernetworking-plugins
    # Same major version as the upload-artifact@v3 step that produces
    # `podman-bins`.
    - name: pull in podman
      uses: actions/download-artifact@v3
      with:
        name: podman-bins
        path: bin
    # Swap the runner's podman for the freshly built binary.
    - name: replace
      run: |
        chmod +x bin/podman
        sudo mv bin/podman /usr/bin/podman
    - name: generate a ssh key - USER SHOULD INJECT THEIR OWN AND REBUILD IF THEY USE THIS IMAGE
      # ssh-keygen does not create the target directory when -f is given,
      # so make sure ~/.ssh exists first.
      run: |
        mkdir -p ~/.ssh
        chmod 700 ~/.ssh
        ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N ""
    - name: Build Image
      id: build_image
      run: make driver-toolkit ARCH=${{ matrix.arch }}
      working-directory: ${{ matrix.context }}
    # NOTE(review): the only enabled triggers of this workflow are `schedule`
    # and `workflow_dispatch`, so github.event_name is never 'push' and the
    # login/push steps below are always skipped - confirm whether scheduled
    # builds are meant to publish.
    - name: Login to Container Registry
      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
      uses: redhat-actions/podman-login@v1.7
      with:
        registry: ${{ env.REGISTRY }}
        username: ${{ secrets.REGISTRY_USER }}
        password: ${{ secrets.REGISTRY_PASSWORD }}
    # NOTE(review): build_image is a plain `run` step; unless the Makefile
    # writes to $GITHUB_OUTPUT, outputs.image / outputs.tags are empty -
    # verify before enabling the push.
    - name: Push image
      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
      uses: redhat-actions/push-to-registry@v2.8
      with:
        image: ${{ steps.build_image.outputs.image }}
        tags: ${{ steps.build_image.outputs.tags }}
        registry: ${{ env.REGISTRY }}
    # Runs even when an earlier step failed so failures are reported too.
    - name: Publish Job Results to Slack
      id: slack
      if: always()
      uses: slackapi/slack-github-action@v1.26.0
      with:
        payload: |
          {
            "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          }
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
# Builds the NVIDIA bootc image (`make bootc`) for the driver version in
# the matrix, using the podman binary from the `podman-bins` artifact, then
# reports the job status to Slack.
nvidia-bootc-image:
  # build-podman-v5 is listed explicitly because this job downloads its
  # artifact; it was previously only an indirect dependency through
  # nvidia-bootc-builder-image.
  needs:
    - build-podman-v5
    - nvidia-bootc-builder-image
  if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests') && github.repository == 'containers-mirror/ai-lab-recipes'"
  strategy:
    matrix:
      include:
        - image_name: nvidia-bootc
          driver_version: "550.54.15"
          context: training/nvidia-bootc
          arch: amd64
  runs-on: ubuntu-22.04-8-cores
  steps:
    # Free disk space on the runner before building.
    - name: Remove unnecessary files
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
    - uses: actions/checkout@v4.1.7
    # Same major version as the upload-artifact@v3 step that produces
    # `podman-bins`.
    - name: pull in podman
      uses: actions/download-artifact@v3
      with:
        name: podman-bins
        path: bin
    # Swap the runner's podman for the freshly built binary.
    - name: replace
      run: |
        chmod +x bin/podman
        sudo mv bin/podman /usr/bin/podman
    - name: install packages
      # Refresh the package index first so the install does not fail on
      # stale package lists.
      run: |
        sudo apt-get update
        sudo apt-get install -y netavark containernetworking-plugins
    - name: Build Image
      id: build_image
      run: make bootc DRIVER_VERSION=${{ matrix.driver_version }} ARCH=${{ matrix.arch }}
      working-directory: ${{ matrix.context }}
    # NOTE(review): the only enabled triggers of this workflow are `schedule`
    # and `workflow_dispatch`, so github.event_name is never 'push' and the
    # login/push steps below are always skipped - confirm whether scheduled
    # builds are meant to publish.
    - name: Login to Container Registry
      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
      uses: redhat-actions/podman-login@v1.7
      with:
        registry: ${{ env.REGISTRY }}
        username: ${{ secrets.REGISTRY_USER }}
        password: ${{ secrets.REGISTRY_PASSWORD }}
    # NOTE(review): build_image is a plain `run` step; unless the Makefile
    # writes to $GITHUB_OUTPUT, outputs.image / outputs.tags are empty -
    # verify before enabling the push.
    - name: Push image
      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
      uses: redhat-actions/push-to-registry@v2.8
      with:
        image: ${{ steps.build_image.outputs.image }}
        tags: ${{ steps.build_image.outputs.tags }}
        registry: ${{ env.REGISTRY }}
    # Runs even when an earlier step failed so failures are reported too.
    - name: Publish Job Results to Slack
      id: slack
      if: always()
      uses: slackapi/slack-github-action@v1.26.0
      with:
        payload: |
          {
            "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          }
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
# Builds the Intel and AMD bootc images (one matrix entry each): pulls the
# prerequisite images, generates the local OCI assets under training/, runs
# `make bootc`, then reports the job status to Slack.
bootc-images:
  if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests') && github.repository == 'containers-mirror/ai-lab-recipes'"
  strategy:
    matrix:
      include:
        - image_name: intel-bootc
          context: training/intel-bootc
          arch: amd64
          gpu: intel
          pull-images: quay.io/ai-lab/vllm:latest quay.io/ai-lab/deepspeed-trainer:latest
        - image_name: amd-bootc
          context: training/amd-bootc
          arch: amd64
          gpu: amd
          pull-images: quay.io/ai-lab/vllm:latest
  runs-on: ubuntu-22.04-8-cores
  needs: build-podman-v5
  # A failure in this job does not fail the workflow run.
  continue-on-error: true
  steps:
    # Free disk space on the runner before building.
    - name: Remove unnecessary files
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
    - uses: actions/checkout@v4.1.7
    # Same major version as the upload-artifact@v3 step that produces
    # `podman-bins`.
    - name: pull in podman
      uses: actions/download-artifact@v3
      with:
        name: podman-bins
        path: bin
    # Swap the runner's podman for the freshly built binary.
    - name: replace
      run: |
        chmod +x bin/podman
        sudo mv bin/podman /usr/bin/podman
    - name: install packages
      # Refresh the package index first so the install does not fail on
      # stale package lists.
      run: |
        sudo apt-get update
        sudo apt-get install -y netavark containernetworking-plugins
    # Unconditional here (unlike the NVIDIA jobs): the login happens before
    # the `podman pull` of the prerequisite images below.
    - name: Login to Container Registry
      uses: redhat-actions/podman-login@v1.7
      with:
        registry: ${{ env.REGISTRY }}
        username: ${{ secrets.REGISTRY_USER }}
        password: ${{ secrets.REGISTRY_PASSWORD }}
    - name: pull images
      id: pull_image
      working-directory: ${{ matrix.context }}
      run: podman pull ${{ matrix.pull-images }}
    - name: generate the local OCI assets
      run: |
        cd training
        make -j vllm
        make -j deepspeed
        make -j instruct-${{ matrix.gpu }}
    - name: Build Image
      id: build_image
      run: make bootc ARCH=${{ matrix.arch }}
      working-directory: ${{ matrix.context }}
    # NOTE(review): the only enabled triggers of this workflow are `schedule`
    # and `workflow_dispatch`, so github.event_name is never 'push' and this
    # step is always skipped. build_image is also a plain `run` step, so
    # outputs.image / outputs.tags are empty unless the Makefile writes to
    # $GITHUB_OUTPUT - verify before enabling the push.
    - name: Push image
      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
      uses: redhat-actions/push-to-registry@v2.8
      with:
        image: ${{ steps.build_image.outputs.image }}
        tags: ${{ steps.build_image.outputs.tags }}
        registry: ${{ env.REGISTRY }}
    # Runs even when an earlier step failed so failures are reported too.
    - name: Publish Job Results to Slack
      id: slack
      if: always()
      uses: slackapi/slack-github-action@v1.26.0
      with:
        payload: |
          {
            "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          }
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}