Skip to content

Commit

Permalink
CI: Get the Slurm tests passing in CI again (#229)
Browse files Browse the repository at this point in the history
  • Loading branch information
DilumAluthge authored Jan 2, 2025
1 parent 59ee855 commit 2a2d8c6
Show file tree
Hide file tree
Showing 11 changed files with 324 additions and 122 deletions.
56 changes: 0 additions & 56 deletions .github/workflows/UnitTests.yml

This file was deleted.

100 changes: 100 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
name: CI
on:
  pull_request:
  push:
    branches:
      - master
concurrency:
  # The group name ends in `true` for every ref other than `master`, so runs on
  # the same non-master ref share one group. On `master` it ends in the run
  # number, which gives every run its own group.
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref != 'refs/heads/master' || github.run_number }}
  # In-progress runs are cancelled only for pull request refs.
  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
permissions:
  contents: read
jobs:
  # Single summary job: it fails unless every job listed in `needs` succeeded.
  finalize:
    timeout-minutes: 10
    needs: [unit-tests, test-slurm]
    # Keep this as `if: always()`. Without it, this job would not run when one
    # of the test jobs fails, and the failure check below would never execute.
    if: always()
    runs-on: ubuntu-latest
    steps:
      - run: |
          echo unit-tests: ${{ needs.unit-tests.result }}
          echo test-slurm: ${{ needs.test-slurm.result }}
      # When adding a job to the condition below: every line except the last
      # must end with `||`, and the last line must not.
      - run: exit 1
        if: |
          (needs.unit-tests.result != 'success') ||
          (needs.test-slurm.result != 'success')

  # Runs the package test suite directly on the runner, once per Julia version.
  unit-tests:
    runs-on: ubuntu-latest
    timeout-minutes: 20
    strategy:
      fail-fast: false
      matrix:
        version:
          - '1.2' # minimum Julia version supported in Project.toml
          - '1.6' # previous LTS
          - '1.10' # current LTS
          - '1' # automatically expands to the latest stable 1.x release of Julia
    steps:
      - uses: actions/checkout@v4
        with:
          persist-credentials: false
      - uses: julia-actions/setup-julia@v2
        with:
          version: ${{ matrix.version }}
      - uses: julia-actions/julia-runtest@v1

  # Runs the "slurm" test group inside a Slurm cluster started with Docker Compose.
  test-slurm:
    runs-on: ubuntu-latest
    timeout-minutes: 20
    strategy:
      fail-fast: false
      matrix:
        version:
          # Each entry must be a full Julia version (major.minor.patch): the
          # value is passed to the Docker build as JULIA_VERSION and ends up
          # in a download URL.
          # - '1.2.0' # minimum Julia version supported in Project.toml
          - '1.6.7' # previous LTS
          - '1.10.7' # current LTS
          - '1.11.2' # currently the latest stable release
    steps:
      - uses: actions/checkout@v4
        with:
          persist-credentials: false
      - name: Print Docker version
        run: |
          docker --version
          docker version
      # The steps from here on are adapted from:
      # https://github.com/kleinhenz/SlurmClusterManager.jl
      # Original author: Joseph Kleinhenz
      # License: MIT
      - name: Setup Slurm inside Docker
        run: |
          docker version
          docker compose version
          docker build --build-arg "JULIA_VERSION=${MATRIX_JULIA_VERSION:?}" -t slurm-cluster-julia -f ci/Dockerfile .
          docker compose -f ci/docker-compose.yml up -d
          docker ps
        env:
          # Passed through the environment rather than interpolated into the script.
          MATRIX_JULIA_VERSION: ${{ matrix.version }}
      - name: Print some information for debugging purposes
        run: |
          docker exec -t slurmctld pwd
          docker exec -t slurmctld ls -la
          docker exec -t slurmctld ls -la ClusterManagers
      - name: Instantiate package
        run: docker exec -t slurmctld julia --project=ClusterManagers -e 'import Pkg; @show Base.active_project(); Pkg.instantiate(); Pkg.status()'
      - name: Run tests without a Slurm allocation
        run: docker exec -t slurmctld julia --project=ClusterManagers -e 'import Pkg; Pkg.test(; test_args=["slurm"])'
      - name: Run tests inside salloc
        run: docker exec -t slurmctld salloc -t 00:10:00 -n 2 julia --project=ClusterManagers -e 'import Pkg; Pkg.test(test_args=["slurm"])'
      - name: Run tests inside sbatch
        run: docker exec -t slurmctld ClusterManagers/ci/run_my_sbatch.sh
21 changes: 21 additions & 0 deletions ci/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# This file is taken from:
# https://github.com/kleinhenz/SlurmClusterManager.jl
# Original author: Joseph Kleinhenz
# License: MIT

# The base image is referenced by digest rather than by a mutable tag.
FROM jkleinh/slurm-cluster@sha256:afd20dafc831b0fa781460dc871232579ccf1b54955e434531394c331ce388e4 AS base
# `MAINTAINER` is deprecated; a `maintainer` label carries the same metadata.
LABEL maintainer="Joseph Kleinhenz <jkleinh@umich.edu>"

# Full Julia version (major.minor.patch). It is interpolated into the download
# URL below, and its major.minor prefix selects the directory in that URL.
ARG JULIA_VERSION=1.6.0

# Download and unpack the requested Julia release, then run `julia --version`
# so the build fails if the unpacked binary does not start.
# `curl -f` turns an HTTP error (for example a version that does not exist)
# into a curl error instead of piping the server's error page into tar.
RUN mkdir -p /home/docker/.local/opt/julia \
    && cd /home/docker/.local/opt/julia \
    && folder="$(echo "${JULIA_VERSION}" | cut -d. -f1-2)" \
    && curl -fL "https://julialang-s3.julialang.org/bin/linux/x64/${folder}/julia-${JULIA_VERSION}-linux-x86_64.tar.gz" | tar xz --strip 1 \
    && /home/docker/.local/opt/julia/bin/julia --version

ENV PATH="/home/docker/.local/opt/julia/bin:${PATH}"

# Copy the build context (the repository) into the image as `ClusterManagers`,
# relative to the base image's working directory, owned by the `docker` user.
COPY --chown=docker . ClusterManagers

# Exec (JSON) form, so bash is started directly rather than through `/bin/sh -c`.
# NOTE(review): assumes any entrypoint in the base image forwards its arguments - confirm.
CMD ["/bin/bash", "-l"]
48 changes: 48 additions & 0 deletions ci/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# This file is taken from:
# https://github.com/kleinhenz/SlurmClusterManager.jl
# Original author: Joseph Kleinhenz
# License: MIT

version: "3.3"

# All three services run the same locally built image (`slurm-cluster-julia`)
# and mount the same two named volumes.
services:
  # Started with the `slurmctld` command.
  slurmctld:
    image: slurm-cluster-julia
    container_name: slurmctld
    hostname: slurmctld
    command:
      - slurmctld
    expose:
      - "6817"
    volumes:
      - slurm_jobdir:/home/docker
      - var_log_slurm:/var/log/slurm

  # First of two services started with the `slurmd` command.
  c1:
    image: slurm-cluster-julia
    container_name: c1
    hostname: c1
    command:
      - slurmd
    expose:
      - "6818"
    volumes:
      - slurm_jobdir:/home/docker
      - var_log_slurm:/var/log/slurm
    depends_on:
      - slurmctld

  # Second `slurmd` service; same settings as `c1` apart from its names.
  c2:
    image: slurm-cluster-julia
    container_name: c2
    hostname: c2
    command:
      - slurmd
    expose:
      - "6818"
    volumes:
      - slurm_jobdir:/home/docker
      - var_log_slurm:/var/log/slurm
    depends_on:
      - slurmctld

# Named volumes with default settings.
volumes:
  slurm_jobdir:
  var_log_slurm:
14 changes: 14 additions & 0 deletions ci/my_sbatch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash

# Slurm options:
#SBATCH --ntasks=2
#SBATCH --time=00:10:00

# Important note:
# Keep everything above this line limited to comments and blank lines;
# no command may appear before the #SBATCH lines.

# Batch script submitted by ci/run_my_sbatch.sh: runs the package tests with
# the "slurm" test argument.

# Abort on errors, unset variables and failed pipeline stages; disable
# globbing; echo each command before it runs.
set -o errexit -o nounset -o noglob -o pipefail -o xtrace

julia --project=ClusterManagers -e 'import Pkg; Pkg.test(; test_args=["slurm"])'
14 changes: 14 additions & 0 deletions ci/run_my_sbatch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash

# Submits ci/my_sbatch.sh with `sbatch --wait`, prints the job's captured
# stdout and stderr, and exits with the status that sbatch returned.
#
# Paths are relative: the script expects to be started from the directory
# that contains the `ClusterManagers` checkout.

set -euf -o pipefail

set -x

stdout_file="${HOME:?}/my_stdout.txt"
stderr_file="${HOME:?}/my_stderr.txt"

# Remove output left over from a previous run so stale logs are never printed.
rm -fv "${stdout_file}"
rm -fv "${stderr_file}"

# Record the exit status instead of letting `set -e` abort the script here.
# Otherwise a failing job would end the script before its logs are printed,
# which is exactly when they are needed.
sbatch_status=0
sbatch --wait --output="${stdout_file}" --error="${stderr_file}" ./ClusterManagers/ci/my_sbatch.sh || sbatch_status=$?

# NOTE(review): presumably gives the output files time to become visible - confirm.
sleep 5
cat "${stdout_file}"
cat "${stderr_file}"

# Propagate the job's result so CI still fails when the batch job failed.
exit "${sbatch_status}"
15 changes: 15 additions & 0 deletions test/elastic.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Starts one worker through the manager's connect command and checks that it
# registers with the manager within the time limit.
@testset "ElasticManager" begin
    # Longest time, in seconds, the worker is given to connect.
    connect_timeout = 10.0

    manager = ElasticManager(addr=:auto, port=0)

    # Start the worker in the background; `wait=false` returns immediately.
    connect_cmd = ClusterManagers.get_connect_cmd(manager)
    run(`sh -c $connect_cmd`, wait=false)

    # Poll until exactly one worker is active, or give up after the timeout.
    @test timedwait(() -> length(manager.active) == 1, connect_timeout) == :ok

    # Remove all workers and block until the removal has finished.
    wait(rmprocs(workers()))
end
10 changes: 10 additions & 0 deletions test/lsf.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Adds one worker through LSF, checks that it is usable, then removes it.
@testset "LSFManager" begin
    pids = addprocs_lsf(1, bsub_flags=`-P scicompsoft`)

    # One master process plus the single new worker.
    @test nprocs() == 2
    @test workers() == pids

    # With only one worker, `:any` has to resolve to that worker.
    @test fetch(@spawnat :any myid()) == pids[1]
    @test remotecall_fetch(+, pids[1], 1, 1) == 2

    # After removal only the master process should be left.
    rmprocs(pids)
    @test nprocs() == 1
    @test workers() == [1]
end
Loading

0 comments on commit 2a2d8c6

Please sign in to comment.