diff --git a/.github/workflows/test-gpu.yaml b/.github/workflows/test-gpu.yaml
index cff19fceb..20661a143 100644
--- a/.github/workflows/test-gpu.yaml
+++ b/.github/workflows/test-gpu.yaml
@@ -13,11 +13,27 @@ jobs:
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
-      - name: Show errors inline
-        uses: r7kamura/rust-problem-matchers@v1
+
+      # Restore the CUDA toolkit directory from a previous run so the slow
+      # install step below can be skipped on an exact key match.
+      # NOTE(review): the install step also apt-installs libnccl2/libnccl-dev;
+      # if those files land outside the cached path, a cache hit will skip
+      # them. Confirm the cached path covers everything the tests need.
+      - name: Cache CUDA and NCCL
+        uses: actions/cache@v4
+        id: cache-cuda-nccl
+        with:
+          path: /usr/local/cuda-12.1
+          key: cuda-nccl-${{ runner.os }}-12.1
+          restore-keys: |
+            cuda-nccl-${{ runner.os }}-
+
       - name: Install apt dependencies
         run: sudo apt-get update && sudo apt-get install -y pkg-config libssl-dev
+
       - name: Install CUDA and NCCL dependencies
+        # Only reinstall when the cache above did not restore an exact match.
+        if: steps.cache-cuda-nccl.outputs.cache-hit != 'true'
         env:
           DEBIAN_FRONTEND: noninteractive
         run: |
@@ -32,13 +48,37 @@ jobs:
           sudo apt update
           sudo apt install -y libnccl2 libnccl-dev
         shell: bash
+
+      # Reuse downloaded crates and compiled artifacts between runs; the key
+      # changes whenever any Cargo.lock in the repository changes.
+      - name: Cache Rust build
+        uses: actions/cache@v4
+        id: cache-rust
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: rust-build-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            rust-build-${{ runner.os }}-
+
       - name: Install Rust nightly
         uses: dtolnay/rust-toolchain@master
         with:
           toolchain: nightly
+
       - name: E2E Tests
-        run: CUDA_HOME=/usr/local/cuda-12.1 cargo test --release e2e
+        run: |
+          export CUDA_HOME=/usr/local/cuda-12.1
+          # Append the previous value only when it is set and non-empty, so an
+          # unset LD_LIBRARY_PATH does not leave a trailing ':' (an empty entry
+          # that the dynamic loader treats as the current directory).
+          export LD_LIBRARY_PATH="$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+          # Print the toolkit version so the log shows which nvcc is in use.
+          "$CUDA_HOME/bin/nvcc" --version
+          cargo test --release e2e
         shell: bash
         env:
           NCCL_P2P_DIRECT_DISABLE: 1
-          NCCL_NET: Socket
\ No newline at end of file
+          NCCL_NET: Socket