diff --git a/.github/workflows/tritonbench-nightly.yml b/.github/workflows/tritonbench-nightly.yml new file mode 100644 index 0000000000..90b0246b5f --- /dev/null +++ b/.github/workflows/tritonbench-nightly.yml @@ -0,0 +1,65 @@ +name: TritonBench nightly (A100) +on: + workflow_dispatch: + schedule: + - cron: '00 18 * * *' # run at 6:00 PM UTC, K8s containers will roll out at 12PM EST + +jobs: + run-benchmark: + environment: docker-s3-upload + env: + BASE_CONDA_ENV: "torchbench" + CONDA_ENV: "tritonbench-nightly" + PLATFORM_NAME: "gcp_a100" + SETUP_SCRIPT: "/workspace/setup_instance.sh" + TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + IS_GHA: 1 + BUILD_ENVIRONMENT: benchmark-nightly + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: [a100-runner] + steps: + - name: Checkout TorchBench + uses: actions/checkout@v3 + with: + path: benchmark + - name: Tune Nvidia GPU + run: | + sudo nvidia-smi -pm 1 + sudo nvidia-smi -ac 1215,1410 + nvidia-smi + - name: Clone and setup conda env + run: | + CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}" + conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}" + - name: Run the triton userbenchmark ci + run: | + . "${SETUP_SCRIPT}" + # remove old results if any + if [ -d benchmark-output ]; then rm -Rf benchmark-output; fi + pushd benchmark + if [ -d .userbenchmark ]; then rm -Rf .userbenchmark; fi + python run_benchmark.py triton --ci + cp -r ./.userbenchmark/triton ../benchmark-output + - name: Copy artifact and upload to scribe and Amazon S3 + run: | + . "${SETUP_SCRIPT}" + pushd benchmark + LATEST_RESULT=$(find ../benchmark-output/ -name "metrics-*.json" | sort -r | head -1) + echo "Benchmark result file: ${LATEST_RESULT}" + # Upload the result json to Scribe + python ./scripts/userbenchmark/upload_scribe.py --userbenchmark_json "${LATEST_RESULT}" --userbenchmark_platform "${PLATFORM_NAME}" + # Upload the result json to Amazon S3 + python ./scripts/userbenchmark/upload_s3.py --upload-file "${LATEST_RESULT}" --userbenchmark_platform "${PLATFORM_NAME}" + - name: Upload result to GH Actions Artifact + uses: actions/upload-artifact@v3 + with: + name: TorchBench V3 result + path: benchmark-output/ + - name: Clean up Conda env + if: always() + run: | + . "${SETUP_SCRIPT}" + conda deactivate && conda deactivate + conda remove -n "${CONDA_ENV}" --all