# lite/challenge-subset #613
# NOTE: This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow display name shown in the Actions tab.
name: Run the benchmark
# Triggers: manual dispatch (with the benchmark parameters below) and
# pull-request activity. On pull_request runs the `inputs` context is empty.
on:
  workflow_dispatch:
    inputs:
      llm:
        description: "LLM model to use"
        type: choice
        required: true
        default: "gpt-4o-mini"
        options:
          - gpt-4o
          - gpt-4o-mini
          - claude3.5
      config:
        description: "Path to the configuration file in ./bin/solve_config"
        required: true
        type: string
        default: lite/marshmallow
      runner:
        description: "Runner type"
        required: true
        default: ubuntu-latest
        type: choice
        options:
          - ubuntu-latest
          - swe-bench-ubuntu-latest
          - SWE-Bench_Larger
      num_runners:
        description: "Number of runners to split the workload across"
        required: true
        # Kept as a string (the default input type); the default is quoted
        # so it is never parsed as a YAML integer.
        type: string
        default: "1"
      name:
        description: "Assign a name to the workflow run"
        type: string
        required: false
  pull_request:
    types: [opened, synchronize, labeled]
# Title of the run in the Actions UI. The first non-empty value wins:
# explicit run name, then config path, then PR title, then the workflow name.
# `github.workflow` is the final fallback because `github.event.workflow` is
# a path string on workflow_dispatch and absent on pull_request, so
# `github.event.workflow.name` is always empty.
run-name: ${{ inputs.name || inputs.config || github.event.pull_request.title || github.workflow }}
# Pipeline: show-inputs (diagnostic) | build-appmap-js + prepare-matrix
# -> solve (one job per matrix index) -> report (merges all shard outputs).
jobs:
  # Prints the workflow_dispatch inputs for debugging. On pull_request runs
  # these values are empty.
  show-inputs:
    runs-on: ubuntu-latest
    steps:
      - name: Display Input Values
        # Inputs are passed through the environment rather than expanded into
        # the script text, so free-form values (config, name) cannot inject
        # shell syntax into the step.
        env:
          LLM: ${{ github.event.inputs.llm }}
          CONFIG: ${{ github.event.inputs.config }}
          SELECTED_RUNNER: ${{ github.event.inputs.runner }}
          NUM_RUNNERS: ${{ github.event.inputs.num_runners }}
          RUN_NAME: ${{ github.event.inputs.name }}
        run: |
          echo "llm: $LLM"
          echo "config: $CONFIG"
          echo "runner: $SELECTED_RUNNER"
          echo "num_runners: $NUM_RUNNERS"
          echo "name: $RUN_NAME"

  # Builds the appmap-js submodule once and stores it in the Actions cache so
  # every solve shard can restore it instead of rebuilding.
  build-appmap-js:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true

      # Cache the appmap-js build
      - name: Cache appmap-js build
        uses: actions/cache@v4
        id: cache-appmap-js
        with:
          # Only check whether the entry exists; this job never needs the
          # cached files itself.
          lookup-only: true
          path: |
            submodules/appmap-js/node_modules
            submodules/appmap-js/packages/*/built
            submodules/appmap-js/packages/*/dist
            submodules/appmap-js/packages/*/node_modules
          # Keyed on the submodule's checked-out HEAD.
          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}

      - name: Set up Node.js
        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
        uses: actions/setup-node@v4

      - name: Build submodules
        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
        env:
          PUPPETEER_SKIP_DOWNLOAD: true
        run: |
          cd submodules/appmap-js
          git checkout -- .
          yarn
          yarn build
          chmod +x packages/cli/built/cli.js

  # Produces the JSON array [0, 1, ..., num_runners-1] used as the solve
  # job's matrix.
  prepare-matrix:
    runs-on: ubuntu-latest
    env:
      NUM_RUNNERS: ${{ inputs.num_runners }}
    outputs:
      matrix: ${{ steps.prepare-matrix.outputs.matrix }}
    steps:
      - name: Prepare matrix
        id: prepare-matrix
        run: |
          # NUM_RUNNERS is empty on pull_request runs; fall back to one runner.
          num_runners="${NUM_RUNNERS:-1}"
          # Reject 0 / non-numeric values up front: they would otherwise
          # yield an empty matrix and a confusing failure in the solve job.
          if ! [[ "$num_runners" =~ ^[1-9][0-9]*$ ]]; then
            echo "::error::num_runners must be a positive integer, got '$num_runners'"
            exit 1
          fi
          echo "Number of runners: $num_runners"
          indices=$(seq 0 $((num_runners - 1)) | jq -R 'tonumber' | jq -s -c '.')
          echo "Matrix: $indices"
          echo "matrix=$indices" >> "$GITHUB_OUTPUT"

  # Runs the solver and the evaluation harness for one shard of the workload
  # and uploads the shard's logs and predictions as an artifact.
  solve:
    needs:
      - build-appmap-js
      - prepare-matrix
    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
    strategy:
      # Let the remaining shards finish when one fails; the report job below
      # is written to run on partial results.
      fail-fast: false
      matrix:
        index: ${{ fromJson(needs['prepare-matrix'].outputs.matrix) }}
    defaults:
      run:
        shell: bash -leo pipefail {0}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true

      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}

      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml

      # Restore the appmap-js build
      - name: Restore appmap-js build
        uses: actions/cache/restore@v4
        id: cache-appmap-js
        with:
          # The build job must have populated this entry; fail loudly if not.
          fail-on-cache-miss: true
          path: |
            submodules/appmap-js/node_modules
            submodules/appmap-js/packages/*/built
            submodules/appmap-js/packages/*/dist
            submodules/appmap-js/packages/*/node_modules
          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}

      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            texlive \
            texlive-xetex \
            dvipng \
            ghostscript \
            libfreetype-dev \
            libtiff-dev \
            libxrender1

      - name: Run benchmark
        env:
          GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          CONFIG: ${{ inputs.config }}
          LLM: ${{ inputs.llm }}
          NUM_RUNNERS: ${{ inputs.num_runners }}
        run: |
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH="$PYTHONPATH:$(pwd)"
          # Inputs are empty on pull_request runs; use the dispatch defaults.
          llm="${LLM:-gpt-4o-mini}"
          config="${CONFIG:-lite/marshmallow}"
          ./bin/solve "$llm" \
            "$config" \
            --temp_dir "${{ runner.temp }}" \
            --path_conda "$(conda info --base)" \
            --num_runners "${NUM_RUNNERS:-1}" \
            --runner_index "${{ matrix.index }}"

      - name: Run evaluation
        env:
          CONFIG: ${{ inputs.config }}
        run: |
          mkdir -p logs
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH="$PYTHONPATH:$(pwd)"
          config="${CONFIG:-lite/marshmallow}"
          # Reuse the config's "--instances ..." line, renamed to the option
          # the evaluation script takes.
          tasks=$(awk '/--instances/ { sub(/--instances/, "--swe_bench_tasks"); print }' "./bin/solve_config/$config.txt")
          # $tasks is intentionally unquoted: it holds an option and its
          # value and must be split into separate arguments.
          python swebench/harness/run_evaluation.py \
            --predictions_path predictions.jsonl \
            $tasks \
            --log_dir logs \
            --testbed "${{ runner.temp }}" \
            --skip_existing \
            --timeout 900 \
            --verbose \
            --num_processes 8 \
            --path_conda "$(conda info --base)"

      - name: Compress evaluation results
        if: ${{ always() }}
        run: |
          # This step also runs after a failure, when an earlier step may not
          # have produced these paths; create them so tar does not error out.
          mkdir -p logs
          touch predictions.jsonl
          tar -cJf "output_${{ matrix.index }}.tar.xz" logs predictions.jsonl

      - name: Upload evaluation results
        uses: actions/upload-artifact@v4
        if: ${{ always() }}
        with:
          name: output_${{ matrix.index }}
          path: output_${{ matrix.index }}.tar.xz
          # The archive is already xz-compressed.
          compression-level: 0

  # Merges the per-shard archives and generates the combined report. Runs
  # when solve succeeded or failed, but not when it was skipped or cancelled.
  report:
    needs:
      - solve
    if: needs.solve.result == 'success' || needs.solve.result == 'failure'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Download evaluation results
        uses: actions/download-artifact@v4
        with:
          path: ./eval-results

      - name: Unpack evaluation results
        run: |
          mkdir -p logs
          touch predictions.jsonl
          # Without nullglob an unmatched pattern would be handed to tar as a
          # literal file name.
          shopt -s nullglob
          # One directory level below eval-results, then the archive itself.
          for file in eval-results/*/*.tar.xz; do
            # Append this shard's predictions to the combined file, then
            # extract its logs into the shared logs directory.
            tar --to-stdout -xJf "$file" predictions.jsonl >> predictions.jsonl
            tar -xJf "$file" logs
          done

      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}

      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml

      - name: Generate AppMap report
        if: always()
        env:
          CONFIG: ${{ inputs.config }}
        run: |
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH="$PYTHONPATH:$(pwd)"
          conda info
          config="${CONFIG:-lite/marshmallow}"
          instances=$(awk '/--instances/ { print }' "./bin/solve_config/$config.txt")
          split=$(awk '/--split/ { print }' "./bin/solve_config/$config.txt")
          # $instances and $split are intentionally unquoted: each holds an
          # option line from the config file and must be split into
          # arguments (and vanish entirely when empty).
          python solver/report.py \
            $instances \
            $split

      - name: Archive predictions and logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ github.run_id }}
          path: |
            logs/
            predictions.jsonl
            results.csv