# feat: Generate test cases to evaluate each issue #578
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Runs the SWE-bench solver across a matrix of runners, evaluates the
# generated predictions, and publishes a combined report.
name: Run the benchmark

on:
  # Manual trigger; the inputs below parameterize the benchmark run.
  workflow_dispatch:
    inputs:
      filter:
        description: "Instance filter regex"
        type: string
        required: false
        default: "django-1[67]"
      dataset:
        description: "Dataset name"
        type: choice
        required: true
        default: princeton-nlp/SWE-bench_Lite
        options:
          - princeton-nlp/SWE-bench_Lite
          - princeton-nlp/SWE-bench
      runner:
        description: "Runner type"
        type: choice
        required: true
        default: ubuntu-latest
        options:
          - ubuntu-latest
          - swe-bench-ubuntu-latest
          - SWE-Bench_Larger
      split:
        description: "Dataset split"
        type: choice
        required: true
        default: test
        options:
          - dev
          - test
      retries:
        description: "Number of retries to perform on each instance until a patch is found"
        type: string
        required: false
        default: "3"
      num_runners:
        description: "Number of runners to split the workload across"
        type: string
        required: true
        default: "1"
      name:
        description: "Assign a name to the workflow run"
        type: string
        required: false
      steps:
        description: "List of solver steps to perform"
        type: string
        required: false
      instance_set:
        description: "Instance set to select a subset of instances"
        type: string
        required: false
  # Pull-request runs supply no inputs; the jobs substitute their own
  # fallbacks (for example `${SWE_FILTER:-marshmallow-1343}`).
  pull_request:
    types: [opened, synchronize, labeled]

# Prefer the explicit run name, then the pull-request title.
# NOTE(review): on workflow_dispatch payloads `github.event.workflow` looks
# like a path string, so `.name` presumably resolves to empty and GitHub
# falls back to its default run name - confirm.
run-name: ${{ inputs.name || github.event.pull_request.title || github.event.workflow.name }}
jobs:
  # Prints the workflow_dispatch inputs into the job log.
  show-inputs:
    runs-on: ubuntu-latest
    steps:
      - name: Display Input Values
        # The inputs are handed to the script as environment variables rather
        # than being expanded into the script text, so quotes, `$(...)` or
        # backticks inside a value are printed instead of being executed.
        env:
          INPUT_FILTER: ${{ github.event.inputs.filter }}
          INPUT_DATASET: ${{ github.event.inputs.dataset }}
          INPUT_RUNNER: ${{ github.event.inputs.runner }}
          INPUT_SPLIT: ${{ github.event.inputs.split }}
          INPUT_RETRIES: ${{ github.event.inputs.retries }}
          INPUT_NUM_RUNNERS: ${{ github.event.inputs.num_runners }}
          INPUT_NAME: ${{ github.event.inputs.name }}
          INPUT_STEPS: ${{ github.event.inputs.steps }}
          INPUT_INSTANCE_SET: ${{ github.event.inputs.instance_set }}
        run: |
          echo "filter: $INPUT_FILTER"
          echo "dataset: $INPUT_DATASET"
          echo "runner: $INPUT_RUNNER"
          echo "split: $INPUT_SPLIT"
          echo "retries: $INPUT_RETRIES"
          echo "num_runners: $INPUT_NUM_RUNNERS"
          echo "name: $INPUT_NAME"
          echo "steps: $INPUT_STEPS"
          echo "instance_set: $INPUT_INSTANCE_SET"
# Job: build-appmap-js - builds the appmap-js submodule unless a cached build exists.
  build-appmap-js:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      # Cache the appmap-js build
      - name: Cache appmap-js build
        uses: actions/cache@v4
        id: cache-appmap-js
        with:
          # Only probe for the entry; this job never needs the cached files
          # themselves, it just skips the build when they already exist.
          lookup-only: true
          path: |
            submodules/appmap-js/node_modules
            submodules/appmap-js/packages/*/built
            submodules/appmap-js/packages/*/dist
            submodules/appmap-js/packages/*/node_modules
          # Keyed on the submodule's checked-out HEAD.
          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}
      - name: Set up Node.js
        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
        # v4 matches the major version of the other actions in this workflow;
        # v3 runs on the deprecated Node 16 action runtime.
        uses: actions/setup-node@v4
      - name: Build submodules
        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
        env:
          # Environment values are strings; quoted so YAML does not type it
          # as a boolean.
          PUPPETEER_SKIP_DOWNLOAD: "true"
        run: |
          cd submodules/appmap-js
          # Discard any local modifications in the submodule before building.
          git checkout -- .
          yarn
          yarn build
          chmod +x packages/cli/built/cli.js
# Job: prepare-matrix - converts num_runners into the JSON index list [0, ..., n-1].
  prepare-matrix:
    runs-on: ubuntu-latest
    env:
      NUM_RUNNERS: ${{ inputs.num_runners }}
    outputs:
      # JSON array of runner indices, e.g. [0,1,2] for num_runners=3.
      matrix: ${{ steps.prepare-matrix.outputs.matrix }}
    steps:
      - name: Prepare matrix
        id: prepare-matrix
        run: |
          # Runs without inputs (e.g. pull_request) fall back to one runner.
          num_runners="${NUM_RUNNERS:-1}"

          # Reject zero, negatives, leading zeros and non-numeric text up
          # front: zero would yield an empty matrix, and arbitrary text must
          # never reach shell arithmetic.
          if ! [[ "$num_runners" =~ ^[1-9][0-9]*$ ]]; then
            echo "::error::num_runners must be a positive integer, got '$num_runners'"
            exit 1
          fi

          echo "Number of runners: $num_runners"
          indices=$(jq -c -n --argjson n "$num_runners" '[range($n)]')
          echo "Matrix: $indices"
          echo "matrix=$indices" >> "$GITHUB_OUTPUT"
# Job: solve - one matrix shard: solve its instances, evaluate the patches, upload the results.
  solve:
    needs:
      - build-appmap-js
      - prepare-matrix
    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
    strategy:
      # Let the remaining shards finish when one of them fails, so their
      # results are still uploaded.
      fail-fast: false
      matrix:
        index: ${{ fromJson(needs['prepare-matrix'].outputs.matrix) }}
    defaults:
      run:
        shell: bash -leo pipefail {0}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}
      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml
      # Restore the appmap-js build
      - name: Restore appmap-js build
        uses: actions/cache/restore@v4
        id: cache-appmap-js
        with:
          # The build-appmap-js job is expected to have populated this entry.
          fail-on-cache-miss: true
          path: |
            submodules/appmap-js/node_modules
            submodules/appmap-js/packages/*/built
            submodules/appmap-js/packages/*/dist
            submodules/appmap-js/packages/*/node_modules
          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            texlive \
            texlive-xetex \
            dvipng \
            ghostscript \
            libfreetype-dev \
            libtiff-dev \
            libxrender1
      - name: Run benchmark
        # Every user-supplied input reaches the script through the
        # environment, never through `${{ }}` expansion inside the script
        # text, so input values cannot inject shell code.
        env:
          GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          SWE_DATASET: ${{ inputs.dataset }}
          SWE_SPLIT: ${{ inputs.split }}
          SWE_FILTER: ${{ inputs.filter }}
          SWE_RETRIES: ${{ inputs.retries }}
          SWE_STEPS: ${{ inputs.steps }}
          SWE_INSTANCE_SET: ${{ inputs.instance_set }}
          NUM_RUNNERS: ${{ inputs.num_runners }}
          RUNNER_INDEX: ${{ matrix.index }}
        run: |
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH="$PYTHONPATH:$(pwd)"

          # Optional flags are only passed when the matching input is set.
          # Values are split on whitespace into separate arguments.
          optional_args=()
          if [ -n "$SWE_STEPS" ]; then
            read -r -a step_words <<< "$SWE_STEPS"
            optional_args+=(--steps "${step_words[@]}")
          fi
          if [ -n "$SWE_INSTANCE_SET" ]; then
            read -r -a instance_set_words <<< "$SWE_INSTANCE_SET"
            optional_args+=(--instance_set "${instance_set_words[@]}")
          fi

          python solver/solve.py \
            --instances "${SWE_DATASET:-princeton-nlp/SWE-bench_Lite}" \
            --split "${SWE_SPLIT:-dev}" \
            --filter "${SWE_FILTER:-marshmallow-1343}" \
            --retries "${SWE_RETRIES:-3}" \
            --appmap_command "node $(pwd)/submodules/appmap-js/packages/cli/built/cli.js" \
            --lint_command "flake8 --extend-ignore=BLK100,C402,C408,C416,D,E122,E124,E127,E128,E131,E201,E202,E203,E221,E225,E231,E251,E261,E265,E266,E302,E303,E305,E402,E501,E502,E713,E731,F401,F841,W291,W293" \
            --temp_dir "$RUNNER_TEMP" \
            --path_conda "$(conda info --base)" \
            --num_runners "${NUM_RUNNERS:-1}" \
            --runner_index "$RUNNER_INDEX" \
            --verbose \
            "${optional_args[@]}"
      - name: Run evaluation
        env:
          SWE_DATASET: ${{ inputs.dataset }}
        run: |
          mkdir -p logs
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH="$PYTHONPATH:$(pwd)"
          python swebench/harness/run_evaluation.py \
            --predictions_path predictions.jsonl \
            --swe_bench_tasks "${SWE_DATASET:-princeton-nlp/SWE-bench_Lite}" \
            --log_dir logs \
            --testbed "$RUNNER_TEMP" \
            --skip_existing \
            --timeout 900 \
            --verbose \
            --num_processes 8 \
            --path_conda "$(conda info --base)"
      - name: Compress evaluation results
        if: ${{ always() }}
        run: |
          # This step also runs after a failed benchmark or evaluation, when
          # `logs` or `predictions.jsonl` may not exist yet. Create them so
          # tar always produces a complete archive.
          mkdir -p logs
          touch predictions.jsonl
          tar -cJf output_${{ matrix.index }}.tar.xz logs predictions.jsonl
      - name: Upload evaluation results
        uses: actions/upload-artifact@v4
        if: ${{ always() }}
        with:
          name: output_${{ matrix.index }}
          path: output_${{ matrix.index }}.tar.xz
          # The archive is already xz-compressed.
          compression-level: 0
# Job: report - merges the per-shard archives and generates the combined report.
  report:
    needs:
      - solve
    # A job-level `if` without a status function gets an implicit success()
    # check, which would skip this job whenever solve fails. always() lifts
    # that check so the job also runs after a failed solve; cancelled or
    # skipped solves are still excluded by the result comparison.
    if: ${{ always() && (needs.solve.result == 'success' || needs.solve.result == 'failure') }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Download evaluation results
        uses: actions/download-artifact@v4
        with:
          path: ./eval-results
          # Only the per-shard archives uploaded by the solve job.
          pattern: "output_*"
      - name: Unpack evaluation results
        run: |
          # globstar makes `**` recurse; nullglob turns an unmatched pattern
          # into an empty list instead of a literal file name.
          shopt -s nullglob globstar
          mkdir -p logs
          archives=(eval-results/**/*.tar.xz)
          if [ "${#archives[@]}" -eq 0 ]; then
            echo "::error::No evaluation archives found under eval-results"
            exit 1
          fi
          for archive in "${archives[@]}"; do
            # An archive from a shard that failed early may lack a member;
            # warn and continue so the other shards are still reported.
            tar --to-stdout -xJf "$archive" predictions.jsonl >> predictions.jsonl \
              || echo "::warning::$archive contains no predictions.jsonl"
            tar -xJf "$archive" logs \
              || echo "::warning::$archive contains no logs directory"
          done
      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}
      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml
      - name: Generate AppMap report
        if: always()
        env:
          SWE_DATASET: ${{ inputs.dataset }}
          SWE_SPLIT: ${{ inputs.split }}
        run: |
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH="$PYTHONPATH:$(pwd)"
          conda info
          # The fallbacks match the ones the solve job uses (Lite dataset,
          # dev split), so a run without inputs reads the same split that
          # was solved.
          python solver/report.py \
            --instances "${SWE_DATASET:-princeton-nlp/SWE-bench_Lite}" \
            --split "${SWE_SPLIT:-dev}"
      - name: Archive predictions and logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ github.run_id }}
          path: |
            logs/
            predictions.jsonl
            results.csv