feat: Generate test cases to evaluate each issue #549

Workflow file for this run

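# Benchmark workflow: solves SWE-bench instances with the AppMap solver across one or
# more matrix runners, evaluates the resulting patches with the SWE-bench harness, and
# publishes a combined report.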
name: Run the benchmark
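# Triggered manually via workflow_dispatch with the inputs below, or automatically on
# pull_request events (opened, synchronize, labeled).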
on:
  workflow_dispatch:
    inputs:
      filter:
        description: "Instance filter regex"
        required: false
        default: "django-1[67]"
      dataset:
        description: "Dataset name"
        required: true
        default: princeton-nlp/SWE-bench_Lite
        type: choice
        options:
          - princeton-nlp/SWE-bench_Lite
          - princeton-nlp/SWE-bench
      runner:
        description: "Runner type"
        required: true
        default: ubuntu-latest
        type: choice
        options:
          - ubuntu-latest
          - swe-bench-ubuntu-latest
          - SWE-Bench_Larger
      split:
        description: "Dataset split"
        required: true
        default: test
        type: choice
        options:
          - dev
          - test
      retries:
        description: "Number of retries to perform on each instance until a patch is found"
        required: false
        default: "3"
      num_runners:
        description: "Number of runners to split the workload across"
        required: true
        default: "1"
      name:
        description: "Assign a name to the workflow run"
        type: string
        required: false
      steps:
        description: "List of solver steps to perform"
        required: false
      instance_set:
        description: "Instance set to select a subset of instances"
        type: string
        required: false
  pull_request:
    types: [opened, synchronize, labeled]
run-name: ${{ inputs.name || github.event.pull_request.title || github.event.workflow.name }}
jobs:
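  # show-inputs: echo every dispatch input so the values used for this run are visible in the job log.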
  show-inputs:
    runs-on: 'ubuntu-latest'
    steps:
      - name: Display Input Values
        run: |
          echo "filter: ${{ github.event.inputs.filter }}"
          echo "dataset: ${{ github.event.inputs.dataset }}"
          echo "runner: ${{ github.event.inputs.runner }}"
          echo "split: ${{ github.event.inputs.split }}"
          echo "retries: ${{ github.event.inputs.retries }}"
          echo "num_runners: ${{ github.event.inputs.num_runners }}"
          echo "name: ${{ github.event.inputs.name }}"
          echo "steps: ${{ github.event.inputs.steps }}"
          echo "instance_set: ${{ github.event.inputs.instance_set }}"
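  # build-appmap-js: build the appmap-js submodule once and cache the result, keyed on the
  # submodule HEAD, so the solve runners can restore it instead of rebuilding.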
  build-appmap-js:
    runs-on: 'ubuntu-latest'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      # Cache the appmap-js build
      - name: Cache appmap-js build
        uses: actions/cache@v4
        id: cache-appmap-js
        with:
          lookup-only: true
          path: |
            submodules/appmap-js/node_modules
            submodules/appmap-js/packages/*/built
            submodules/appmap-js/packages/*/dist
            submodules/appmap-js/packages/*/node_modules
          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}
      - name: Set up Node.js
        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
        uses: actions/setup-node@v3
      - name: Build submodules
        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
        env:
          PUPPETEER_SKIP_DOWNLOAD: true
        run: |
          cd submodules/appmap-js
          git checkout -- .
          yarn
          yarn build
          chmod +x packages/cli/built/cli.js
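  # prepare-matrix: convert the num_runners input into a JSON array of runner indices
  # that drives the solve job's matrix.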
  prepare-matrix:
    runs-on: ubuntu-latest
    env:
      NUM_RUNNERS: ${{ inputs.num_runners }}
    outputs:
      matrix: ${{ steps.prepare-matrix.outputs.matrix }}
    steps:
      - name: Prepare matrix
        id: prepare-matrix
        run: |
          num_runners=${NUM_RUNNERS:-1}
          echo "Number of runners: $num_runners"
          indices=$(seq 0 $(($num_runners - 1)) | jq -R 'tonumber' | jq -s -c)
          echo "Matrix: $indices"
          echo "matrix=$indices" >> $GITHUB_OUTPUT
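  # Example: with num_runners=3 the step above writes matrix=[0,1,2]; fromJson() below fans
  # this out so each solve runner receives a distinct matrix.index (0, 1, or 2).
  # solve: one job per index; each runner solves its slice of the filtered instances and then
  # evaluates its own predictions with the SWE-bench harness.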
  solve:
    needs:
      - build-appmap-js
      - prepare-matrix
    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
    strategy:
      matrix:
        index: ${{ fromJson(needs['prepare-matrix'].outputs.matrix) }}
    defaults:
      run:
        shell: bash -leo pipefail {0}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}
      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml
      # Restore the appmap-js build
      - name: Restore appmap-js build
        uses: actions/cache/restore@v4
        id: cache-appmap-js
        with:
          fail-on-cache-miss: true
          path: |
            submodules/appmap-js/node_modules
            submodules/appmap-js/packages/*/built
            submodules/appmap-js/packages/*/dist
            submodules/appmap-js/packages/*/node_modules
          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}
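      # System packages for building and testing benchmark instances; the TeX, Ghostscript, and
      # image libraries are presumably needed by task suites such as matplotlib and sympy.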
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            texlive \
            texlive-xetex \
            dvipng \
            ghostscript \
            libfreetype-dev \
            libtiff-dev \
            libxrender1
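      # Run the solver. The ${VAR:-default} fallbacks keep the command usable when the workflow
      # is triggered by a pull_request and the dispatch inputs are empty.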
      - name: Run benchmark
        env:
          GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          SWE_DATASET: ${{ inputs.dataset }}
          SWE_SPLIT: ${{ inputs.split }}
          SWE_FILTER: ${{ inputs.filter }}
          SWE_RETRIES: ${{ inputs.retries }}
          NUM_RUNNERS: ${{ inputs.num_runners }}
        run: |
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          python appmap/solve.py \
            --instances "${SWE_DATASET:-princeton-nlp/SWE-bench_Lite}" \
            --split ${SWE_SPLIT:-dev} \
            --filter "${SWE_FILTER:-marshmallow-1343}" \
            --retries "${SWE_RETRIES:-3}" \
            --appmap_command $(pwd)/submodules/appmap-js/packages/cli/built/cli.js \
            --lint_command "flake8 --extend-ignore=BLK100,C402,C408,C416,D,E122,E124,E127,E128,E131,E201,E202,E203,E221,E225,E231,E251,E261,E265,E266,E302,E303,E305,E402,E501,E502,E713,E731,F401,F841,W291,W293" \
            --temp_dir "${{ runner.temp }}" \
            --path_conda $(conda info --base) \
            --num_runners "${NUM_RUNNERS:-1}" \
            --runner_index "${{ matrix.index }}" \
            --verbose \
            $(test -n "${{ inputs.steps }}" && echo --steps "${{ inputs.steps }}") \
            $(test -n "${{ inputs.instance_set }}" && echo --instance_set "${{ inputs.instance_set }}")
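      # Evaluate the predictions produced by the solver against the benchmark tasks, writing
      # per-instance logs under logs/.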
      - name: Run evaluation
        env:
          SWE_DATASET: ${{ inputs.dataset }}
        run: |
          mkdir -p logs
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          python swebench/harness/run_evaluation.py \
            --predictions_path predictions.jsonl \
            --swe_bench_tasks "${SWE_DATASET:-princeton-nlp/SWE-bench_Lite}" \
            --log_dir logs \
            --testbed "${{ runner.temp }}" \
            --skip_existing \
            --timeout 900 \
            --verbose \
            --num_processes 8 \
            --path_conda $(conda info --base)
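      # Package this runner's logs and predictions; always() keeps partial results available
      # even when an earlier step failed.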
      - name: Compress evaluation results
        if: ${{ always() }}
        run: |
          tar -cJf output_${{ matrix.index }}.tar.xz logs predictions.jsonl
      - name: Upload evaluation results
        uses: actions/upload-artifact@v4
        if: ${{ always() }}
        with:
          name: output_${{ matrix.index }}
          path: output_${{ matrix.index }}.tar.xz
          compression-level: 0
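  # report: merge the per-runner artifacts and produce the final report; runs whether solve
  # succeeded or failed, but not when it was cancelled or skipped.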
  report:
    needs:
      - solve
    if: needs.solve.result == 'success' || needs.solve.result == 'failure'
    runs-on: 'ubuntu-latest'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Download evaluation results
        uses: actions/download-artifact@v4
        with:
          path: ./eval-results
      - name: Unpack evaluation results
        run: |
          mkdir -p logs
          for file in eval-results/**/*.tar.xz; do
            tar --to-stdout -xJf "$file" predictions.jsonl >> predictions.jsonl
            tar -xJf "$file" logs
          done
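      # Restore (or recreate) the same conda environment used by the solve job, keyed on
      # environment.yml, so report.py runs with identical dependencies.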
      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}
      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml
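      # report.py aggregates predictions.jsonl and the evaluation logs; it is expected to write
      # results.csv, which is uploaded together with the logs in the final step.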
      - name: Generate AppMap report
        if: always()
        env:
          SWE_DATASET: ${{ inputs.dataset }}
          SWE_SPLIT: ${{ inputs.split }}
        run: |
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          conda info
          python appmap/report.py \
            --instances "${SWE_DATASET:-princeton-nlp/SWE-bench_Lite}" \
            --split ${SWE_SPLIT:-test}
      - name: Archive predictions and logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ github.run_id }}
          path: |
            logs/
            predictions.jsonl
            results.csv