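# Runs the SWE-bench evaluation harness against a predictions file, optionally
# sharded across several parallel runners. Each shard uploads its logs and
# predictions as an artifact; the report job merges them and generates the report.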
name: Run evaluation

on:
  workflow_dispatch:
    inputs:
      config:
        description: "Path to the configuration file in ./bin/solve_config"
        required: true
        type: string
        default: lite/marshmallow
      runner:
        description: "Runner type"
        required: true
        default: ubuntu-latest
        type: choice
        options:
          - ubuntu-latest
          - swe-bench-ubuntu-latest
          - SWE-Bench_Larger
      num_runners:
        description: "Number of runners to split the workload across"
        required: true
        default: "1"
      predictions:
        description: "Path to the predictions file"
        required: true
        type: string
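
# num_runners has no explicit type, so it reaches the workflow as a string;
# prepare-matrix still applies a shell-level default before doing arithmetic on it.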
jobs:
  prepare-matrix:
    runs-on: ubuntu-latest
    env:
      NUM_RUNNERS: ${{ inputs.num_runners }}
    outputs:
      matrix: ${{ steps.prepare-matrix.outputs.matrix }}
    steps:
      - name: Prepare matrix
        id: prepare-matrix
        run: |
          num_runners=${NUM_RUNNERS:-1}
          echo "Number of runners: $num_runners"
          indices=$(seq 0 $(($num_runners - 1)) | jq -R 'tonumber' | jq -s -c .)
          echo "Matrix: $indices"
          echo "matrix=$indices" >> $GITHUB_OUTPUT
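
  # The matrix fans out one solve job per index: num_runners=3 yields [0, 1, 2].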
  solve:
    needs:
      - prepare-matrix
    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
    strategy:
      matrix:
        index: ${{ fromJson(needs['prepare-matrix'].outputs.matrix) }}
    defaults:
      run:
        shell: bash -leo pipefail {0}
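    # A login shell (-l) sources the profile files that `conda init bash`
    # writes to; -e and -o pipefail abort each step on the first failure.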
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true

      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}
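      # The hashFiles key invalidates the cached environment whenever
      # environment.yml changes.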

      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml

      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            texlive \
            texlive-xetex \
            dvipng \
            ghostscript \
            libfreetype-dev \
            libtiff-dev \
            libxrender1
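      # The TeX and graphics packages above are presumably needed by the
      # repositories under test (test suites that render figures), not by
      # the evaluation harness itself.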

      - name: Run evaluation
        run: |
          mkdir -p logs
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH=$PYTHONPATH:$(pwd)
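          # The solve config lists tasks under --instances, but
          # run_evaluation.py takes --swe_bench_tasks; awk rewrites the flag.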
          tasks=$(awk '/--instances/ { sub(/--instances/, "--swe_bench_tasks"); print }' ./bin/solve_config/${{ inputs.config }}.txt)
          cp ${{ inputs.predictions }} predictions.jsonl
          python swebench/harness/run_evaluation.py \
            --predictions_path predictions.jsonl \
            $tasks \
            --log_dir logs \
            --testbed "${{ runner.temp }}" \
            --skip_existing \
            --timeout 900 \
            --verbose \
            --num_processes 8 \
            --path_conda $(conda info --base)

      - name: Compress evaluation results
        if: ${{ always() }}
        run: |
          tar -cJf output_${{ matrix.index }}.tar.xz logs predictions.jsonl

      - name: Upload evaluation results
        uses: actions/upload-artifact@v4
        if: ${{ always() }}
        with:
          name: output_${{ matrix.index }}
          path: output_${{ matrix.index }}.tar.xz
          compression-level: 0
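
  # Merge the per-shard artifacts and generate the report. always() is required
  # here: without a status-check function the implicit success() would skip
  # this job whenever any solve shard failed.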
  report:
    needs:
      - solve
    if: always() && (needs.solve.result == 'success' || needs.solve.result == 'failure')
    runs-on: 'ubuntu-latest'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Download evaluation results
        uses: actions/download-artifact@v4
        with:
          path: ./eval-results

      - name: Unpack evaluation results
        run: |
          mkdir -p logs
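          # Each shard's tarball carries its own predictions.jsonl and logs/;
          # concatenate the predictions and unpack the logs next to them.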
          for file in eval-results/**/*.tar.xz; do
            tar --to-stdout -xJf "$file" predictions.jsonl >> predictions.jsonl
            tar -xJf "$file" logs
          done

      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}

      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml

      - name: Generate AppMap report
        if: always()
        env:
          CONFIG: ${{ inputs.config }}
        run: |
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          conda info
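          # Reuse the --instances and --split flags verbatim from the solve
          # config so the report covers exactly the evaluated task set.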
          config="${CONFIG:-lite/marshmallow}"
          instances=$(awk '/--instances/ { print }' ./bin/solve_config/$config.txt)
          split=$(awk '/--split/ { print }' ./bin/solve_config/$config.txt)
          python solver/report.py \
            $instances \
            $split

      - name: Archive predictions and logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ github.run_id }}
          path: |
            logs/
            predictions.jsonl
            results.csv