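# Runs the SWE-bench evaluation harness against a predictions file, optionally
# sharded across several parallel runners. Each shard uploads its logs and
# predictions as an artifact; the report job merges them and generates the report.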
name: Run evaluation

on:
  workflow_dispatch:
    inputs:
      config:
        description: "Path to the configuration file in ./bin/solve_config"
        required: true
        type: string
        default: lite/marshmallow
      runner:
        description: "Runner type"
        required: true
        default: ubuntu-latest
        type: choice
        options:
          - ubuntu-latest
          - swe-bench-ubuntu-latest
          - SWE-Bench_Larger
      num_runners:
        description: "Number of runners to split the workload across"
        required: true
        default: "1"
      predictions:
        description: "Path to the predictions file"
        required: true
        type: string
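
# num_runners has no explicit type, so it reaches the workflow as a string;
# prepare-matrix still applies a shell-level default before doing arithmetic on it.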
jobs:
  prepare-matrix:
    runs-on: ubuntu-latest
    env:
      NUM_RUNNERS: ${{ inputs.num_runners }}
    outputs:
      matrix: ${{ steps.prepare-matrix.outputs.matrix }}
    steps:
      - name: Prepare matrix
        id: prepare-matrix
        run: |
          num_runners=${NUM_RUNNERS:-1}
          echo "Number of runners: $num_runners"
          indices=$(seq 0 $(($num_runners - 1)) | jq -R 'tonumber' | jq -s -c .)
          echo "Matrix: $indices"
          echo "matrix=$indices" >> $GITHUB_OUTPUT
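
  # The matrix fans out one solve job per index: num_runners=3 yields [0, 1, 2].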
  solve:
    needs:
      - prepare-matrix
    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
    strategy:
      matrix:
        index: ${{ fromJson(needs['prepare-matrix'].outputs.matrix) }}
    defaults:
      run:
        shell: bash -leo pipefail {0}
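    # A login shell (-l) sources the profile files that `conda init bash`
    # writes to; -e and -o pipefail abort each step on the first failure.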
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true

      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}
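      # The hashFiles key invalidates the cached environment whenever
      # environment.yml changes.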

      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml

      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            texlive \
            texlive-xetex \
            dvipng \
            ghostscript \
            libfreetype-dev \
            libtiff-dev \
            libxrender1
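      # The TeX and graphics packages above are presumably needed by the
      # repositories under test (test suites that render figures), not by
      # the evaluation harness itself.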

      - name: Run evaluation
        run: |
          mkdir -p logs
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH=$PYTHONPATH:$(pwd)
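          # The solve config lists tasks under --instances, but
          # run_evaluation.py takes --swe_bench_tasks; awk rewrites the flag.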
          tasks=$(awk '/--instances/ { sub(/--instances/, "--swe_bench_tasks"); print }' ./bin/solve_config/${{ inputs.config }}.txt)
          cp ${{ inputs.predictions }} predictions.jsonl
          python swebench/harness/run_evaluation.py \
            --predictions_path predictions.jsonl \
            $tasks \
            --log_dir logs \
            --testbed "${{ runner.temp }}" \
            --skip_existing \
            --timeout 900 \
            --verbose \
            --num_processes 8 \
            --path_conda $(conda info --base)

      - name: Compress evaluation results
        if: ${{ always() }}
        run: |
          tar -cJf output_${{ matrix.index }}.tar.xz logs predictions.jsonl

      - name: Upload evaluation results
        uses: actions/upload-artifact@v4
        if: ${{ always() }}
        with:
          name: output_${{ matrix.index }}
          path: output_${{ matrix.index }}.tar.xz
          compression-level: 0
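
  # Merge the per-shard artifacts and generate the report. always() is required
  # here: without a status-check function the implicit success() would skip
  # this job whenever any solve shard failed.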
  report:
    needs:
      - solve
    if: always() && (needs.solve.result == 'success' || needs.solve.result == 'failure')
    runs-on: 'ubuntu-latest'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Download evaluation results
        uses: actions/download-artifact@v4
        with:
          path: ./eval-results

      - name: Unpack evaluation results
        run: |
          mkdir -p logs
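          # Each shard's tarball carries its own predictions.jsonl and logs/;
          # concatenate the predictions and unpack the logs next to them.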
          for file in eval-results/**/*.tar.xz; do
            tar --to-stdout -xJf "$file" predictions.jsonl >> predictions.jsonl
            tar -xJf "$file" logs
          done

      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}

      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml

      - name: Generate AppMap report
        if: always()
        env:
          CONFIG: ${{ inputs.config }}
        run: |
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          conda info
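          # Reuse the --instances and --split flags verbatim from the solve
          # config so the report covers exactly the evaluated task set.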
          config="${CONFIG:-lite/marshmallow}"
          instances=$(awk '/--instances/ { print }' ./bin/solve_config/$config.txt)
          split=$(awk '/--split/ { print }' ./bin/solve_config/$config.txt)
          python solver/report.py \
            $instances \
            $split

      - name: Archive predictions and logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ github.run_id }}
          path: |
            logs/
            predictions.jsonl
            results.csv