lite/challenge-subset #613

Workflow file for this run

	# Benchmark workflow. Triggered manually (workflow_dispatch, with the inputs
	# declared below) or by pull request activity.
	name: Run the benchmark

	on:
	  workflow_dispatch:
	    inputs:
	      llm:
	        description: "LLM model to use"
	        type: choice
	        required: true
	        default: "gpt-4o-mini"
	        options:
	          - gpt-4o
	          - gpt-4o-mini
	          - claude3.5
	      config:
	        description: "Path to the configuration file in ./bin/solve_config"
	        required: true
	        type: string
	        default: lite/marshmallow
	      runner:
	        description: "Runner type"
	        required: true
	        default: ubuntu-latest
	        type: choice
	        options:
	          - ubuntu-latest
	          - swe-bench-ubuntu-latest
	          - SWE-Bench_Larger
	      num_runners:
	        description: "Number of runners to split the workload across"
	        required: true
	        # Kept as a string (note the quoted default); it is exported to the
	        # jobs as NUM_RUNNERS.
	        type: string
	        default: "1"
	      name:
	        description: "Assign a name to the workflow run"
	        type: string
	        required: false

	  # Dispatch inputs are empty on pull_request runs.
	  pull_request:
	    types: [opened, synchronize, labeled]

	# Run title: the first non-empty value of the "name" input, the "config"
	# input, the pull request title, then github.event.workflow.name.
	run-name: ${{ inputs.name || inputs.config || github.event.pull_request.title || github.event.workflow.name }}

	# Job graph:
	#   show-inputs      echoes the dispatch inputs
	#   build-appmap-js  builds the appmap-js submodule unless its cache exists
	#   prepare-matrix   emits the JSON list of runner indices
	#   solve            per index: run the benchmark, evaluate, upload an archive
	#   report           merges the per-runner archives and generates the report
	jobs:
	  show-inputs:
	    runs-on: ubuntu-latest
	    steps:
	      - name: Display Input Values
	        # Inputs reach the script through the environment rather than being
	        # expanded into the script text, so their contents are never parsed
	        # as shell syntax.
	        env:
	          DISPATCH_LLM: ${{ github.event.inputs.llm }}
	          DISPATCH_CONFIG: ${{ github.event.inputs.config }}
	          DISPATCH_RUNNER: ${{ github.event.inputs.runner }}
	          DISPATCH_NUM_RUNNERS: ${{ github.event.inputs.num_runners }}
	          DISPATCH_NAME: ${{ github.event.inputs.name }}
	        run: |
	          echo "llm: $DISPATCH_LLM"
	          echo "config: $DISPATCH_CONFIG"
	          echo "runner: $DISPATCH_RUNNER"
	          echo "num_runners: $DISPATCH_NUM_RUNNERS"
	          echo "name: $DISPATCH_NAME"

	  build-appmap-js:
	    runs-on: ubuntu-latest
	    steps:
	      - name: Checkout
	        uses: actions/checkout@v4
	        with:
	          submodules: true

	      # Cache the appmap-js build. lookup-only: this job only needs to know
	      # whether the entry exists; the solve job restores it. The path list and
	      # key must stay identical to the "Restore appmap-js build" step there.
	      - name: Cache appmap-js build
	        uses: actions/cache@v4
	        id: cache-appmap-js
	        with:
	          lookup-only: true
	          path: |
	            submodules/appmap-js/node_modules
	            submodules/appmap-js/packages/*/built
	            submodules/appmap-js/packages/*/dist
	            submodules/appmap-js/packages/*/node_modules
	          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}

	      # The two steps below run only when the cache lookup missed.
	      - name: Set up Node.js
	        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
	        uses: actions/setup-node@v3

	      - name: Build submodules
	        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
	        env:
	          PUPPETEER_SKIP_DOWNLOAD: "true"
	        run: |
	          cd submodules/appmap-js
	          git checkout -- .
	          yarn
	          yarn build
	          chmod +x packages/cli/built/cli.js

	  prepare-matrix:
	    runs-on: ubuntu-latest
	    env:
	      NUM_RUNNERS: ${{ inputs.num_runners }}
	    outputs:
	      matrix: ${{ steps.prepare-matrix.outputs.matrix }}
	    steps:
	      - name: Prepare matrix
	        id: prepare-matrix
	        run: |
	          # NUM_RUNNERS is empty on pull_request runs; fall back to one runner.
	          num_runners=${NUM_RUNNERS:-1}
	          # Reject zero and non-numeric values here: they would otherwise
	          # produce an empty matrix or a shell arithmetic error.
	          if ! [[ "$num_runners" =~ ^[1-9][0-9]*$ ]]; then
	            echo "num_runners must be a positive integer, got: '$num_runners'" >&2
	            exit 1
	          fi
	          echo "Number of runners: $num_runners"
	          # Build a compact JSON array [0, 1, ..., num_runners - 1].
	          indices=$(seq 0 $((num_runners - 1)) | jq -R 'tonumber' | jq -s -c)
	          echo "Matrix: $indices"
	          echo "matrix=$indices" >> "$GITHUB_OUTPUT"

	  solve:
	    needs:
	      - build-appmap-js
	      - prepare-matrix
	    # inputs.runner is empty on pull_request runs, hence the fallback.
	    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
	    strategy:
	      matrix:
	        index: ${{ fromJson(needs['prepare-matrix'].outputs.matrix) }}
	    defaults:
	      run:
	        shell: bash -leo pipefail {0}
	    steps:
	      - name: Checkout
	        uses: actions/checkout@v4
	        with:
	          submodules: true

	      # Cache the conda environment
	      - name: Cache conda environment
	        id: cache-conda
	        uses: actions/cache@v4
	        with:
	          path: /usr/share/miniconda/envs/swe-bench
	          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}

	      # Create conda env if cache miss happens
	      - name: Create conda env
	        if: steps.cache-conda.outputs.cache-hit != 'true'
	        run: |
	          conda init bash
	          conda env create -f environment.yml

	      # Restore the appmap-js build. Path list and key mirror the cache step
	      # in build-appmap-js; a miss fails the job.
	      - name: Restore appmap-js build
	        uses: actions/cache/restore@v4
	        id: cache-appmap-js
	        with:
	          fail-on-cache-miss: true
	          path: |
	            submodules/appmap-js/node_modules
	            submodules/appmap-js/packages/*/built
	            submodules/appmap-js/packages/*/dist
	            submodules/appmap-js/packages/*/node_modules
	          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}

	      - name: Install dependencies
	        run: |
	          sudo apt-get update
	          sudo apt-get install -y \
	            texlive \
	            texlive-xetex \
	            dvipng \
	            ghostscript \
	            libfreetype-dev \
	            libtiff-dev \
	            libxrender1

	      - name: Run benchmark
	        env:
	          GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
	          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
	          CONFIG: ${{ inputs.config }}
	          LLM: ${{ inputs.llm }}
	          NUM_RUNNERS: ${{ inputs.num_runners }}
	        run: |
	          source /usr/share/miniconda/etc/profile.d/conda.sh
	          conda activate swe-bench
	          export PYTHONPATH=$PYTHONPATH:$(pwd)

	          # Same defaults as the dispatch inputs, for pull_request runs.
	          llm="${LLM:-gpt-4o-mini}"
	          config="${CONFIG:-lite/marshmallow}"

	          ./bin/solve "$llm" \
	            "$config" \
	            --temp_dir "${{ runner.temp }}" \
	            --path_conda "$(conda info --base)" \
	            --num_runners "${NUM_RUNNERS:-1}" \
	            --runner_index "${{ matrix.index }}"

	      - name: Run evaluation
	        env:
	          CONFIG: ${{ inputs.config }}
	        run: |
	          mkdir -p logs
	          source /usr/share/miniconda/etc/profile.d/conda.sh
	          conda activate swe-bench
	          export PYTHONPATH=$PYTHONPATH:$(pwd)

	          config="${CONFIG:-lite/marshmallow}"
	          # Take the "--instances ..." line of the config file and rename the
	          # option to "--swe_bench_tasks".
	          tasks=$(awk '/--instances/ { sub(/--instances/, "--swe_bench_tasks"); print }' "./bin/solve_config/$config.txt")

	          # $tasks is deliberately unquoted so that it word-splits into
	          # separate arguments (presumably the option and its value).
	          python swebench/harness/run_evaluation.py \
	            --predictions_path predictions.jsonl \
	            $tasks \
	            --log_dir logs \
	            --testbed "${{ runner.temp }}" \
	            --skip_existing \
	            --timeout 900 \
	            --verbose \
	            --num_processes 8 \
	            --path_conda "$(conda info --base)"

	      # Archive and upload even when an earlier step failed.
	      - name: Compress evaluation results
	        if: ${{ always() }}
	        run: |
	          tar -cJf output_${{ matrix.index }}.tar.xz logs predictions.jsonl

	      - name: Upload evaluation results
	        uses: actions/upload-artifact@v4
	        if: ${{ always() }}
	        with:
	          name: output_${{ matrix.index }}
	          path: output_${{ matrix.index }}.tar.xz
	          # The file is already xz-compressed.
	          compression-level: 0

	  report:
	    needs:
	      - solve
	    # always() is required: without a status function the implicit success()
	    # check skips this job whenever solve fails, so the 'failure' branch
	    # could never be taken.
	    if: ${{ always() && (needs.solve.result == 'success' || needs.solve.result == 'failure') }}
	    runs-on: ubuntu-latest
	    steps:
	      - name: Checkout
	        uses: actions/checkout@v4
	        with:
	          submodules: true

	      # No artifact name is given, so every artifact of the run is downloaded
	      # into its own subdirectory of ./eval-results.
	      - name: Download evaluation results
	        uses: actions/download-artifact@v4
	        with:
	          path: ./eval-results

	      - name: Unpack evaluation results
	        run: |
	          mkdir -p logs
	          # Archives are eval-results/<artifact>/output_<index>.tar.xz.
	          shopt -s nullglob
	          archives=(eval-results/*/*.tar.xz)
	          if (( ${#archives[@]} == 0 )); then
	            echo "No evaluation archives found under eval-results/" >&2
	            exit 1
	          fi
	          for file in "${archives[@]}"; do
	            # Concatenate the per-runner predictions; merge the log trees.
	            tar --to-stdout -xJf "$file" predictions.jsonl >> predictions.jsonl
	            tar -xJf "$file" logs
	          done

	      # Cache the conda environment
	      - name: Cache conda environment
	        id: cache-conda
	        uses: actions/cache@v4
	        with:
	          path: /usr/share/miniconda/envs/swe-bench
	          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}

	      # Create conda env if cache miss happens
	      - name: Create conda env
	        if: steps.cache-conda.outputs.cache-hit != 'true'
	        run: |
	          conda init bash
	          conda env create -f environment.yml

	      - name: Generate AppMap report
	        if: always()
	        env:
	          CONFIG: ${{ inputs.config }}
	        run: |
	          source /usr/share/miniconda/etc/profile.d/conda.sh
	          conda activate swe-bench
	          export PYTHONPATH=$PYTHONPATH:$(pwd)
	          conda info

	          config="${CONFIG:-lite/marshmallow}"
	          # Reuse the "--instances" and "--split" lines of the config file.
	          instances=$(awk '/--instances/ { print }' "./bin/solve_config/$config.txt")
	          split=$(awk '/--split/ { print }' "./bin/solve_config/$config.txt")

	          # Both variables are deliberately unquoted so that they word-split
	          # into separate arguments.
	          python solver/report.py \
	            $instances \
	            $split

	      - name: Archive predictions and logs
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: results-${{ github.run_id }}
	          path: |
	            logs/
	            predictions.jsonl
	            results.csv

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

lite/challenge-subset #613

Workflow file

lite/challenge-subset #613

Jobs

Run details

Workflow file for this run