# solve.yml
name: Run the benchmark
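# Triggered manually via workflow_dispatch (with the inputs below) or by pull
# request activity. On pull_request runs the inputs context is empty, so the
# shell-level fallbacks further down apply (gpt-4o-mini, lite/marshmallow,
# a single runner, ubuntu-latest).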
on:
  workflow_dispatch:
    inputs:
      llm:
        description: "LLM model to use"
        type: choice
        required: true
        default: "gpt-4o-mini"
        options:
          - gpt-4o
          - gpt-4o-mini
          - claude3.5
      config:
        description: "Path to the configuration file in ./bin/solve_config"
        required: true
        type: string
        default: lite/marshmallow
      runner:
        description: "Runner type"
        required: true
        default: ubuntu-latest
        type: choice
        options:
          - ubuntu-latest
          - swe-bench-ubuntu-latest
          - SWE-Bench_Larger
      num_runners:
        description: "Number of runners to split the workload across"
        required: true
        default: "1"
      name:
        description: "Assign a name to the workflow run"
        type: string
        required: false
  pull_request:
    types: [opened, synchronize, labeled]
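# The run name is the first non-empty value: an explicit name input, then the
# config path, then the pull request title, then the workflow name.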
run-name: ${{ inputs.name || inputs.config || github.event.pull_request.title || github.event.workflow.name }}
jobs:
  show-inputs:
    runs-on: ubuntu-latest
    steps:
      - name: Display Input Values
        run: |
          echo "llm: ${{ github.event.inputs.llm }}"
          echo "config: ${{ github.event.inputs.config }}"
          echo "runner: ${{ github.event.inputs.runner }}"
          echo "num_runners: ${{ github.event.inputs.num_runners }}"
          echo "name: ${{ github.event.inputs.name }}"
  build-appmap-js:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      # Cache the appmap-js build
      - name: Cache appmap-js build
        uses: actions/cache@v4
        id: cache-appmap-js
        with:
          lookup-only: true
          path: |
            submodules/appmap-js/node_modules
            submodules/appmap-js/packages/*/built
            submodules/appmap-js/packages/*/dist
            submodules/appmap-js/packages/*/node_modules
          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}
      - name: Set up Node.js
        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
        uses: actions/setup-node@v3
      - name: Build submodules
        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
        env:
          PUPPETEER_SKIP_DOWNLOAD: true
        run: |
          cd submodules/appmap-js
          git checkout -- .
          yarn
          yarn build
          chmod +x packages/cli/built/cli.js
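  # Turn the num_runners input into a JSON array of shard indices for the
  # solve matrix. For example, num_runners=3 yields [0,1,2].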
  prepare-matrix:
    runs-on: ubuntu-latest
    env:
      NUM_RUNNERS: ${{ inputs.num_runners }}
    outputs:
      matrix: ${{ steps.prepare-matrix.outputs.matrix }}
    steps:
      - name: Prepare matrix
        id: prepare-matrix
        run: |
          num_runners=${NUM_RUNNERS:-1}
          echo "Number of runners: $num_runners"
          indices=$(seq 0 $(($num_runners - 1)) | jq -R 'tonumber' | jq -s -c '.')
          echo "Matrix: $indices"
          echo "matrix=$indices" >> $GITHUB_OUTPUT
  solve:
    needs:
      - build-appmap-js
      - prepare-matrix
    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
    strategy:
      matrix:
        index: ${{ fromJson(needs['prepare-matrix'].outputs.matrix) }}
    defaults:
      run:
        shell: bash -leo pipefail {0}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}
      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml
      # Restore the appmap-js build
      - name: Restore appmap-js build
        uses: actions/cache/restore@v4
        id: cache-appmap-js
        with:
          fail-on-cache-miss: true
          path: |
            submodules/appmap-js/node_modules
            submodules/appmap-js/packages/*/built
            submodules/appmap-js/packages/*/dist
            submodules/appmap-js/packages/*/node_modules
          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            texlive \
            texlive-xetex \
            dvipng \
            ghostscript \
            libfreetype-dev \
            libtiff-dev \
            libxrender1
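      # The solver reads GH_TOKEN and OPENAI_API_KEY from repository secrets;
      # --num_runners and --runner_index tell ./bin/solve which slice of the
      # instance list this shard owns.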
      - name: Run benchmark
        env:
          GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          CONFIG: ${{ inputs.config }}
          LLM: ${{ inputs.llm }}
          NUM_RUNNERS: ${{ inputs.num_runners }}
        run: |
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          llm="${LLM:-gpt-4o-mini}"
          config="${CONFIG:-lite/marshmallow}"
          ./bin/solve "$llm" \
            "$config" \
            --temp_dir "${{ runner.temp }}" \
            --path_conda "$(conda info --base)" \
            --num_runners "${NUM_RUNNERS:-1}" \
            --runner_index "${{ matrix.index }}"
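      # The config file passes the instance list as --instances; the SWE-bench
      # harness expects the same value under --swe_bench_tasks, so awk rewrites
      # the flag name before handing it to run_evaluation.py. $tasks is left
      # unquoted on purpose so it expands into the flag plus its value.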
      - name: Run evaluation
        env:
          CONFIG: ${{ inputs.config }}
        run: |
          mkdir -p logs
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          config="${CONFIG:-lite/marshmallow}"
          tasks=$(awk '/--instances/ { sub(/--instances/, "--swe_bench_tasks"); print }' ./bin/solve_config/$config.txt)
          python swebench/harness/run_evaluation.py \
            --predictions_path predictions.jsonl \
            $tasks \
            --log_dir logs \
            --testbed "${{ runner.temp }}" \
            --skip_existing \
            --timeout 900 \
            --verbose \
            --num_processes 8 \
            --path_conda "$(conda info --base)"
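      # Bundle this shard's logs and predictions even if earlier steps failed.
      # compression-level 0 skips zip compression of the artifact, since the
      # tarball is already xz-compressed.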
      - name: Compress evaluation results
        if: ${{ always() }}
        run: |
          tar -cJf output_${{ matrix.index }}.tar.xz logs predictions.jsonl
      - name: Upload evaluation results
        uses: actions/upload-artifact@v4
        if: ${{ always() }}
        with:
          name: output_${{ matrix.index }}
          path: output_${{ matrix.index }}.tar.xz
          compression-level: 0
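  # Aggregate the per-shard artifacts into a single report. Runs whether the
  # solve shards succeeded or failed, but not when they were cancelled.
  # download-artifact with no name fetches every artifact from this run, each
  # into its own subdirectory under ./eval-results.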
  report:
    needs:
      - solve
    if: needs.solve.result == 'success' || needs.solve.result == 'failure'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Download evaluation results
        uses: actions/download-artifact@v4
        with:
          path: ./eval-results
      - name: Unpack evaluation results
        run: |
          mkdir -p logs
          for file in eval-results/**/*.tar.xz; do
            tar --to-stdout -xJf "$file" predictions.jsonl >> predictions.jsonl
            tar -xJf "$file" logs
          done
      # Cache the conda environment
      - name: Cache conda environment
        id: cache-conda
        uses: actions/cache@v4
        with:
          path: /usr/share/miniconda/envs/swe-bench
          key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }}
      # Create conda env if cache miss happens
      - name: Create conda env
        if: steps.cache-conda.outputs.cache-hit != 'true'
        run: |
          conda init bash
          conda env create -f environment.yml
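      # solver/report.py reuses the --instances and --split lines from the
      # config file verbatim; $instances and $split are intentionally unquoted
      # so each expands into a flag plus its value.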
      - name: Generate AppMap report
        if: always()
        env:
          CONFIG: ${{ inputs.config }}
        run: |
          source /usr/share/miniconda/etc/profile.d/conda.sh
          conda activate swe-bench
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          conda info
          config="${CONFIG:-lite/marshmallow}"
          instances=$(awk '/--instances/ { print }' ./bin/solve_config/$config.txt)
          split=$(awk '/--split/ { print }' ./bin/solve_config/$config.txt)
          python solver/report.py \
            $instances \
            $split
      - name: Archive predictions and logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ github.run_id }}
          path: |
            logs/
            predictions.jsonl
            results.csv