diff --git a/.github/workflows/solve.yml b/.github/workflows/solve.yml new file mode 100644 index 00000000..8faca73d --- /dev/null +++ b/.github/workflows/solve.yml @@ -0,0 +1,139 @@ +on: + workflow_dispatch: + inputs: + filter: + description: "Instance filter" + required: true + default: marshmallow + dataset: + description: "Dataset name" + required: true + default: princeton-nlp/SWE-bench_Lite + split: + description: "Dataset split" + required: true + default: dev + retries: + description: "Number of retries to perform on each instance until a patch is found" + required: false + default: "3" + + pull_request: + +jobs: + solve: + if: ${{ contains(github.event.pull_request.labels.*.name, 'evaluate') || github.event_name == 'workflow_dispatch' }} + runs-on: swe-bench-ubuntu-latest + defaults: + run: + shell: bash -leo pipefail {0} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + + - name: Set up Python + uses: actions/setup-python@v4 + + # Cache the conda environment + - name: Cache conda environment + id: cache-conda + uses: actions/cache@v3 + with: + path: /usr/share/miniconda/envs/swe-bench + key: conda-${{ runner.os }}-${{ hashFiles('environment.yml') }} + + # Create conda env if cache miss happens + - name: Create conda env + if: steps.cache-conda.outputs.cache-hit != 'true' + run: | + conda init bash + conda env create -f environment.yml + pip install flake8 black + + # Cache the appmap-js build + - name: Cache appmap-js build + uses: actions/cache@v3 + id: cache-appmap-js + with: + path: | + submodules/appmap-js/node_modules + submodules/appmap-js/packages/*/built + key: appmap-js-${{ runner.os }}-${{ hashFiles('submodules/appmap-js/package.json') }} + + - name: Build submodules + # TODO: figure out why it doesn't work with cache + # if: steps.cache-appmap-js.outputs.cache-hit != 'true' + env: + PUPPETEER_SKIP_DOWNLOAD: true + run: | + cd submodules/appmap-js + git checkout -- . 
+ yarn + yarn build + chmod +x packages/cli/built/cli.js + + - name: Run benchmark + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + SWE_DATASET: ${{ inputs.dataset }} + SWE_SPLIT: ${{ inputs.split }} + SWE_FILTER: ${{ inputs.filter }} + SWE_RETRIES: ${{ inputs.retries }} + run: | + source /usr/share/miniconda/etc/profile.d/conda.sh + conda activate swe-bench + export PYTHONPATH=$PYTHONPATH:$(pwd) + python appmap/solve.py \ + --instances ${SWE_DATASET:-princeton-nlp/SWE-bench_Lite} \ + --split ${SWE_SPLIT:-dev} \ + --filter ${SWE_FILTER:-marshmallow} \ + --retries ${SWE_RETRIES:-3} \ + --appmap_command $(pwd)/submodules/appmap-js/packages/cli/built/cli.js \ + --lint_command "flake8 --extend-ignore=BLK100,W293,E201,E202,E303,E501,E128,E231,C408,F401,C402,E402,C416,E261,E302,D" \ + --temp_dir ${{ runner.temp }} \ + --num_workers 6 \ + --path_conda $(conda info --base) \ + --verbose + + - name: Run evaluation + env: + SWE_DATASET: ${{ inputs.dataset }} + run: | + mkdir -p logs + source /usr/share/miniconda/etc/profile.d/conda.sh + conda activate swe-bench + export PYTHONPATH=$PYTHONPATH:$(pwd) + python swebench/harness/run_evaluation.py \ + --predictions_path predictions.jsonl \ + --swe_bench_tasks ${SWE_DATASET:-princeton-nlp/SWE-bench_Lite} \ + --log_dir logs \ + --testbed ${{ runner.temp }} \ + --skip_existing \ + --timeout 900 \ + --verbose \ + --num_processes 8 \ + --path_conda $(conda info --base) + + - name: Generate AppMap report + env: + SWE_DATASET: ${{ inputs.dataset }} + SWE_SPLIT: ${{ inputs.split }} + run: | + source /usr/share/miniconda/etc/profile.d/conda.sh + conda activate swe-bench + export PYTHONPATH=$PYTHONPATH:$(pwd) + conda info + python appmap/report.py \ + --instances ${SWE_DATASET:-princeton-nlp/SWE-bench_Lite} \ + --split ${SWE_SPLIT:-dev} + + - name: Archive predictions and logs + uses: actions/upload-artifact@v4 + with: + name: results + path: | + logs/ + predictions.jsonl + results.csv diff --git a/.gitignore b/.gitignore index 5fd289a5..a0489ac7 100644 --- a/.gitignore +++ b/.gitignore @@ -174,3 +174,9 @@ analysis/evaluation/*.csv analysis/evaluation/*.pdf data/repos/copies notebooks/ +*.csv +appmap.sh +work +appmap/datasets +logs + diff --git a/.gitmodules b/.gitmodules index e69de29b..37673848 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "submodules/appmap-js"] + path = submodules/appmap-js + url = https://github.com/getappmap/appmap-js + branch = feat/apply-command diff --git a/appmap/__init__.py b/appmap/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/appmap/data.py b/appmap/data.py new file mode 100644 index 00000000..8d7ce2f2 --- /dev/null +++ b/appmap/data.py @@ -0,0 +1,17 @@ +from datasets import Dataset, load_dataset, load_from_disk +from pathlib import Path + +datasets_dir = Path(__file__).parent / "datasets" + + +def load_data(dataset_name, split) -> Dataset: + dataset_dir = datasets_dir / dataset_name.replace("/", "__") + dataset = None + if Path(dataset_dir).exists(): + dataset = load_from_disk(str(dataset_dir)) + else: + dataset = load_dataset(dataset_name) + Path.mkdir(dataset_dir, parents=True) + dataset.save_to_disk(str(dataset_dir)) + + return dataset[split] diff --git a/appmap/make_appmaps.py b/appmap/make_appmaps.py index a1d09172..da48ba7e 100644 --- a/appmap/make_appmaps.py +++ b/appmap/make_appmaps.py @@ -1,12 +1,12 @@ import argparse, glob, itertools, os, tarfile, subprocess from multiprocessing import Pool, cpu_count -from swebench.harness.constants import
MAP_REPO_TO_TEST_FRAMEWORK, PatchType +from swebench.harness.constants import MAP_REPO_TO_TEST_FRAMEWORK from swebench.harness.context_manager import ( TaskEnvContextManager, TestbedContextManager, ) -from swebench.harness.utils import get_instances, split_instances, DotDict +from swebench.harness.utils import split_instances, DotDict from swebench.metrics.getters import get_eval_refs @@ -36,9 +36,9 @@ def validate_args(args): # If value is provided, check that it is valid if args.timeout is not None and args.timeout < 0: - raise ValueError(f"Timeout must be a positive integer") + raise ValueError("Timeout must be a positive integer") if args.num_workers is not None and args.num_workers < 1: - raise ValueError(f"Number of workers must be a positive integer") + raise ValueError("Number of workers must be a positive integer") if not os.path.exists(appmap_bin): raise ValueError(f"Could not find appmap binary at {args.appmap_bin}") @@ -252,7 +252,7 @@ def main(args): "--num_workers", type=int, default=None, help="(Optional) Number of workers" ) parser.add_argument( - "--appmap-bin", + "--appmap_bin", type=str, help="path to appmap binary", default="~/.appmap/bin/appmap", diff --git a/appmap/navie_issue.py b/appmap/navie_issue.py index 1877e3f5..b3dbd1f8 100755 --- a/appmap/navie_issue.py +++ b/appmap/navie_issue.py @@ -9,7 +9,6 @@ from datasets import DatasetDict, load_dataset, load_from_disk from swebench.harness.utils import clone_to -from swebench.metrics.getters import get_eval_refs from subprocess import PIPE, Popen import json from filelock import FileLock diff --git a/appmap/report.py b/appmap/report.py new file mode 100644 index 00000000..4b7145d9 --- /dev/null +++ b/appmap/report.py @@ -0,0 +1,96 @@ +import argparse +import csv +import os + +from swebench import get_model_report +from appmap.data import load_data + + +def main(predictions, instances, log_dir, model, split, save_results, verbose, output): + report = get_model_report( + model=model, + predictions_path=os.path.abspath(predictions), + swe_bench_tasks=instances, + log_dir=os.path.join(log_dir, model), + verbose=verbose, + ) + + for k, v in report.items(): + print(f"{k}: {len(v)}") + + if save_results: + dataset = load_data(instances, split) + write_csv_report( + report, + dataset, + split, + output, + ) + + +def write_csv_report(report_map, dataset, split, output_csv_path): + # Prepare CSV headers + headers = ["instance_id", "split"] + [ + key for key in report_map.keys() if key != "no_generation" + ] + + all_preds = set() + for ids in report_map.values(): + all_preds.update(ids) + + # Write to CSV + with open(output_csv_path, "w", newline="") as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=headers) + writer.writeheader() + for instance in dataset.to_list(): + if instance["instance_id"] not in all_preds: + continue + row = {"instance_id": instance["instance_id"], "split": split} + for category in headers[len(row) :]: + row[category] = instance["instance_id"] in report_map.get(category, []) + writer.writerow(row) + + print(f"Wrote {len(all_preds)} predictions to {output_csv_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--predictions", + type=str, + default="predictions.jsonl", + help="Path to predictions file", + ) + parser.add_argument( + "--instances", + type=str, + help="huggingface name of task instances dataset", + default="princeton-nlp/SWE-bench_Lite", + ) + parser.add_argument( + "--log_dir", type=str, help="Path to log directory", default="logs" + 
) + parser.add_argument( + "--model", + type=str, + default="navie", + help="Name of folder containing model evaluation results (e.g. '20240402_sweagent_gpt4')", + ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Name of split to get evaluation results for (should be parent folder, e.g. 'test', 'dev')", + choices=["test", "dev"], + ) + parser.add_argument( + "--save_results", default=True, action="store_true", help="Save results to file" + ) + parser.add_argument( + "--verbose", action="store_true", help="Show intermediate messages" + ) + parser.add_argument( + "--output", type=str, default="results.csv", help="Path to output file" + ) + args = parser.parse_args() + main(**vars(args)) diff --git a/appmap/solve.py b/appmap/solve.py new file mode 100644 index 00000000..6ff903d1 --- /dev/null +++ b/appmap/solve.py @@ -0,0 +1,256 @@ +import argparse +import json +from pathlib import Path +from multiprocessing import Pool, current_process, cpu_count +from swebench.harness.context_manager import ( + TestbedContextManager, + TaskEnvContextManager, +) +from swebench.harness.utils import split_instances, DotDict +from subprocess import run +from os.path import abspath +from filelock import FileLock +from data import load_data + + +def output_results(instance, output_file, patch): + if patch is None: + return + instance["model_patch"] = patch + instance["model_name_or_path"] = "navie" + with FileLock(f"{output_file}.lock"): + with open(output_file, "a+") as f: + f.write(json.dumps(instance) + "\n") + + +def solve_instance(instance, log_dir, testbed, appmap_command, lint_command, retries): + issue_dir = Path(log_dir) / "solve" / instance["instance_id"] + issue_dir.mkdir(parents=True, exist_ok=True) + issue_file = issue_dir / "issue.txt" + with open(issue_file, "w") as f: + f.write(instance["problem_statement"]) + + solver_path = Path(__file__).parent / "solve" / "solver.py" + run_args = [ + "python", + str(solver_path), + str(issue_file), + "--retries", + str(retries), + "--log-dir", + log_dir, + "--appmap-command", + appmap_command, + ] + if lint_command is not None: + run_args.extend(["--lint-command", lint_command]) + + try: + # Run this as a separate process so that it can change the working directory.
+ run(run_args, check=True, cwd=testbed) + output = run( + ["git", "--no-pager", "diff"], + check=True, + cwd=testbed, + capture_output=True, + text=True, + ) + return output.stdout + except Exception: + print(f"Error processing {instance['instance_id']}") + import traceback + + traceback.print_exc() + + +def worker_init(data: dict): + """ + Args: + data: Dict containing task instances and other data + conda_link: URL to conda installation to use + task_instances: List of task instances + log_dir: Path to log directory + path_conda: Path to miniconda3 or anaconda installation + testbed: Path to testbed directory + temp_dir: Path to temporary directory for storing virtual envs + timeout: Timeout (seconds) for testing script execution + verbose: Verbose mode + output_file: Path to output file + """ + data_dict = DotDict(data) + + assert data_dict.output is not None + assert data_dict.appmap_command is not None + assert data_dict.path_conda is not None + assert data_dict.retries is not None + + output_file = abspath(data_dict.output) + + try: + with TestbedContextManager( + data_dict.task_instances, + data_dict.log_dir, + conda_link=data_dict.conda_link, + path_conda=data_dict.path_conda, + testbed=data_dict.testbed, + temp_dir=data_dict.temp_dir, + timeout=data_dict.timeout, + verbose=data_dict.verbose, + keep=data_dict.keep, + ) as tcm: + for instance in data_dict.task_instances: + repo_prefix = instance["repo"].replace("/", "__") + env_name = f"{repo_prefix}__{instance['version']}" + testbed = Path(tcm.testbed) / env_name + log_dir = abspath(data_dict.log_dir) + try: + with TaskEnvContextManager( + instance, + testbed.as_posix(), + env_name, + log_dir, + data_dict.path_conda, + timeout=data_dict.timeout, + verbose=data_dict.verbose, + log_suffix=data_dict.log_suffix, + ) as task_manager: + if not task_manager.reset_task_env(instance): + continue + patch = solve_instance( + instance, + log_dir, + testbed, + data_dict.appmap_command, + data_dict.lint_command, + data_dict.retries + ) + output_results(instance, output_file, patch) + except Exception: + print(f"Error processing {instance['instance_id']}") + import traceback + traceback.print_exc() + except Exception: + print("Error instantiating testbed") + import traceback + traceback.print_exc() + + +def solve_instances(instances, args): + if args.filter is not None: + instances = [ + instance for instance in instances if args.filter in instance["instance_id"] + ] + + instance_groups = split_instances(list(instances), args.num_workers) + data_groups = [ + { + "task_instances": g, + "func": solve_instance, + **vars(args), + } + for g in instance_groups + ] + + if args.num_workers == 1: + worker_init(data_groups[0]) + return + + pool = Pool(processes=args.num_workers) + pool.map(worker_init, data_groups) + pool.close() + pool.join() + + +def main(args): + dataset = load_data(args.instances_path, args.split) + solve_instances(dataset, args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--instances_path", + "--instances", + type=str, + help="path or huggingface name of task instances dataset", + default="princeton-nlp/SWE-bench_Lite", + ) + parser.add_argument( + "--split", type=str, default="test", help="Dataset split to use" + ) + parser.add_argument( + "--log_dir", type=str, help="Path to log directory", default="logs" + ) + parser.add_argument( + "--conda_link", + type=str, + default=None, + help="(Optional) URL to conda installation to use", + ) + parser.add_argument( + "--log_suffix", + type=str, + default=None, + help="(Optional) Suffix to append to log file names", + ) + parser.add_argument( + "--path_conda", + type=str, + help="(Optional) Path to miniconda3 or anaconda installation", + ) + parser.add_argument( + "--testbed", type=str, help="(Optional) Path to testbed directory" + ) + parser.add_argument( + "--temp_dir", + type=str, + help="(Optional) Path to temporary directory for storing virtual envs", + ) + parser.add_argument( + "--timeout", + type=int, + default=None, + help="(Optional) Timeout (seconds) for testing script execution", + ) + parser.add_argument( + "--retries", + type=int, + default=3, + help="Number of times to try and create a code update for each test instance", + ) + parser.add_argument( + "--verbose", action="store_true", help="(Optional) Verbose mode" + ) + parser.add_argument( + "--num_workers", + type=int, + default=cpu_count(), + help="(Optional) Number of workers", + ) + parser.add_argument( + "--filter", + type=str, + default=None, + help="(Optional) Filter to apply to task instances", + ) + parser.add_argument( + "--appmap_command", type=str, default="appmap", help="Path to appmap command" + ) + parser.add_argument( + "--lint_command", + type=str, + help="Path to lint command. Example: flake8 --extend-ignore=BLK100,W293,E501,E302,D", + ) + parser.add_argument( + "--output", + type=str, + default="predictions.jsonl", + help="Path to output predictions", + ) + parser.add_argument( + "--keep", + action="store_true", + help="(Optional) Keep temporary directories after running", + ) + args = parser.parse_args() + main(args)
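For reference, `output_results` above serializes concurrent writers with a sidecar lock file so each worker appends whole JSON lines to the shared predictions file. A minimal standalone sketch of that pattern (file names and the example record are illustrative, not part of the patch; `filelock` is the same third-party package the patch already imports):

```python
# Lock-guarded JSONL append, as used by output_results in appmap/solve.py.
import json

from filelock import FileLock


def append_prediction(output_file, record):
    # Holding the lock keeps one worker's line from interleaving with another's.
    with FileLock(f"{output_file}.lock"):
        with open(output_file, "a+") as f:
            f.write(json.dumps(record) + "\n")


if __name__ == "__main__":
    append_prediction(
        "predictions.jsonl",
        {"instance_id": "example__example-1", "model_name_or_path": "navie"},
    )
```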
diff --git a/appmap/solve/__init__.py b/appmap/solve/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/appmap/solve/format_instructions.py b/appmap/solve/format_instructions.py new file mode 100644 index 00000000..d74055a3 --- /dev/null +++ b/appmap/solve/format_instructions.py @@ -0,0 +1,80 @@ +import textwrap + + +def format_instructions(): + return textwrap.dedent( + """ + For each change you want to make, generate a pair of tags called <original> and <modified>. + + Wrap these tags with a <change> tag that also includes a <file> tag with the file path. + + The <original> tag should contain the original code that you want to change. Do not abbreviate + existing code using ellipses or similar. + + Always include an attribute "no-ellipsis" with the value "true" in the <original> tag. + This should be a true statement about the tag. + + The <original> code should contain an attribute that indicates approximately how many lines of context + it contains. You should plan for this context to contain the code that should be modified, plus + three lines before and after it. + + Do not output the entire original code, or long functions, if you only want to change a part of it. + Plan to output only the part that you want to change. + + If you need to make multiple changes to the same file, output multiple <change> tags. + In the <change> tag, indicate the number of the change that this is, starting from 1. + + The <modified> tag should contain the modified code that you want to replace the original code with. + Do not abbreviate the modified code using ellipses or similar. You must place the exact modified code + in the <modified> tag. + + You do not need to output the entire modified code if you only want to change a part of it. Output + only the part that you want to change. + + Always include an attribute "no-ellipsis" with the value "true" in the <modified> tag. + This should be a true statement about the tag. + + Both the original code and the output code must contain the proper indentation and formatting. + For example, if the original code has 4 spaces of indentation, the output code must also have 4 + spaces of indentation. If the original code has 8 spaces of indentation, the output code must also have + 8 spaces of indentation. + + The <original> and <modified> content should be wrapped in a CDATA section to avoid XML parsing issues. + + ## Example output + + <change> + <file>src/main/java/org/springframework/samples/petclinic/vet/Vet.java</file> + <original no-ellipsis="true"><![CDATA[private Set<Specialty> specialties; + + protected Set<Specialty> getSpecialtiesInternal() { + if (this.specialties == null) { + this.specialties = new HashSet<>(); + } + return this.specialties; + }]]></original> + <modified no-ellipsis="true"><![CDATA[private Set<Specialty> specialties; + + private String address; + + protected Set<Specialty> getSpecialtiesInternal() { + if (this.specialties == null) { + this.specialties = new HashSet<>(); + } + return this.specialties; + }]]></modified> + </change> + """ + )
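The tag names in `format_instructions()` above (`<change>`, `<file>`, `<original>`, `<modified>`) are reconstructed from the surrounding prose, so treat the schema as an assumption rather than a confirmed contract. Under that assumption, a consumer of the format could parse one change block like this:

```python
# Sketch: parsing a single <change> block in the format described above.
# The <change>/<file>/<original>/<modified> schema is assumed, not confirmed.
import xml.etree.ElementTree as ET

EXAMPLE = """<change>
<file>src/example.py</file>
<original no-ellipsis="true"><![CDATA[def greet():
    print("hi")]]></original>
<modified no-ellipsis="true"><![CDATA[def greet(name):
    print(f"hi {name}")]]></modified>
</change>"""

change = ET.fromstring(EXAMPLE)
print(change.findtext("file"))      # -> src/example.py
print(change.findtext("original"))  # CDATA content is returned as plain text
print(change.findtext("modified"))
```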
diff --git a/appmap/solve/log.py b/appmap/solve/log.py new file mode 100644 index 00000000..a3189a61 --- /dev/null +++ b/appmap/solve/log.py @@ -0,0 +1,30 @@ +import os + +from filelock import FileLock + + +def log_command(dir, command): + command_lock_file = os.path.join(dir, "command.lock") + command_log_file = os.path.join(dir, "command.log") + + with FileLock(command_lock_file): + with open(command_log_file, "a+") as f: + f.write(command + "\n") + + +def log_lint(dir, file, lint_messages): + lint_lock_file = os.path.join(dir, "lint.lock") + lint_log_file = os.path.join(dir, "lint.log") + + with FileLock(lint_lock_file): + with open(lint_log_file, "a+") as f: + f.writelines("\n".join([file, "-" * len(file), lint_messages, "\n"])) + + +def log_diff(dir, file, diff): + diff_lock_file = os.path.join(dir, "diff.lock") + diff_log_file = os.path.join(dir, "diff.log") + + with FileLock(diff_lock_file): + with open(diff_log_file, "a+") as f: + f.writelines("\n".join([file, "-" * len(file), diff, "\n"])) diff --git a/appmap/solve/run_command.py b/appmap/solve/run_command.py new file mode 100644 index 00000000..56ec4b8f --- /dev/null +++ b/appmap/solve/run_command.py @@ -0,0 +1,13 @@ +import subprocess + +from .log import log_command + + +def run_command(log_dir, command, fail_on_error=True): + log_command(log_dir, command) + + result = subprocess.run(command, shell=True, capture_output=True) + if result.returncode != 0 and fail_on_error: + raise RuntimeError(f"Failed to execute command {command}") + + return result.stdout.decode() diff --git a/appmap/solve/run_navie_command.py b/appmap/solve/run_navie_command.py new file mode 100644 index 00000000..c1a7a677 --- /dev/null +++ b/appmap/solve/run_navie_command.py @@ -0,0 +1,45 @@ +import os + +from .log import log_command + + +def run_navie_command( + log_dir, + command, + output_path, + log_path, + context_path=None, + input_path=None, + additional_args=None, +): + """ + Execute the navie command with specified arguments. + + :param command: Command to execute (e.g., 'navie') + :param context_path: Path to the context file + :param input_path: Path to the input file + :param output_path: Path to the output file + :param log_path: Path to the log file + :param additional_args: Additional arguments for the command + :return: None + """ + # Build the command + cmd = f"{command} navie --log-navie" + # TODO: Add token limit option, e.g.
--ai-option tokenLimit=4000 + if input_path: + cmd += f" -i {input_path}" + if context_path: + cmd += f" -c {context_path}" + cmd += f" -o {output_path}" + if additional_args: + cmd += f" {additional_args}" + cmd += f" > {log_path} 2>&1" + + log_command(log_dir, cmd) + + result = os.system(cmd) + + if result != 0: + raise RuntimeError( + f"Failed to execute command {cmd}. See {log_path} for details." + ) diff --git a/appmap/solve/solver.py b/appmap/solve/solver.py new file mode 100644 index 00000000..7818a6ae --- /dev/null +++ b/appmap/solve/solver.py @@ -0,0 +1,217 @@ +import argparse +import json +import os +import sys + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.join(SCRIPT_DIR, "..", "..")) + +from appmap.solve.steps.step_lint_repair import step_lint_repair +from appmap.solve.steps.step_apply import step_apply +from appmap.solve.steps.step_generate import step_generate +from appmap.solve.steps.step_list import step_list +from appmap.solve.steps.step_plan import step_plan + +DEFAULT_STEPS = {"plan": True, "list": True, "generate": True, "apply": True} + + +class Solver: + def __init__( + self, + issue_file, + log_dir, + format_command=None, + lint_command=None, + lint_error_pattern=None, + appmap_command="appmap", + steps=None, + ): + self.issue_file = issue_file + self.log_dir = log_dir + self.format_command = format_command + self.lint_command = lint_command + self.lint_error_pattern = lint_error_pattern + self.appmap_command = appmap_command + self.steps = steps or DEFAULT_STEPS + + if self.lint_command and not self.steps["apply"]: + print("WARN: Lint command will not be executed without apply step.") + + if not os.path.isfile(self.issue_file): + raise FileNotFoundError(f"File '{self.issue_file}' not found.") + + self.work_dir = os.path.dirname(os.path.abspath(self.issue_file)) + + self.plan_file = os.path.join(self.work_dir, "plan.md") + self.solution_file = os.path.join(self.work_dir, "solution.md") + self.apply_file = os.path.join(self.work_dir, "apply.md") + self.files = [] + + def solve(self): + if self.steps["plan"]: + self.plan() + + if self.steps["list"]: + self.list_files() + + if self.steps["generate"]: + self.generate_code() + + self.base_file_content = {} + self.files_changed = [] + if self.steps["apply"]: + self.base_file_content = self.load_file_content() + + self.apply_changes() + + self.updated_file_content = self.load_file_content() + for file in self.updated_file_content: + if self.updated_file_content[file] != self.base_file_content.get(file): + self.files_changed.append(file) + + if self.lint_command: + if len(self.files_changed) > 0: + self.lint_repair() + else: + print( + "WARN: No changes were applied. Lint repair step will be skipped."
+ ) + + def plan(self): + step_plan( + self.log_dir, + self, + self.issue_file, + self.work_dir, + self.appmap_command, + self.plan_file, + ) + + def list_files(self): + step_list(self.log_dir, self.work_dir, self.appmap_command, self.plan_file) + with open(os.path.join(self.work_dir, "files.json")) as f: + self.files = json.load(f) + + def generate_code(self): + step_generate( + self.log_dir, + self, + self.work_dir, + self.appmap_command, + self.plan_file, + self.solution_file, + self.files, + ) + + def load_file_content(self): + result = {} + for file in self.files: + if os.path.isfile(file): + with open(file, "r") as f: + result[file] = f.read() + return result + + def apply_changes(self): + step_apply( + self.log_dir, + self.work_dir, + self.appmap_command, + self.solution_file, + self.apply_file, + ) + + def lint_repair(self): + step_lint_repair( + self.log_dir, + self, + self.work_dir, + self.appmap_command, + self.base_file_content, + ) + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Solve software issue described in a file." + ) + parser.add_argument( + "issue_file", type=str, help="File containing the issue description" + ) + + parser.add_argument( + "--retries", + type=int, + default=3, + help="Number of times to try and create a code update for each test instance", + ) + + parser.add_argument( + "--directory", + type=str, + help="Working directory of the project to modify", + default=None, + ) + parser.add_argument( + "--log-dir", type=str, help="Directory to store logs", default="logs" + ) + parser.add_argument( + "--format-command", type=str, help="Format command to use", default=None + ) + parser.add_argument( + "--lint-command", type=str, help="Lint command to use", default=None + ) + parser.add_argument( + "--lint-error-pattern", type=str, help="Lint error pattern to use", default=None + ) + parser.add_argument( + "--appmap-command", type=str, help="AppMap command to use", default="appmap" + ) + + parser.add_argument("--noplan", action="store_true", help="Do not generate a plan") + parser.add_argument( + "--nolist", action="store_true", help="Do not list files to be modified" + ) + parser.add_argument( + "--nogenerate", action="store_true", help="Do not generate code" + ) + parser.add_argument("--noapply", action="store_true", help="Do not apply changes") + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_arguments() + steps = { + "plan": not args.noplan, + "list": not args.nolist, + "generate": not args.nogenerate, + "apply": not args.noapply, + } + + if args.directory: + os.chdir(args.directory) + + if args.log_dir: + os.makedirs(args.log_dir, exist_ok=True) + + attempt_number = 0 + files_changed = [] + while len(files_changed) == 0 and attempt_number < args.retries: + solver = Solver( + issue_file=args.issue_file, + log_dir=args.log_dir, + format_command=args.format_command, + lint_command=args.lint_command, + lint_error_pattern=args.lint_error_pattern, + appmap_command=args.appmap_command, + steps=steps, + ) + solver.solve() + files_changed = solver.files_changed + if len(files_changed) == 0: + print("No files were changed.") + attempt_number += 1 + if attempt_number == args.retries: + print(f"Giving up after {attempt_number} attempts") + else: + print(f"Retrying (attempt number {attempt_number + 1} of {args.retries})") diff --git a/appmap/solve/steps/__init__.py b/appmap/solve/steps/__init__.py new file mode 100644 index 00000000..e69de29b
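The `__main__` block above keeps re-running the solver until a file changes or the retry budget is exhausted. The control flow, distilled into a self-contained sketch (`solve_once` is a stand-in for `Solver().solve()`, not a name from the patch):

```python
# Distilled retry loop from solver.py: re-attempt until something changes
# or the retry budget runs out.
def solve_with_retries(solve_once, retries=3):
    attempt_number = 0
    files_changed = []
    while len(files_changed) == 0 and attempt_number < retries:
        files_changed = solve_once()
        if len(files_changed) == 0:
            print("No files were changed.")
            attempt_number += 1
    if len(files_changed) == 0:
        print(f"Giving up after {attempt_number} attempts")
    return files_changed


if __name__ == "__main__":
    print(solve_with_retries(lambda: [], retries=2))  # exercises the give-up path
```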
diff --git a/appmap/solve/steps/step_apply.py b/appmap/solve/steps/step_apply.py new file mode 100644 index 00000000..5dc9b2c4 --- /dev/null +++ b/appmap/solve/steps/step_apply.py @@ -0,0 +1,23 @@ +from ..run_navie_command import run_navie_command + + +import os + + +def step_apply(log_dir, work_dir, appmap_command, solution_file, apply_file): + apply_prompt = os.path.join(work_dir, "apply.txt") + with open(apply_prompt, "w") as apply_f: + apply_f.write("@apply /all\n\n") + with open(solution_file, "r") as sol_f: + apply_f.write(sol_f.read()) + + print("Applying changes to source files") + run_navie_command( + log_dir, + command=appmap_command, + input_path=apply_prompt, + output_path=apply_file, + log_path=os.path.join(work_dir, "apply.log"), + ) + + print("Changes applied") diff --git a/appmap/solve/steps/step_generate.py b/appmap/solve/steps/step_generate.py new file mode 100644 index 00000000..76303516 --- /dev/null +++ b/appmap/solve/steps/step_generate.py @@ -0,0 +1,86 @@ +from ..run_command import run_command +from ..run_navie_command import run_navie_command +from ..format_instructions import format_instructions + + +import os +import sys + + +def step_generate( + log_dir, args, work_dir, appmap_command, plan_file, solution_file, files +): + context_file = os.path.join(work_dir, "context.txt") + with open(context_file, "w") as context_f: + for file in files: + context_f.write("<file>\n") + context_f.write(f"<path>{file}</path>\n") + context_f.write("<content>\n") + if os.path.isfile(file): + if args.format_command: + print(f"Auto-formatting file {file}") + format_command = args.format_command.split() + [file] + run_command(log_dir, " ".join(format_command)) + + with open(file, "r") as content_f: + file_content = content_f.read() + file_lines = file_content.split("\n") + any_line_starts_with_tabs = any( + line.startswith("\t") for line in file_lines + ) + if any_line_starts_with_tabs: + print( + f"Warning: File '{file}' contains tab-indented lines. Code generation is not likely to be reliable. Please replace indentation with spaces, or specify the --format-command option to have it done automatically.", + file=sys.stderr, + ) + + context_f.write(file_content) + else: + print( + f"Notice: File '{file}' does not exist. It will probably be created in the code generation step.", + file=sys.stderr, + ) + context_f.write("</content>\n") + context_f.write("</file>\n") + + generate_prompt = os.path.join(work_dir, "generate.txt") + with open(generate_prompt, "w") as generate_f: + generate_f.write( + f"""@generate /nocontext /noformat + +## Input format + +The plan is delineated by the <plan> XML tag. +The source files are delineated by <file> XML tags. Each file has a <path> tag with the file path and a <content> tag with the file content. +Do not treat the XML tags as part of the source code. They are only there to help you parse the context. + +## Guidelines + +Try to solve the problem with a minimal set of code changes. +Avoid refactorings that will affect multiple parts of the codebase. + +## Output format + +{format_instructions()} + +""" + ) + + generate_f.write("<plan>\n") + with open(plan_file, "r") as plan_content: + generate_f.write(plan_content.read()) + generate_f.write("</plan>\n") + with open(context_file, "r") as context_content: + generate_f.write(context_content.read()) + + print("Solving plan", plan_file, "using", generate_prompt) + + run_navie_command( + log_dir, + command=appmap_command, + input_path=generate_prompt, + output_path=solution_file, + log_path=os.path.join(work_dir, "generate.log"), + ) + + print(f"Code generated in {solution_file}")
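`step_generate` above wraps each source file in `<file>`/`<path>`/`<content>` tags when assembling the prompt context; those tag names are reconstructed here and should be read as assumptions. A standalone sketch of the same assembly:

```python
# Sketch of the context assembly in step_generate, using the assumed
# <file>/<path>/<content> tag names.
import os


def build_context(files):
    parts = []
    for file in files:
        parts.append("<file>")
        parts.append(f"<path>{file}</path>")
        parts.append("<content>")
        if os.path.isfile(file):
            with open(file, "r") as f:
                parts.append(f.read())
        parts.append("</content>")
        parts.append("</file>")
    return "\n".join(parts)


if __name__ == "__main__":
    print(build_context(["README.md"]))  # any existing file will do
```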
diff --git a/appmap/solve/steps/step_lint_repair.py b/appmap/solve/steps/step_lint_repair.py new file mode 100644 index 00000000..a2e9bf54 --- /dev/null +++ b/appmap/solve/steps/step_lint_repair.py @@ -0,0 +1,215 @@ +from ..log import log_diff, log_lint +from ..run_command import run_command +from ..run_navie_command import run_navie_command +from ..format_instructions import format_instructions + + +import os +import re +import subprocess + + +def step_lint_repair(log_dir, args, work_dir, appmap_command, base_file_content): + lint_command = args.lint_command + lint_error_pattern = args.lint_error_pattern + + print("Linting source files") + + work_dir_base_name = os.path.basename(work_dir) + + for file in base_file_content.keys(): + print(f"Linting {file}") + norm_file = file.replace("/", "_") + + lint_args = lint_command.split() + [file] + + lint_result = subprocess.run( + lint_args, + capture_output=True, + text=True, + ) + + lint_output = lint_result.stdout + lint_result.stderr + + log_lint(log_dir, os.path.join(work_dir_base_name, file), lint_output) + + # If lint_error_pattern starts and ends with '/', treat it as a regular expression. + # Otherwise, treat it as a string literal. + # + # Find all lint errors reported in the output. Then select just those errors that + # are reported on lines that we have modified. + lint_errors = [] + if lint_error_pattern: + if lint_error_pattern.startswith("/") and lint_error_pattern.endswith("/"): + lint_errors = re.findall(lint_error_pattern[1:-1], lint_output) + else: + lint_errors = [ + line for line in lint_output.split("\n") if lint_error_pattern in line + ] + else: + lint_errors = lint_output.split("\n") + + temp_dir = os.path.join(work_dir, "diff", norm_file) + os.makedirs(temp_dir, exist_ok=True) + # Write the base file content + with open(os.path.join(temp_dir, "base"), "w") as f: + f.write(base_file_content[file]) + with open(file, "r") as f: + with open(os.path.join(temp_dir, "updated"), "w") as f2: + f2.write(f.read()) + # Run the diff command + diff_command = f"diff -u {os.path.join(temp_dir, 'base')} {os.path.join(temp_dir, 'updated')}" + file_diff = run_command(log_dir, diff_command, fail_on_error=False) + + log_diff(log_dir, os.path.join(work_dir_base_name, file), file_diff) + + # Lint errors are formatted like this: + # bin/solve.py:257:80: E501 line too long (231 > 79 characters) + # Collect the line numbers of the lint errors. + lint_errors_by_line_number = {} + for error in lint_errors: + if error: + line_number = error.split(":")[1] + lint_errors_by_line_number[int(line_number)] = error + + # The file diff contains chunks like: + # @@ -147,15 +147,21 @@ + # Find the '+' number, which indicates the start line. Also find the number after the + # comma, which indicates the number of lines. Report these two numbers for each chunk.
+ diff_ranges = [ + [int(ch) for ch in chunk.split(" ")[2].split(",")] + for chunk in file_diff.split("\n") + if chunk.startswith("@@") + ] + + for diff_range in diff_ranges: + print( + f"The file has changes between lines {diff_range[0]} and {diff_range[0] + diff_range[1]}" + ) + + lint_error_line_numbers_within_diff_sections = [ + line_number + for line_number in lint_errors_by_line_number.keys() + for diff_range in diff_ranges + if diff_range[0] <= line_number <= diff_range[0] + diff_range[1] + ] + + if lint_error_line_numbers_within_diff_sections: + lint_errors = [ + lint_errors_by_line_number[line_number] + for line_number in lint_error_line_numbers_within_diff_sections + ] + + lint_error_message = "\n".join( + [ + "Lint errors within diff sections:", + *lint_errors, + ] + ) + + print(lint_error_message) + log_diff( + log_dir, os.path.join(work_dir_base_name, file), lint_error_message + ) + else: + print("There are no lint errors within diff sections") + log_diff( + log_dir, + os.path.join(work_dir_base_name, file), + "No lint errors within diff sections", + ) + + for line_number in lint_error_line_numbers_within_diff_sections: + lint_error = lint_errors_by_line_number[line_number] + print(f"Error reported on line {line_number}: {lint_error}") + + # Extract the chunk of code that contains the error + content_chunk_lines = [] + with open(file, "r") as f: + lines = f.readlines() + + range_min = max(0, line_number - 7) + range_max = min(len(lines), line_number + 7) + for chunk_line_number in range(range_min, range_max): + content_chunk_lines.append( + f"{chunk_line_number + 1}: {lines[chunk_line_number]}" + ) + + repair_dir = os.path.join(work_dir, "repair", norm_file, str(line_number)) + os.makedirs(repair_dir, exist_ok=True) + + repair_prompt, repair_output, repair_log = [ + os.path.join(repair_dir, f"generate.{ext}") + for ext in ["txt", "md", "log"] + ] + repair_apply_prompt, repair_apply_output, repair_apply_log = [ + os.path.join(repair_dir, f"apply.{ext}") for ext in ["txt", "md", "log"] + ] + + with open(repair_prompt, "w") as f: + f.write( + f"""@generate /nocontext /noformat + +Fix the linter errors indicated by the <lint-error> tag. + +## Output format + +{format_instructions()} + +In the <original> and <modified> tags, do not emit line numbers. The line numbers are +only present in the file/content to help you identify which line has the lint error. + +## Error report + +<lint-error> +""" + ) + f.write(lint_error) + f.write( + """ +</lint-error> +<file> +""" + ) + f.write(file) + f.write( + """ +</file> +<content> +""" + ) + f.write("".join(content_chunk_lines)) + f.write( + """ +</content> + +""" + ) + + # Generate the repair + print(f"Generating code to repair {file}") + run_navie_command( + log_dir, + command=appmap_command, + input_path=repair_prompt, + output_path=repair_output, + log_path=repair_log, + ) + + print(f"Code generated to repair source file in {repair_output}") + + with open(repair_apply_prompt, "w") as f: + f.write("@apply /all\n\n") + with open(repair_output, "r") as plan_fp: + f.write(plan_fp.read()) + + print("Applying changes to source files") + run_navie_command( + log_dir, + command=appmap_command, + input_path=repair_apply_prompt, + output_path=repair_apply_output, + log_path=repair_apply_log, + ) + + print("Changes applied")
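The range arithmetic above (parse the `@@` hunk headers, then keep only lint errors that land inside a changed range) is easy to check in isolation. A self-contained sketch with made-up inputs; note the regex also tolerates hunks that omit the line count, which the split-based parser above does not:

```python
# Intersect lint-error line numbers with the "+start,count" ranges parsed
# from unified-diff hunk headers. Inputs are made up for illustration.
import re

DIFF_HEADERS = ["@@ -10,6 +10,8 @@", "@@ -40,3 +42,4 @@"]
LINT_ERRORS = {
    11: "E501 line too long",
    30: "F401 unused import",
    43: "E302 expected 2 blank lines",
}

ranges = []
for header in DIFF_HEADERS:
    match = re.match(r"@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@", header)
    if match:
        start = int(match.group(1))
        count = int(match.group(2) or 1)
        ranges.append((start, start + count))

for number, message in sorted(LINT_ERRORS.items()):
    if any(low <= number <= high for low, high in ranges):
        print(f"line {number}: {message}")  # prints lines 11 and 43 only
```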
diff --git a/appmap/solve/steps/step_list.py b/appmap/solve/steps/step_list.py new file mode 100644 index 00000000..b3c2ce12 --- /dev/null +++ b/appmap/solve/steps/step_list.py @@ -0,0 +1,18 @@ +from ..run_navie_command import run_navie_command + + +import os + + +def step_list(log_dir, work_dir, appmap_command, plan_file): + print("Detecting files to be modified") + run_navie_command( + log_dir, + command=appmap_command, + context_path=plan_file, + output_path=os.path.join(work_dir, "files.json"), + log_path=os.path.join(work_dir, "list-files.log"), + additional_args="@list-files /format=json /nofence", + ) + + print(f"Files to be modified stored in {os.path.join(work_dir, 'files.json')}")
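`step_list` above asks Navie for a JSON list of files and writes it to files.json; `solver.list_files()` then loads it verbatim. A slightly defensive version of that consumer (the flat-list shape is what solver.py assumes, not something the patch enforces):

```python
# Defensive reader for the files.json produced by step_list: solver.py
# expects a flat JSON list of file paths.
import json
import os


def read_file_list(work_dir):
    files_path = os.path.join(work_dir, "files.json")
    with open(files_path) as f:
        files = json.load(f)
    if not isinstance(files, list):
        raise ValueError(f"Expected a JSON list in {files_path}")
    return files
```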
+ """ + ) + ) + + run_navie_command( + log_dir, + command=appmap_command, + context_path=issue_file, + input_path=plan_prompt, + output_path=plan_file, + log_path=os.path.join(work_dir, "plan.log"), + ) + + print(f"Plan stored in {plan_file}") + + # Load the plan file and strip code blocks that are delimited by ``` + with open(plan_file, "r") as f: + plan_content = f.read() + original_plan_content = plan_content + plan_content = re.sub(r"```.*?```", "", plan_content, flags=re.DOTALL) + # Diff the original and stripped content + if original_plan_content != plan_content: + with open(plan_file, "w") as f: + f.write(plan_content) diff --git a/submodules/appmap-js b/submodules/appmap-js new file mode 160000 index 00000000..dc69da4c --- /dev/null +++ b/submodules/appmap-js @@ -0,0 +1 @@ +Subproject commit dc69da4c418e0e58dc62af78d1491bc966f720b6 diff --git a/swebench/harness/context_manager.py b/swebench/harness/context_manager.py index e74211b8..b12fdb4c 100644 --- a/swebench/harness/context_manager.py +++ b/swebench/harness/context_manager.py @@ -305,7 +305,6 @@ def __enter__(self): self.path_conda = os.path.abspath(self.path_conda) path_activate = os.path.join(self.path_conda, "bin", "activate") exec_cmd = os.path.join(self.path_conda, "bin", "conda") - env_list = get_conda_env_names(exec_cmd) # Set up testbed (environment, github repo) for each repo for repo, version_to_setup_ref in self.setup_refs.items(): @@ -335,83 +334,102 @@ def __enter__(self): else: self.log.write(f"Repo for {repo_prefix} version {version} exists: {repo_path}; skipping") - # Skip if conda environment already exists - if env_name in env_list: - self.log.write(f"Environment {env_name} already exists; skipping") - continue + self.create_conda_env( + version, + path_activate, + exec_cmd, + version_to_setup_ref, + install, + env_name, + ) - # Get setup reference instance - setup_ref_instance = version_to_setup_ref[version] + return self - # Create conda environment according to install instructinos - pkgs = install["packages"] if "packages" in install else "" - if pkgs == "requirements.txt": - # Create environment - cmd = ( - f"{exec_cmd} create -n {env_name} python={install['python']} -y" + def create_conda_env( + self, version, path_activate, exec_cmd, version_to_setup_ref, install, env_name + ): + with FileLock(f"/tmp/conda-env-setup-{env_name}.lock"): + if env_name in get_conda_env_names(exec_cmd): + self.log.write(f"Environment {env_name} already exists; skipping") + + # Get setup reference instance + setup_ref_instance = version_to_setup_ref[version] + + # Create conda environment according to install instructinos + pkgs = install["packages"] if "packages" in install else "" + if pkgs == "requirements.txt": + # Create environment + cmd = f"{exec_cmd} create -n {env_name} python={install['python']} -y" + self.log.write(f"Creating environment {env_name}") + self.exec(cmd.split(" ")) + + # Install dependencies + path_to_reqs = get_requirements(setup_ref_instance, self.testbed) + cmd = f". 
{path_activate} {env_name} && echo 'activate successful' && pip install -r {path_to_reqs}" + self.log.write( + f"Installing dependencies for {env_name}; Command: {cmd}" + ) + self.exec(["bash", "-c", cmd]) + os.remove(path_to_reqs) + elif pkgs == "environment.yml": + if "no_use_env" in install and install["no_use_env"]: + # Create environment from yml + path_to_reqs = get_environment_yml( + setup_ref_instance, env_name, save_path=self.testbed ) + + # `conda create` based installation + cmd = f"{exec_cmd} create -c conda-forge -n {env_name} python={install['python']} -y" self.log.write(f"Creating environment {env_name}") self.exec(cmd.split(" ")) # Install dependencies - path_to_reqs = get_requirements(setup_ref_instance, self.testbed) - cmd = f". {path_activate} {env_name} && echo 'activate successful' && pip install -r {path_to_reqs}" - self.log.write(f"Installing dependencies for {env_name}; Command: {cmd}") - self.exec(['bash', '-c', cmd]) - os.remove(path_to_reqs) - elif pkgs == "environment.yml": - if "no_use_env" in install and install["no_use_env"]: - # Create environment from yml - path_to_reqs = get_environment_yml( - setup_ref_instance, env_name, - save_path=self.testbed - ) - - # `conda create` based installation - cmd = f"{exec_cmd} create -c conda-forge -n {env_name} python={install['python']} -y" - self.log.write(f"Creating environment {env_name}") - self.exec(cmd.split(" ")) - - # Install dependencies - cmd = f"{exec_cmd} env update -f {path_to_reqs}" - self.log.write(f"Installing dependencies for {env_name}; Command: {cmd}") - self.exec(cmd.split(" ")) - else: - # Create environment from yml - path_to_reqs = get_environment_yml( - setup_ref_instance, env_name, - save_path=self.testbed, - python_version=install["python"] - ) - - # `conda env create` based installation - cmd = f"{exec_cmd} env create --file {path_to_reqs}" - self.log.write(f"Creating environment {env_name}") - self.exec(cmd.split(" ")) - - # Remove environment.yml - os.remove(path_to_reqs) + cmd = f"{exec_cmd} env update -f {path_to_reqs}" + self.log.write( + f"Installing dependencies for {env_name}; Command: {cmd}" + ) + self.exec(cmd.split(" ")) else: - # Create environment + install dependencies - cmd = f"{exec_cmd} create -n {env_name} python={install['python']} {pkgs} -y" + # Create environment from yml + path_to_reqs = get_environment_yml( + setup_ref_instance, + env_name, + save_path=self.testbed, + python_version=install["python"], + ) + + # `conda env create` based installation + cmd = f"{exec_cmd} env create --file {path_to_reqs}" self.log.write(f"Creating environment {env_name}") self.exec(cmd.split(" ")) - arch = platform.machine() - arch_specific_packages = install.get("arch_specific_packages", {}).get(arch, "") - if arch_specific_packages: - cmd = f". {path_activate} {env_name} && conda install {arch_specific_packages} -y" - self.log.write(f"Installing arch-specific packages for {env_name}; Command: {cmd}") - self.exec(['bash', '-c', cmd]) - - # Install additional packages if specified - if "pip_packages" in install: - pip_packages = " ".join(install["pip_packages"]) - cmd = f". 
{path_activate} {env_name} && pip install {pip_packages}" - self.log.write(f"Installing pip packages for {env_name}; Command: {cmd}") - self.exec(['bash', '-c', cmd]) - - return self + # Remove environment.yml + os.remove(path_to_reqs) + else: + # Create environment + install dependencies + cmd = f"{exec_cmd} create -n {env_name} python={install['python']} {pkgs} -y" + self.log.write(f"Creating environment {env_name}") + self.exec(cmd.split(" ")) + + arch = platform.machine() + arch_specific_packages = install.get("arch_specific_packages", {}).get( + arch, "" + ) + if arch_specific_packages: + cmd = f". {path_activate} {env_name} && conda install {arch_specific_packages} -y" + self.log.write( + f"Installing arch-specific packages for {env_name}; Command: {cmd}" + ) + self.exec(["bash", "-c", cmd]) + + # Install additional packages if specified + if "pip_packages" in install: + pip_packages = " ".join(install["pip_packages"]) + cmd = f". {path_activate} {env_name} && pip install {pip_packages}" + self.log.write( + f"Installing pip packages for {env_name}; Command: {cmd}" + ) + self.exec(["bash", "-c", cmd]) def get_distributed_tasks(self) -> list: """ @@ -434,7 +452,7 @@ def get_distributed_tasks(self) -> list: "timeout": self.timeout, "venv": env_name, "version": version, - "verbose": self.verbose, + "verbose": self.verbose } distributed_tasks.append(task_set) return distributed_tasks diff --git a/swebench/harness/engine_validation.py b/swebench/harness/engine_validation.py index 4007b159..8ec9c147 100644 --- a/swebench/harness/engine_validation.py +++ b/swebench/harness/engine_validation.py @@ -95,7 +95,7 @@ def setup_testbed(data: dict): testbed=data_dict.testbed, temp_dir=data_dict.temp_dir, timeout=data_dict.timeout, - verbose=data_dict.verbose, + verbose=data_dict.verbose ) as tcm: distributed_task_list = tcm.get_distributed_tasks() for task_list in distributed_task_list: diff --git a/viewer/script.js b/viewer/script.js index af700e07..4f005450 100644 --- a/viewer/script.js +++ b/viewer/script.js @@ -28,9 +28,31 @@ function addPreField(k, data) { const h1 = document.createElement("h2"); h1.textContent = k; div.appendChild(h1); - const p = document.createElement("pre"); - p.textContent = stringify(data[k]); - div.appendChild(p); + + if (k === "navie_context") { + data[k].forEach((item) => { + const d = document.createElement("div"); + d.style.background = "#eee"; + d.style.padding = "1rem"; + const h4 = document.createElement("h4"); + h4.style.marginBottom = "0"; + h4.textContent = item.directory; + const subtitle = document.createElement("h5"); + subtitle.style.marginBottom = "0"; + subtitle.style.marginTop = "0"; + subtitle.textContent = item.type; + const pre = document.createElement("pre"); + pre.textContent = item.content.replaceAll("\\n", "\n"); + d.appendChild(h4); + d.appendChild(subtitle); + d.appendChild(pre); + div.appendChild(d); + }); + } else { + const p = document.createElement("pre"); + p.textContent = stringify(data[k]); + div.appendChild(p); + } dataContainer.appendChild(div); } } @@ -52,7 +74,11 @@ function addMdField(k, data) { function stringify(value) { switch (typeof value) { case "object": - return JSON.stringify(value, undefined, 2); + return JSON.stringify(value, undefined, 2).replaceAll( + "\\n", + ` +` + ); default: return String(value); } diff --git a/viewer/viewer.py b/viewer/viewer.py index b34727c3..4627a0d7 100644 --- a/viewer/viewer.py +++ b/viewer/viewer.py @@ -5,7 +5,7 @@ import os import sys -PORT = 8080 +PORT = 8081 data_path = 
os.path.abspath(sys.argv[1])
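A closing note on the context_manager.py change: moving environment creation into `create_conda_env` behind a `FileLock` lets parallel workers race to the lock instead of to conda itself; the first worker in creates the environment, and later ones observe that it exists and return. The pattern, reduced to stand-in callables (`env_exists` and `create_env` here are hypothetical substitutes for the conda queries the patch uses):

```python
# Distilled create-if-missing pattern from create_conda_env above.
from filelock import FileLock


def ensure_env(env_name, env_exists, create_env):
    # Only one process at a time may inspect and create this environment.
    with FileLock(f"/tmp/conda-env-setup-{env_name}.lock"):
        if env_exists(env_name):
            print(f"Environment {env_name} already exists; skipping")
            return
        create_env(env_name)
        print(f"Created environment {env_name}")


if __name__ == "__main__":
    created = set()
    ensure_env("demo", created.__contains__, created.add)
    ensure_env("demo", created.__contains__, created.add)  # second call skips
```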