From fbfc49dc7ff5512ce9ca0e9b6086be64077d56e6 Mon Sep 17 00:00:00 2001 From: Kevin Gilpin Date: Wed, 14 Aug 2024 12:38:10 -0400 Subject: [PATCH] fix: Anticipate a test may fail without error This has been observed in cases of timeout, for example. --- solver/solve/steps/run_test.py | 14 +++++--- solver/solve/steps/step_maketest.py | 55 +++++++++++++++++------------ solver/solve/steps/step_verify.py | 16 ++++----- 3 files changed, 48 insertions(+), 37 deletions(-) diff --git a/solver/solve/steps/run_test.py b/solver/solve/steps/run_test.py index 262c39d..3694880 100644 --- a/solver/solve/steps/run_test.py +++ b/solver/solve/steps/run_test.py @@ -1,11 +1,17 @@ import subprocess -from swebench.harness.constants import MAP_REPO_TO_TEST_FRAMEWORK +from typing import Optional -from ..run_command import run_command +from swebench.harness.constants import MAP_REPO_TO_TEST_FRAMEWORK from .test_files_to_modules import test_files_to_modules -def run_test(tcm, test_file, appmap=False, files_to_directives=True): +class TestResult: + def __init__(self, succeeded: bool, test_error: Optional[str]): + self.succeeded = succeeded + self.test_error = test_error + + +def run_test(tcm, test_file, appmap=False, files_to_directives=True) -> TestResult: print(f"[run_test] Running test {test_file}") instance = tcm.instance @@ -74,4 +80,4 @@ def run_test(tcm, test_file, appmap=False, files_to_directives=True): # Select log_lines after base_log_line_count test_error = "\n".join(log_lines[base_log_line_count:]) - return (succeeded, test_error) + return TestResult(succeeded, test_error) diff --git a/solver/solve/steps/step_maketest.py b/solver/solve/steps/step_maketest.py index 9f1cc12..94e7abe 100644 --- a/solver/solve/steps/step_maketest.py +++ b/solver/solve/steps/step_maketest.py @@ -239,19 +239,29 @@ def maketest( # instance_id = tcm.instance["instance_id"] # index_appmaps(instance_id, log_dir, appmap_command) - succeeded, test_error = run_test(tcm, test_file, appmap=False) + test_result = run_test(tcm, test_file, appmap=False) + + def revert_test_changes(): + with open(test_file, "w") as f: + f.write(original_test_content) # Verify that the test_error indicates that the issue is being reproduced fails_for_expected_reason = False - if succeeded: + if test_result.succeeded: print( f"[maketest] ({instance_id}) Test case {test_file} succeeded. This is unexpected!" ) + elif not test_result.test_error: + print( + f"[maketest] ({instance_id}) Test case {test_file} failed without any reported error (timeout?). Reverting test changes." + ) + revert_test_changes() else: print( f"[maketest] ({instance_id}) Test case {test_file} failed. This is expected. Let's see if it failed for the planned reason." ) + test_error = test_result.test_error if "ERROR" in test_error: error_lines = test_error.split("\n") # Find everything after the first line that includes "ERROR", "FAIL", or "activate successful" @@ -262,10 +272,10 @@ def maketest( ) test_error = "\n".join(error_lines[first_line_index_with_error:]) - whyfailed = Editor( - os.path.join(maketest_work_dir, "check"), log_dir=work_dir - ).ask( - f"""/nocontext + whyfailed = Editor( + os.path.join(maketest_work_dir, "check"), log_dir=work_dir + ).ask( + f"""/nocontext {test_error} @@ -275,8 +285,8 @@ def maketest( {issue_content} """, - context=[], - prompt="""## Task + context=[], + prompt="""## Task A test case has been created that is currently expected to fail due to a known issue. 
@@ -293,23 +303,22 @@ def maketest( - Emit "maybe" if the test error is possibly consistent with the described issue. - Emit "no" if the test error is NOT consistent with the described issue. """, - ) - - if whyfailed == "no": - print( - f"[maketest] ({instance_id}) Test case {test_file} DID NOT fail for the planned reason" - ) - print( - f"[maketest] ({instance_id}) Reverting test changes to {test_file} and trying again" - ) - with open(test_file, "w") as f: - f.write(original_test_content) - else: - fails_for_expected_reason = True - print( - f"[maketest] ({instance_id}) It looks like it failed for the planned reason" ) + if whyfailed == "no": + print( + f"[maketest] ({instance_id}) Test case {test_file} DID NOT fail for the planned reason" + ) + print( + f"[maketest] ({instance_id}) Reverting test changes to {test_file}." + ) + revert_test_changes() + else: + fails_for_expected_reason = True + print( + f"[maketest] ({instance_id}) It looks like it failed for the planned reason" + ) + if instance["repo"] == "django/django": test_directive = test_files_to_modules([test_file])[0] else: diff --git a/solver/solve/steps/step_verify.py b/solver/solve/steps/step_verify.py index 3b19a4d..338eeb8 100644 --- a/solver/solve/steps/step_verify.py +++ b/solver/solve/steps/step_verify.py @@ -135,11 +135,9 @@ def repair_test( print(f"[verify/repair] ({instance_id}) Retesting: {test_directive}") - (succeeded, test_output) = run_test( - task_manager, test_directive, files_to_directives=False - ) + test_result = run_test(task_manager, test_directive, files_to_directives=False) - if not succeeded: + if not test_result.succeeded: print(f"[verify/repair] ({instance_id}) Test failed: {test_directive}") print( f"[verify/repair] ({instance_id}) Review {task_manager.log_file} for more information" @@ -189,24 +187,22 @@ def step_verify( test_directives_repaired = [] for test_directive in test_directives: print(f"[verify] ({instance_id}) Running test: {test_directive}") - succeeded, test_output = run_test( - task_manager, test_directive, files_to_directives=False - ) + test_result = run_test(task_manager, test_directive, files_to_directives=False) - if not succeeded: + if not test_result.succeeded and test_result.test_error: repaired = repair_test( task_manager, verify_dir, work_dir, instance_id, test_directive, - test_output, + test_result.test_error, ) if repaired: test_directives_repaired.append(test_directive) succeeded = True - if succeeded: + if test_result.succeeded: test_directives_succeeded.append(test_directive) if test_directives_repaired:
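
A minimal, self-contained sketch of the calling pattern this patch introduces. The TestResult class is copied from run_test.py; the classify helper, the sample results, and the __main__ driver are hypothetical stand-ins for the branching done in step_maketest.py and step_verify.py, which now distinguish a pass, a failure with captured error output, and a failure that reports no error text at all (observed with timeouts).

from typing import Optional


class TestResult:
    # Mirrors the class added in run_test.py: a success flag plus the
    # captured test error text, which may be None/empty when the run
    # produced no output after the base log line count.
    def __init__(self, succeeded: bool, test_error: Optional[str]):
        self.succeeded = succeeded
        self.test_error = test_error


def classify(result: TestResult) -> str:
    # Hypothetical helper: labels the three outcomes that the patched
    # callers handle explicitly.
    if result.succeeded:
        return "passed"
    if not result.test_error:
        # Failed, but no error text was captured (e.g. a timeout).
        # step_maketest.py reverts its test changes in this case rather
        # than asking the Editor why the test failed.
        return "failed-no-error"
    # Failed with captured output; step_maketest.py inspects the error,
    # and step_verify.py attempts a repair.
    return "failed-with-error"


if __name__ == "__main__":
    # Hypothetical sample results, one per branch.
    samples = [
        TestResult(True, None),
        TestResult(False, "AssertionError: expected 1 == 2"),
        TestResult(False, None),  # e.g. the run timed out
    ]
    for sample in samples:
        print(classify(sample))

Returning a small result object instead of a (succeeded, test_error) tuple lets callers treat "failed without error" as its own case without every call site unpacking and re-checking both values.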