Skip to content

Commit

Permalink
fix: Anticipate a test may fail without error
Browse files Browse the repository at this point in the history
This has been observed in cases of timeout, for example.
  • Loading branch information
kgilpin committed Aug 14, 2024
1 parent 6f50366 commit fbfc49d
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 37 deletions.
14 changes: 10 additions & 4 deletions solver/solve/steps/run_test.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
import subprocess
from swebench.harness.constants import MAP_REPO_TO_TEST_FRAMEWORK
from typing import Optional

from ..run_command import run_command
from swebench.harness.constants import MAP_REPO_TO_TEST_FRAMEWORK
from .test_files_to_modules import test_files_to_modules


def run_test(tcm, test_file, appmap=False, files_to_directives=True):
class TestResult:
    """Outcome of a single test run.

    Replaces the previous ``(succeeded, test_error)`` tuple return of
    ``run_test`` so callers can distinguish a failure that produced error
    output from one that produced none (observed e.g. on timeout).
    """

    def __init__(self, succeeded: bool, test_error: Optional[str]):
        """Record the result of one test execution.

        :param succeeded: True when the test passed.
        :param test_error: captured error text from the test log; may be
            None or empty when the test failed without reporting any
            error (e.g. a timeout), which callers must check for.
        """
        # Whether the test passed.
        self.succeeded = succeeded
        # Error output, if any; falsy on a silent failure.
        self.test_error = test_error


def run_test(tcm, test_file, appmap=False, files_to_directives=True) -> TestResult:
print(f"[run_test] Running test {test_file}")

instance = tcm.instance
Expand Down Expand Up @@ -74,4 +80,4 @@ def run_test(tcm, test_file, appmap=False, files_to_directives=True):
# Select log_lines after base_log_line_count
test_error = "\n".join(log_lines[base_log_line_count:])

return (succeeded, test_error)
return TestResult(succeeded, test_error)
55 changes: 32 additions & 23 deletions solver/solve/steps/step_maketest.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,19 +239,29 @@ def maketest(
# instance_id = tcm.instance["instance_id"]
# index_appmaps(instance_id, log_dir, appmap_command)

succeeded, test_error = run_test(tcm, test_file, appmap=False)
test_result = run_test(tcm, test_file, appmap=False)

def revert_test_changes():
with open(test_file, "w") as f:
f.write(original_test_content)

# Verify that the test_error indicates that the issue is being reproduced
fails_for_expected_reason = False
if succeeded:
if test_result.succeeded:
print(
f"[maketest] ({instance_id}) Test case {test_file} succeeded. This is unexpected!"
)
elif not test_result.test_error:
print(
f"[maketest] ({instance_id}) Test case {test_file} failed without any reported error (timeout?). Reverting test changes."
)
revert_test_changes()
else:
print(
f"[maketest] ({instance_id}) Test case {test_file} failed. This is expected. Let's see if it failed for the planned reason."
)

test_error = test_result.test_error
if "ERROR" in test_error:
error_lines = test_error.split("\n")
# Find everything after the first line that includes "ERROR", "FAIL", or "activate successful"
Expand All @@ -262,10 +272,10 @@ def maketest(
)
test_error = "\n".join(error_lines[first_line_index_with_error:])

whyfailed = Editor(
os.path.join(maketest_work_dir, "check"), log_dir=work_dir
).ask(
f"""/nocontext
whyfailed = Editor(
os.path.join(maketest_work_dir, "check"), log_dir=work_dir
).ask(
f"""/nocontext
<error>
{test_error}
Expand All @@ -275,8 +285,8 @@ def maketest(
{issue_content}
</issue>
""",
context=[],
prompt="""## Task
context=[],
prompt="""## Task
A test case has been created that is currently expected to fail due to a known issue.
Expand All @@ -293,23 +303,22 @@ def maketest(
- Emit "maybe" if the test error is possibly consistent with the described issue.
- Emit "no" if the test error is NOT consistent with the described issue.
""",
)

if whyfailed == "no":
print(
f"[maketest] ({instance_id}) Test case {test_file} DID NOT fail for the planned reason"
)
print(
f"[maketest] ({instance_id}) Reverting test changes to {test_file} and trying again"
)
with open(test_file, "w") as f:
f.write(original_test_content)
else:
fails_for_expected_reason = True
print(
f"[maketest] ({instance_id}) It looks like it failed for the planned reason"
)

if whyfailed == "no":
print(
f"[maketest] ({instance_id}) Test case {test_file} DID NOT fail for the planned reason"
)
print(
f"[maketest] ({instance_id}) Reverting test changes to {test_file}."
)
revert_test_changes()
else:
fails_for_expected_reason = True
print(
f"[maketest] ({instance_id}) It looks like it failed for the planned reason"
)

if instance["repo"] == "django/django":
test_directive = test_files_to_modules([test_file])[0]
else:
Expand Down
16 changes: 6 additions & 10 deletions solver/solve/steps/step_verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,11 +135,9 @@ def repair_test(

print(f"[verify/repair] ({instance_id}) Retesting: {test_directive}")

(succeeded, test_output) = run_test(
task_manager, test_directive, files_to_directives=False
)
test_result = run_test(task_manager, test_directive, files_to_directives=False)

if not succeeded:
if not test_result.succeeded:
print(f"[verify/repair] ({instance_id}) Test failed: {test_directive}")
print(
f"[verify/repair] ({instance_id}) Review {task_manager.log_file} for more information"
Expand Down Expand Up @@ -189,24 +187,22 @@ def step_verify(
test_directives_repaired = []
for test_directive in test_directives:
print(f"[verify] ({instance_id}) Running test: {test_directive}")
succeeded, test_output = run_test(
task_manager, test_directive, files_to_directives=False
)
test_result = run_test(task_manager, test_directive, files_to_directives=False)

if not succeeded:
if not test_result.succeeded and test_result.test_error:
repaired = repair_test(
task_manager,
verify_dir,
work_dir,
instance_id,
test_directive,
test_output,
test_result.test_error,
)
if repaired:
test_directives_repaired.append(test_directive)
succeeded = True

if succeeded:
if test_result.succeeded:
test_directives_succeeded.append(test_directive)

if test_directives_repaired:
Expand Down

0 comments on commit fbfc49d

Please sign in to comment.