diff --git a/.circleci/config.yml b/.circleci/config.yml
index c3877b59ab8c21..b475fec394ce7a 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -657,8 +657,6 @@ jobs:
       TEST_CONFIG: << parameters.test-config >>
       SHARD_NUMBER: << parameters.shard-number >>
       NUM_TEST_SHARDS: << parameters.num-test-shards >>
-      PYTORCH_RETRY_TEST_CASES: 1
-      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
     steps:
       - checkout
       - attach_workspace:
diff --git a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml
index 695507840c445d..048766beea2134 100644
--- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml
+++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml
@@ -211,8 +211,6 @@
       TEST_CONFIG: << parameters.test-config >>
       SHARD_NUMBER: << parameters.shard-number >>
       NUM_TEST_SHARDS: << parameters.num-test-shards >>
-      PYTORCH_RETRY_TEST_CASES: 1
-      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
     steps:
       - checkout
       - attach_workspace:
diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml
index 18302103386f95..d7fbdc8b1ded8c 100644
--- a/.github/workflows/_bazel-build-test.yml
+++ b/.github/workflows/_bazel-build-test.yml
@@ -122,8 +122,6 @@ jobs:
           GITHUB_RUN_NUMBER: ${{ github.run_number }}
           GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
           JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
-          PYTORCH_RETRY_TEST_CASES: 1
-          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
           REENABLED_ISSUES: ${{ needs.filter.outputs.reenabled-issues }}
           # TODO duplicated
           AWS_DEFAULT_REGION: us-east-1
@@ -160,8 +158,6 @@
             -e TORCH_CUDA_ARCH_LIST \
             -e OUR_GITHUB_JOB_ID \
             -e CUDA_VERSION \
-            -e PYTORCH_RETRY_TEST_CASES \
-            -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index 4bd0e38a0f750e..29df91320e5c56 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -164,8 +164,6 @@ jobs:
           BRANCH: ${{ steps.parse-ref.outputs.branch }}
           SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
-          PYTORCH_RETRY_TEST_CASES: 1
-          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
           TEST_CONFIG: ${{ matrix.config }}
           SHARD_NUMBER: ${{ matrix.shard }}
           NUM_TEST_SHARDS: ${{ matrix.num_shards }}
@@ -219,8 +217,6 @@
             -e NUM_TEST_SHARDS \
             -e REENABLED_ISSUES \
             -e CONTINUE_THROUGH_ERROR \
-            -e PYTORCH_RETRY_TEST_CASES \
-            -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
             -e PR_LABELS \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml
index 786af9101f8d87..67763e1b24576e 100644
--- a/.github/workflows/_mac-test-mps.yml
+++ b/.github/workflows/_mac-test-mps.yml
@@ -96,8 +96,6 @@ jobs:
           ENV_NAME: conda-test-env-${{ github.run_id }}
           PY_VERS: 3.9
           PR_BODY: ${{ github.event.pull_request.body }}
-          PYTORCH_RETRY_TEST_CASES: 1
-          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
           CONTINUE_THROUGH_ERROR: ${{ needs.filter.outputs.keep-going }}
           PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt
           REENABLED_ISSUES: ${{ needs.filter.outputs.reenabled-issues }}
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index cbba149c259f81..377cdd2a313c1f 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -57,8 +57,6 @@ jobs:
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       PR_BODY: ${{ github.event.pull_request.body }}
-      PYTORCH_RETRY_TEST_CASES: 1
-      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
     steps:
       - name: Clean up leftover processes on MacOS pet runner
        continue-on-error: true
diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml
index 9d25e18167e4bc..ac3a61d37b2535 100644
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@@ -131,8 +131,6 @@ jobs:
           JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
           BRANCH: ${{ steps.parse-ref.outputs.branch }}
           SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          PYTORCH_RETRY_TEST_CASES: 1
-          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
           CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
           TEST_CONFIG: ${{ matrix.config }}
           SHARD_NUMBER: ${{ matrix.shard }}
@@ -180,8 +178,6 @@
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e REENABLED_ISSUES \
-            -e PYTORCH_RETRY_TEST_CASES \
-            -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
             -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
index fa1da923066a14..ebfe4211b34c77 100644
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@@ -139,8 +139,6 @@ jobs:
           USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
           INSTALL_WINDOWS_SDK: 1
           PYTHON_VERSION: 3.8
-          PYTORCH_RETRY_TEST_CASES: 1
-          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
           CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
           VC_PRODUCT: "BuildTools"
           VC_VERSION: ""
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 1bf3b03355b684..a7025b5d95b338 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -192,10 +192,6 @@ def repro_env_var_prefix() -> str:
 TestEnvironment.def_flag("IS_REMOTE_GPU", env_var="PYTORCH_TEST_REMOTE_GPU",
                          include_in_repro=False)
-TestEnvironment.def_flag("RETRY_TEST_CASES", env_var="PYTORCH_RETRY_TEST_CASES",
-                         include_in_repro=False)
-TestEnvironment.def_flag("OVERRIDE_FLAKY_SIGNAL", env_var="PYTORCH_OVERRIDE_FLAKY_SIGNAL",
-                         include_in_repro=False)
 TestEnvironment.def_flag(
     "DISABLE_RUNNING_SCRIPT_CHK",
     env_var="PYTORCH_DISABLE_RUNNING_SCRIPT_CHK",
@@ -793,8 +789,6 @@ def run_unittest_help(argv):
     GRAPH_EXECUTOR = cppProfilingFlagsToProfilingMode()

 RERUN_DISABLED_TESTS = args.rerun_disabled_tests
-# Rerun disabled tests many more times to make sure that they are not flaky anymore
-MAX_NUM_RETRIES = 3 if not RERUN_DISABLED_TESTS else 50

 SLOW_TESTS_FILE = args.import_slow_tests
 DISABLED_TESTS_FILE = args.import_disabled_tests
@@ -2668,63 +2662,8 @@ def wrapper(self, *args, **kwargs):
     def wrap_with_cuda_memory_check(self, method):
         return self.wrap_method_with_policy(method, self.assertLeaksNoCudaTensors)

-    # Recursive function that incorporates retry logic when PYTORCH_RETRY_TEST_CASES=1 and enables early test
-    # termination. [DISCLAIMER: ONLY WORKS WITH UNITTEST]
-    # When report_only is True, flaky tests are only reported, but the signal remains the same (the test will still
-    # show up red).
-    # Otherwise, the flaky test will show up green while its stats are captured by test reports.
-    def _run_with_retry(self, result=None, num_runs_left=0, report_only=True, num_red=0, num_green=0):
+    def _run_custom(self, result=None):
         using_unittest = isinstance(result, unittest.TestResult)
-        if num_runs_left == 0:
-            # The logic when RERUN_DISABLED_TESTS is set to true is as follows:
-            # |-if the disabled test passes:
-            # |-- if it's flaky:
-            # |--- Do nothing because it's still flaky
-            # |-- elif it isn't flaky anymore:
-            # |--- Close the disabled ticket (later)
-            # |
-            # |- elif the disabled test fails after n retries:
-            # |-- This is expected, report this but don't fail the job
-            skipped_msg = {
-                "num_red": num_red,
-                "num_green": num_green,
-                "max_num_retries": MAX_NUM_RETRIES,
-                "rerun_disabled_test": RERUN_DISABLED_TESTS,
-            }
-
-            traceback_str = ""
-            if RERUN_DISABLED_TESTS and using_unittest:
-                # Hide all failures and errors when RERUN_DISABLED_TESTS is enabled. This is
-                # a verification check, we don't want more red signals coming from it
-                if result.failures:
-                    _, traceback_str = result.failures.pop(-1)
-                if result.errors:
-                    _, traceback_str = result.errors.pop(-1)
-
-                if traceback_str:
-                    skipped_msg["traceback_str"] = traceback_str
-
-                if num_green == 0:
-                    # The disabled test fails, report as skipped but don't fail the job
-                    result.addSkip(self, json.dumps(skipped_msg))
-
-                if num_red == 0:
-                    # The test passes after re-running multiple times. This acts as a signal
-                    # to confirm that it's not flaky anymore
-                    result.addSuccess(self)
-
-            if num_green > 0 and num_red > 0 and using_unittest:
-                skipped_msg["flaky"] = True
-                # Still flaky, do nothing
-                result.addSkip(self, json.dumps(skipped_msg))
-
-            return
-
-        if using_unittest:
-            # Keep track of the number of tests marked as failures, errors, and skipped before starting
-            failures_before = 0 if result is None else len(result.failures)
-            errors_before = 0 if result is None else len(result.errors)
-            skipped_before = 0 if result is None else len(result.skipped)

         super_run = super().run
         test_cls = super_run.__self__
@@ -2759,8 +2698,8 @@ def _run_with_retry(self, result=None, num_runs_left=0, report_only=True, num_re
             if strict_mode:
                 torch._dynamo.reset()

-        # Early terminate test if necessary.
-        if self._should_stop_test_suite():
+        # Early terminate test if necessary. If using pytest, use the -x flag instead
+        if using_unittest and self._should_stop_test_suite():
             if result.wasSuccessful():
                 case = TestCase()
                 if TEST_SAVE_XML is not None:
@@ -2777,63 +2716,14 @@ def _run_with_retry(self, result=None, num_runs_left=0, report_only=True, num_re
             assert result.wasSuccessful() is False
             result.stop()

-        if not RETRY_TEST_CASES or not using_unittest:
-            return
-
-        err = sys.exc_info()
-        num_retries_left = num_runs_left - 1
-        if failures_before < len(result.failures):
-            print(f" {self._testMethodName} failed - num_retries_left: {num_retries_left}")
-            if (report_only and num_retries_left < MAX_NUM_RETRIES) or (not report_only and num_retries_left > 0):
-                _, traceback_str = result.failures.pop(-1)
-                print(traceback_str)
-                result.addExpectedFailure(self, err)
-            self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
-                                 num_red=num_red + 1, num_green=num_green)
-        elif errors_before < len(result.errors):
-            print(f" {self._testMethodName} errored - num_retries_left: {num_retries_left}")
-            if (report_only and num_retries_left < MAX_NUM_RETRIES) or (not report_only and num_retries_left > 0):
-                _, traceback_str = result.errors.pop(-1)
-                print(traceback_str)
-                result.addExpectedFailure(self, err)
-            self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
-                                 num_red=num_red + 1, num_green=num_green)
-        elif RERUN_DISABLED_TESTS and num_retries_left <= MAX_NUM_RETRIES and skipped_before == len(result.skipped):
-            # Always re-run up to MAX_NUM_RETRIES when running under rerun disabled tests modes if the test successes.
-            # The parameter num_retries_left can be equal to MAX_NUM_RETRIES here because num_runs_left is initially
-            # set to MAX_NUM_RETRIES + 1, i.e. the first run successes
-            #
-            # Also if the result is skipped, this is due to check_if_enable skipping non-disabled tests, thus we
-            # want to ignore them, not retrying and skipping multiple times
-            print(f" {self._testMethodName} succeeded - num_retries_left: {num_retries_left}")
-            result.addSuccess(self)
-            self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
-                                 num_red=num_red, num_green=num_green + 1)
-        elif report_only and num_retries_left < MAX_NUM_RETRIES:
-            # The original logic here is that num_retries_left must be smaller than MAX_NUM_RETRIES indicating
-            # that at least one retry has been spent
-            print(f" {self._testMethodName} succeeded - num_retries_left: {num_retries_left}")
-            result.addUnexpectedSuccess(self)
-            self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
-                                 num_red=num_red, num_green=num_green + 1)
-        elif not report_only and num_retries_left < MAX_NUM_RETRIES:
-            # in this case, our test was rerun (as a retry has been used) and it just passed.
-            # we incur one more recursive call with num_runs_left = 0 to allow for accurate flaky reporting
-            self._run_with_retry(result=result, num_runs_left=0, report_only=report_only,
-                                 num_red=num_red, num_green=num_green + 1)

     def run(self, result=None):
         with contextlib.ExitStack() as stack:
             if TEST_WITH_CROSSREF:
                 stack.enter_context(CrossRefMode())
-            num_runs = MAX_NUM_RETRIES + 1 if RETRY_TEST_CASES else 1
-            self._run_with_retry(
+            self._run_custom(
                 result=result,
-                num_runs_left=num_runs,
-                report_only=not OVERRIDE_FLAKY_SIGNAL,
-                num_red=0,
-                num_green=0)
+            )

     def setUp(self):
         check_if_enable(self)
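Note: with the in-framework retry logic removed, per-test reruns and early termination are expected to come from the test runner itself, as hinted by the new "use the -x flag" comment above. A minimal sketch of an equivalent local invocation, assuming pytest with the pytest-rerunfailures plugin installed (the test file and flag values below are illustrative and not part of this change):

    # stop the suite on the first failure (-x) and rerun failing tests up to twice (--reruns)
    python -m pytest test/test_torch.py -x --reruns=2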