Mark expected failures under crosshair

HypothesisWorks · Oct 10, 2024 · 049dc2f · 049dc2f
1 parent e455d5d
commit 049dc2f
Show file tree

Hide file tree

Showing 33 changed files with 163 additions and 36 deletions.
diff --git a/hypothesis-python/tests/common/utils.py b/hypothesis-python/tests/common/utils.py
@@ -9,6 +9,7 @@
 # obtain one at https://mozilla.org/MPL/2.0/.
 
 import contextlib
+import enum
 import sys
 import warnings
 from io import StringIO
@@ -249,3 +250,36 @@ def capture_observations():
 # config option, so *linking against* something built this way can break us.
 # Everything is terrible
 PYTHON_FTZ = next_down(sys.float_info.min) == 0.0
+
+
+class Why(enum.Enum):  # reason a test is expected to fail under crosshair
+    # Use an enum (not bare strings) so it's easy to find and/or exclude cases later
+
+    # Failure categories we want to fix:
+    flaky_replay = "Inconsistent results from replaying a failing test..."
+    symbolic_outside_context = "CrosshairInternal error (using value outside context)"
+    floats = "crosshair doesn't reason about signed zero (and other edge cases?)"
+    no_unsatisfiable = "doesn't raise Unsatisfiable for some reason"
+
+    # Failure categories that are basically fine to leave alone:
+
+    # nested_given: https://github.com/pschanely/hypothesis-crosshair/issues/11
+    nested_given = "nested @given decorators don't work with crosshair"
+    other = "reasons not elsewhere categorized"  # catch-all
+
+
+def xfail_on_crosshair(why: Why, /, *, strict=True, as_marks=False):  # xfail under crosshair
+    try:  # pytest is optional here; without it, decorating is a no-op
+        import pytest
+    except ImportError:
+        return lambda fn: fn  # no pytest: leave the test function untouched
+
+    current_backend = settings.get_profile(settings._current_profile).backend
+    kw = {  # keyword arguments for pytest.mark.xfail
+        "strict": strict,  # if true, an unexpected pass is reported as a failure
+        "reason": f"Expected failure due to: {why.value}",
+        "condition": current_backend == "crosshair",  # only xfail on crosshair
+    }
+    if as_marks:  # for use with pytest.param(..., marks=xfail_on_crosshair())
+        return (pytest.mark.xf_crosshair, pytest.mark.xfail(**kw))
+    return lambda fn: pytest.mark.xf_crosshair(pytest.mark.xfail(**kw)(fn))
diff --git a/hypothesis-python/tests/conftest.py b/hypothesis-python/tests/conftest.py
@@ -44,6 +44,7 @@ def pytest_configure(config):
         "markers",
         "xp_min_version(api_version): run when greater or equal to api_version",
     )
+    config.addinivalue_line("markers", "xf_crosshair: selection for xfailing symbolics")
 
 
 def pytest_addoption(parser):

diff --git a/hypothesis-python/tests/cover/test_cache_implementation.py b/hypothesis-python/tests/cover/test_cache_implementation.py
@@ -26,7 +26,7 @@
 from hypothesis.errors import InvalidArgument
 from hypothesis.internal.cache import GenericCache, LRUCache, LRUReusedCache
 
-from tests.common.utils import skipif_emscripten
+from tests.common.utils import Why, skipif_emscripten, xfail_on_crosshair
 
 
 class LRUCacheAlternative(GenericCache):
@@ -116,6 +116,7 @@ def test_behaves_like_a_dict_with_losses(implementation, writes, size):
         assert len(target) <= min(len(model), size)
 
 
+@xfail_on_crosshair(Why.symbolic_outside_context)
 @settings(
     suppress_health_check={HealthCheck.too_slow}
     | set(settings.get_profile(settings._current_profile).suppress_health_check),

diff --git a/hypothesis-python/tests/cover/test_core.py b/hypothesis-python/tests/cover/test_core.py
@@ -17,7 +17,10 @@
 from hypothesis.database import InMemoryExampleDatabase
 from hypothesis.errors import InvalidArgument, NoSuchExample, Unsatisfiable
 
+from tests.common.utils import Why, xfail_on_crosshair
 
+
+@xfail_on_crosshair(Why.other)
 def test_stops_after_max_examples_if_satisfying():
     tracker = []
 
@@ -33,6 +36,7 @@ def track(x):
     assert len(tracker) == max_examples
 
 
+@xfail_on_crosshair(Why.symbolic_outside_context)
 def test_stops_after_ten_times_max_examples_if_not_satisfying():
     count = [0]
 

diff --git a/hypothesis-python/tests/cover/test_filter_rewriting.py b/hypothesis-python/tests/cover/test_filter_rewriting.py
@@ -31,7 +31,7 @@
 from hypothesis.strategies._internal.strings import BytesStrategy, TextStrategy
 
 from tests.common.debug import check_can_generate_examples
-from tests.common.utils import fails_with
+from tests.common.utils import Why, fails_with, xfail_on_crosshair
 
 A_FEW = 15  # speed up massively-parametrized tests
 
@@ -374,9 +374,11 @@ def test_isidentifier_filter_properly_rewritten(al, data):
     assert example.isidentifier()
 
 
-@pytest.mark.parametrize("al", ["¥¦§©"])
-def test_isidentifer_filter_unsatisfiable(al):
-    fs = st.text(alphabet=al).filter(str.isidentifier)
+@xfail_on_crosshair(Why.no_unsatisfiable)  # maybe a bug?
+def test_isidentifer_filter_unsatisfiable():
+    alphabet = "¥¦§©"  # none of these can appear in a Python identifier
+    assert not any(f"_{c}".isidentifier() for c in alphabet)  # precondition
+    fs = st.text(alphabet=alphabet).filter(str.isidentifier)
     with pytest.raises(Unsatisfiable):
         check_can_generate_examples(fs)
 
@@ -571,6 +573,7 @@ def test_filter_rewriting_lambda_len_unique_elements(
     assert predicate(value)
 
 
+@xfail_on_crosshair(Why.no_unsatisfiable)
 @pytest.mark.parametrize(
     "predicate",
     [

diff --git a/hypothesis-python/tests/cover/test_find.py b/hypothesis-python/tests/cover/test_find.py
@@ -12,7 +12,10 @@
 
 from hypothesis import Phase, find, settings, strategies as st
 
+from tests.common.utils import Why, xfail_on_crosshair
 
+
+@xfail_on_crosshair(Why.symbolic_outside_context)
 def test_find_uses_provided_random():
     prev = None
 

diff --git a/hypothesis-python/tests/cover/test_flakiness.py b/hypothesis-python/tests/cover/test_flakiness.py
@@ -17,13 +17,14 @@
 from hypothesis.internal.scrutineer import Tracer
 from hypothesis.strategies import booleans, composite, integers, lists, random_module
 
-from tests.common.utils import no_shrink
+from tests.common.utils import Why, no_shrink, xfail_on_crosshair
 
 
 class Nope(Exception):
     pass
 
 
+@xfail_on_crosshair(Why.symbolic_outside_context)
 def test_fails_only_once_is_flaky():
     first_call = [True]
 
@@ -40,6 +41,7 @@ def rude(x):
     assert isinstance(exceptions[0], Nope)
 
 
+@xfail_on_crosshair(Why.symbolic_outside_context)
 def test_gives_flaky_error_if_assumption_is_flaky():
     seen = set()
 
@@ -78,6 +80,7 @@ def test(x):
     assert isinstance(exceptions[0], ZeroDivisionError)
 
 
+@xfail_on_crosshair(Why.symbolic_outside_context)
 def test_does_not_attempt_to_shrink_flaky_errors():
     values = []
 
@@ -109,6 +112,7 @@ def single_bool_lists(draw):
     return result
 
 
+@xfail_on_crosshair(Why.nested_given)
 @example([True, False, False, False], [3], None)
 @example([False, True, False, False], [3], None)
 @example([False, False, True, False], [3], None)

diff --git a/hypothesis-python/tests/cover/test_float_nastiness.py b/hypothesis-python/tests/cover/test_float_nastiness.py
@@ -26,6 +26,7 @@
 )
 
 from tests.common.debug import find_any, minimal
+from tests.common.utils import Why, xfail_on_crosshair
 
 try:
     import numpy
@@ -66,6 +67,7 @@ def test_does_not_generate_negative_if_right_boundary_is_positive(x):
     assert math.copysign(1, x) == 1
 
 
+@xfail_on_crosshair(Why.floats)
 @given(st.floats(-1.0, -0.0))
 def test_does_not_generate_positive_if_right_boundary_is_negative(x):
     assert math.copysign(1, x) == -1
@@ -76,6 +78,7 @@ def test_half_bounded_generates_zero():
     find_any(st.floats(max_value=1.0), lambda x: x == 0.0)
 
 
+@xfail_on_crosshair(Why.floats)
 @given(st.floats(max_value=-0.0))
 def test_half_bounded_respects_sign_of_upper_bound(x):
     assert math.copysign(1, x) == -1

diff --git a/hypothesis-python/tests/cover/test_given_error_conditions.py b/hypothesis-python/tests/cover/test_given_error_conditions.py
@@ -14,16 +14,14 @@
 from hypothesis.errors import InvalidArgument, Unsatisfiable
 from hypothesis.strategies import booleans, integers, nothing
 
-from tests.common.utils import fails_with
+from tests.common.utils import Why, fails_with, xfail_on_crosshair
 
 
-def test_raises_unsatisfiable_if_all_false_in_finite_set():
-    @given(booleans())
-    def test_assume_false(x):
-        reject()
-
-    with pytest.raises(Unsatisfiable):
-        test_assume_false()
+@xfail_on_crosshair(Why.no_unsatisfiable)
+@fails_with(Unsatisfiable)  # expect Unsatisfiable to be raised
+@given(booleans())
+def test_raises_unsatisfiable_if_all_false_in_finite_set(x):
+    reject()  # every example is rejected, so no input can satisfy the test
 
 
 def test_does_not_raise_unsatisfiable_if_some_false_in_finite_set():

diff --git a/hypothesis-python/tests/cover/test_interactive_example.py b/hypothesis-python/tests/cover/test_interactive_example.py
@@ -24,7 +24,7 @@
 from hypothesis.internal.compat import WINDOWS
 
 from tests.common.debug import find_any
-from tests.common.utils import fails_with, skipif_emscripten
+from tests.common.utils import Why, fails_with, skipif_emscripten, xfail_on_crosshair
 
 pytest_plugins = "pytester"
 
@@ -45,6 +45,7 @@ def test_exception_in_compare_can_still_have_example():
     st.one_of(st.none().map(lambda n: Decimal("snan")), st.just(Decimal(0))).example()
 
 
+@xfail_on_crosshair(Why.symbolic_outside_context)
 def test_does_not_always_give_the_same_example():
     s = st.integers()
     assert len({s.example() for _ in range(100)}) >= 10

diff --git a/hypothesis-python/tests/cover/test_lookup.py b/hypothesis-python/tests/cover/test_lookup.py
@@ -52,7 +52,7 @@
     find_any,
     minimal,
 )
-from tests.common.utils import fails_with, temp_registered
+from tests.common.utils import Why, fails_with, temp_registered, xfail_on_crosshair
 
 sentinel = object()
 BUILTIN_TYPES = tuple(

diff --git a/hypothesis-python/tests/cover/test_reproduce_failure.py b/hypothesis-python/tests/cover/test_reproduce_failure.py
@@ -27,7 +27,7 @@
 from hypothesis.core import decode_failure, encode_failure
 from hypothesis.errors import DidNotReproduce, InvalidArgument, UnsatisfiedAssumption
 
-from tests.common.utils import capture_out, no_shrink
+from tests.common.utils import Why, capture_out, no_shrink, xfail_on_crosshair
 
 
 @example(bytes(20))  # shorter compressed
@@ -118,6 +118,7 @@ def test(x):
         test()
 
 
+@xfail_on_crosshair(Why.symbolic_outside_context)
 def test_prints_reproduction_if_requested():
     failing_example = [None]
 

diff --git a/hypothesis-python/tests/cover/test_sampled_from.py b/hypothesis-python/tests/cover/test_sampled_from.py
@@ -34,7 +34,7 @@
     assert_simple_property,
     check_can_generate_examples,
 )
-from tests.common.utils import fails_with
+from tests.common.utils import Why, fails_with, xfail_on_crosshair
 
 an_enum = enum.Enum("A", "a b c")
 a_flag = enum.Flag("A", "a b c")
@@ -69,6 +69,7 @@ def test_unsat_filtered_sampling(x):
     raise AssertionError
 
 
+@xfail_on_crosshair(Why.no_unsatisfiable)
 @fails_with(Unsatisfiable)
 @settings(suppress_health_check=[])
 @given(sampled_from(range(2)).filter(lambda x: x < 0))
@@ -144,12 +145,14 @@ def test_efficient_sets_of_samples_with_chained_transformations_slow_path(x):
     assert x == {x * 2 for x in range(20) if x % 3}
 
 
+@xfail_on_crosshair(Why.no_unsatisfiable)
 @fails_with(Unsatisfiable)
 @given(FilteredStrategy(st.sampled_from([None, False, ""]), conditions=(bool,)))
 def test_unsatisfiable_explicit_filteredstrategy_sampled(x):
     raise AssertionError("Unreachable because there are no valid examples")
 
 
+@xfail_on_crosshair(Why.no_unsatisfiable)
 @fails_with(Unsatisfiable)
 @given(FilteredStrategy(st.none(), conditions=(bool,)))
 def test_unsatisfiable_explicit_filteredstrategy_just(x):

diff --git a/hypothesis-python/tests/cover/test_searchstrategy.py b/hypothesis-python/tests/cover/test_searchstrategy.py
@@ -25,7 +25,7 @@
 from hypothesis.strategies._internal.utils import to_jsonable
 
 from tests.common.debug import assert_simple_property, check_can_generate_examples
-from tests.common.utils import checks_deprecated_behaviour
+from tests.common.utils import Why, checks_deprecated_behaviour, xfail_on_crosshair
 
 
 def test_or_errors_when_given_non_strategy():
@@ -69,6 +69,7 @@ def test_can_map():
     assert_simple_property(s, lambda v: v == "foo")
 
 
+@xfail_on_crosshair(Why.no_unsatisfiable)
 def test_example_raises_unsatisfiable_when_too_filtered():
     with pytest.raises(Unsatisfiable):
         check_can_generate_examples(integers().filter(lambda x: False))

diff --git a/hypothesis-python/tests/cover/test_settings.py b/hypothesis-python/tests/cover/test_settings.py
@@ -35,11 +35,13 @@
 from hypothesis.utils.conventions import not_set
 
 from tests.common.utils import (
+    Why,
     checks_deprecated_behaviour,
     counts_calls,
     fails_with,
     skipif_emscripten,
     validate_deprecation,
+    xfail_on_crosshair,
 )
 
 
@@ -297,6 +299,7 @@ def test_database_is_reference_preserved():
     assert s.database is s.database
 
 
+@xfail_on_crosshair(Why.other)
 @settings(verbosity=Verbosity.verbose)
 @example(x=99)
 @given(st.integers())

diff --git a/hypothesis-python/tests/cover/test_stateful.py b/hypothesis-python/tests/cover/test_stateful.py
@@ -42,7 +42,12 @@
 )
 from hypothesis.strategies import binary, data, integers, just, lists
 
-from tests.common.utils import capture_out, validate_deprecation
+from tests.common.utils import (
+    Why,
+    capture_out,
+    validate_deprecation,
+    xfail_on_crosshair,
+)
 from tests.nocover.test_stateful import DepthMachine
 
 NO_BLOB_SETTINGS = Settings(print_blob=False, phases=tuple(Phase)[:-1])
@@ -1175,6 +1180,8 @@ def teardown(self):
         assert self.a >= 2
 
 
+# Replay overruns after we trigger a crosshair.util.IgnoreAttempt exception for n=3
+@xfail_on_crosshair(Why.other)
 def test_min_steps_argument():
     # You must pass a non-negative integer...
     for n_steps in (-1, "nan", 5.0):

diff --git a/hypothesis-python/tests/cover/test_testdecorators.py b/hypothesis-python/tests/cover/test_testdecorators.py
@@ -30,13 +30,15 @@
 )
 
 from tests.common.utils import (
+    Why,
     assert_falsifying_output,
     capture_out,
     fails,
     fails_with,
     no_shrink,
     raises,
     skipif_emscripten,
+    xfail_on_crosshair,
 )
 
 # This particular test file is run under both pytest and nose, so it can't
@@ -304,6 +306,7 @@ def test_has_ascii(x):
     assert any(c in ascii_characters for c in x)
 
 
+@xfail_on_crosshair(Why.symbolic_outside_context, strict=False)
 def test_can_derandomize():
     values = []
 
@@ -393,6 +396,7 @@ def test_mixed_text(x):
     assert set(x).issubset(set("abcdefg"))
 
 
+@xfail_on_crosshair(Why.other, strict=False)  # runs ~five failing examples
 def test_when_set_to_no_simplifies_runs_failing_example_twice():
     failing = []
 
@@ -478,6 +482,7 @@ def test_empty_lists(xs):
     assert xs == []
 
 
+@xfail_on_crosshair(Why.other, strict=False)
 def test_given_usable_inline_on_lambdas():
     xs = []
     given(booleans())(lambda x: xs.append(x))()