From c7816d04324fb77f27b1f15b0a52deca197397e6 Mon Sep 17 00:00:00 2001
From: Zac Hatfield-Dodds <zac.hatfield.dodds@gmail.com>
Date: Thu, 10 Oct 2024 10:25:55 -0700
Subject: [PATCH] Mark failures for crosshair to fix?

---
 hypothesis-python/tests/common/utils.py                     | 3 ++-
 hypothesis-python/tests/cover/test_datetimes.py             | 2 ++
 hypothesis-python/tests/cover/test_filter_rewriting.py      | 1 +
 hypothesis-python/tests/cover/test_lookup.py                | 1 +
 hypothesis-python/tests/cover/test_reproduce_failure.py     | 1 +
 hypothesis-python/tests/cover/test_sampled_from.py          | 1 +
 hypothesis-python/tests/cover/test_stateful.py              | 1 +
 hypothesis-python/tests/cover/test_targeting.py             | 3 +++
 hypothesis-python/tests/cover/test_testdecorators.py        | 3 +++
 hypothesis-python/tests/datetime/test_dateutil_timezones.py | 2 ++
 hypothesis-python/tests/datetime/test_pytz_timezones.py     | 1 +
 hypothesis-python/tests/datetime/test_zoneinfo_timezones.py | 2 ++
 hypothesis-python/tests/nocover/test_characters.py          | 3 +++
 hypothesis-python/tests/nocover/test_database_usage.py      | 1 +
 hypothesis-python/tests/nocover/test_duplication.py         | 3 +++
 hypothesis-python/tests/nocover/test_flatmap.py             | 3 +++
 hypothesis-python/tests/nocover/test_floating.py            | 4 +++-
 hypothesis-python/tests/nocover/test_recursive.py           | 3 ++-
 hypothesis-python/tests/nocover/test_regressions.py         | 5 +++++
 hypothesis-python/tests/nocover/test_sampled_from.py        | 3 ++-
 hypothesis-python/tests/nocover/test_simple_numbers.py      | 3 +++
 21 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/hypothesis-python/tests/common/utils.py b/hypothesis-python/tests/common/utils.py
index 9557c98e90..208ef0e2c7 100644
--- a/hypothesis-python/tests/common/utils.py
+++ b/hypothesis-python/tests/common/utils.py
@@ -265,6 +265,7 @@ class Why(enum.Enum):
 
     # nested_given: https://github.com/pschanely/hypothesis-crosshair/issues/11
     nested_given = "nested @given decorators don't work with crosshair"
+    undiscovered = "crosshair may not find the failing input"
     other = "reasons not elsewhere categorized"
 
 
@@ -276,7 +277,7 @@ def xfail_on_crosshair(why: Why, /, *, strict=True, as_marks=False):
 
     current_backend = settings.get_profile(settings._current_profile).backend
     kw = {
-        "strict": strict,
+        "strict": strict and why != Why.undiscovered,
         "reason": f"Expected failure due to: {why.value}",
         "condition": current_backend == "crosshair",
     }
diff --git a/hypothesis-python/tests/cover/test_datetimes.py b/hypothesis-python/tests/cover/test_datetimes.py
index 51677c7df2..a4c3aded1b 100644
--- a/hypothesis-python/tests/cover/test_datetimes.py
+++ b/hypothesis-python/tests/cover/test_datetimes.py
@@ -16,6 +16,7 @@
 from hypothesis.strategies import dates, datetimes, timedeltas, times
 
 from tests.common.debug import assert_simple_property, find_any, minimal
+from tests.common.utils import Why, xfail_on_crosshair
 
 
 def test_can_find_positive_delta():
@@ -104,6 +105,7 @@ def test_single_date(val):
     assert find_any(dates(val, val)) is val
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_can_find_midnight():
     find_any(times(), lambda x: x.hour == x.minute == x.second == 0)
 
diff --git a/hypothesis-python/tests/cover/test_filter_rewriting.py b/hypothesis-python/tests/cover/test_filter_rewriting.py
index 33a419142b..f38b4a3649 100644
--- a/hypothesis-python/tests/cover/test_filter_rewriting.py
+++ b/hypothesis-python/tests/cover/test_filter_rewriting.py
@@ -181,6 +181,7 @@ def test_rewrite_unsatisfiable_filter(s, pred):
     assert s.filter(pred).is_empty
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @pytest.mark.parametrize(
     "pred",
     [
diff --git a/hypothesis-python/tests/cover/test_lookup.py b/hypothesis-python/tests/cover/test_lookup.py
index 3972e49ee7..2020fed0bf 100644
--- a/hypothesis-python/tests/cover/test_lookup.py
+++ b/hypothesis-python/tests/cover/test_lookup.py
@@ -876,6 +876,7 @@ def test_supportsop_types_support_protocol(protocol, data):
     assert issubclass(type(value), protocol)
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @pytest.mark.parametrize("restrict_custom_strategy", [True, False])
 def test_generic_aliases_can_be_conditionally_resolved_by_registered_function(
     restrict_custom_strategy,
diff --git a/hypothesis-python/tests/cover/test_reproduce_failure.py b/hypothesis-python/tests/cover/test_reproduce_failure.py
index 18e5f85894..58a0ef43f9 100644
--- a/hypothesis-python/tests/cover/test_reproduce_failure.py
+++ b/hypothesis-python/tests/cover/test_reproduce_failure.py
@@ -168,6 +168,7 @@ def test(data):
     assert "@reproduce_failure" not in o.getvalue()
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_does_not_print_reproduction_for_large_data_examples_by_default():
     @settings(phases=no_shrink, print_blob=False)
     @given(st.data())
diff --git a/hypothesis-python/tests/cover/test_sampled_from.py b/hypothesis-python/tests/cover/test_sampled_from.py
index 11450ba7e4..c6be4a49cf 100644
--- a/hypothesis-python/tests/cover/test_sampled_from.py
+++ b/hypothesis-python/tests/cover/test_sampled_from.py
@@ -138,6 +138,7 @@ def stupid_sampled_sets(draw):
     return result
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @given(stupid_sampled_sets())
 def test_efficient_sets_of_samples_with_chained_transformations_slow_path(x):
     # This deliberately exercises the standard filtering logic without going
diff --git a/hypothesis-python/tests/cover/test_stateful.py b/hypothesis-python/tests/cover/test_stateful.py
index b02fcb1f90..4ac86660f6 100644
--- a/hypothesis-python/tests/cover/test_stateful.py
+++ b/hypothesis-python/tests/cover/test_stateful.py
@@ -1277,6 +1277,7 @@ def fail_fast(self, a1, a2, a3, b1, b2, b3):
     )
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_multiple_common_targets():
     class Machine(RuleBasedStateMachine):
         a = Bundle("a")
diff --git a/hypothesis-python/tests/cover/test_targeting.py b/hypothesis-python/tests/cover/test_targeting.py
index 67784f06b2..9fc70c131c 100644
--- a/hypothesis-python/tests/cover/test_targeting.py
+++ b/hypothesis-python/tests/cover/test_targeting.py
@@ -16,6 +16,8 @@
 from hypothesis.control import current_build_context
 from hypothesis.errors import InvalidArgument
 
+from tests.common.utils import Why, xfail_on_crosshair
+
 
 @example(0.0, "this covers the branch where context.data is None")
 @given(
@@ -100,6 +102,7 @@ def test_cannot_target_same_label_twice(_):
         target(1.0, label="label")
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @given(st.none())
 def test_cannot_target_default_label_twice(_):
     target(0.0)
diff --git a/hypothesis-python/tests/cover/test_testdecorators.py b/hypothesis-python/tests/cover/test_testdecorators.py
index f225f78f1a..812919de98 100644
--- a/hypothesis-python/tests/cover/test_testdecorators.py
+++ b/hypothesis-python/tests/cover/test_testdecorators.py
@@ -139,6 +139,7 @@ def test_can_be_given_keyword_args(x, name):
     assert len(name) < x
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @fails
 @given(one_of(floats(), booleans()), one_of(floats(), booleans()))
 def test_one_of_produces_different_values(x, y):
@@ -185,6 +186,7 @@ def test_removing_an_element_from_a_unique_list(xs, y):
     assert y not in xs
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @fails
 @given(lists(integers(), min_size=2), data())
 def test_removing_an_element_from_a_non_unique_list(xs, data):
@@ -208,6 +210,7 @@ def test_can_mix_sampling_with_generating(x, y):
     assert type(x) == type(y)
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @fails
 @given(frozensets(integers()))
 def test_can_find_large_sum_frozenset(xs):
diff --git a/hypothesis-python/tests/datetime/test_dateutil_timezones.py b/hypothesis-python/tests/datetime/test_dateutil_timezones.py
index 933b574a91..d7849924e5 100644
--- a/hypothesis-python/tests/datetime/test_dateutil_timezones.py
+++ b/hypothesis-python/tests/datetime/test_dateutil_timezones.py
@@ -109,6 +109,7 @@ def test_dateutil_exists_our_not_exists_are_inverse(value):
     assert datetime_does_not_exist(value) == (not tz.datetime_exists(value))
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_datetimes_can_exclude_imaginary():
     find_any(
         datetimes(**DAY_WITH_IMAGINARY_HOUR_KWARGS, allow_imaginary=True),
@@ -120,6 +121,7 @@ def test_datetimes_can_exclude_imaginary():
     )
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @fails_with(FailedHealthCheck)
 @given(
     datetimes(
diff --git a/hypothesis-python/tests/datetime/test_pytz_timezones.py b/hypothesis-python/tests/datetime/test_pytz_timezones.py
index b48987f0e9..5ce5b38b72 100644
--- a/hypothesis-python/tests/datetime/test_pytz_timezones.py
+++ b/hypothesis-python/tests/datetime/test_pytz_timezones.py
@@ -105,6 +105,7 @@ def test_time_bounds_must_be_naive(name, val):
         times(**{name: val}).validate()
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @pytest.mark.parametrize(
     "bound",
     [
diff --git a/hypothesis-python/tests/datetime/test_zoneinfo_timezones.py b/hypothesis-python/tests/datetime/test_zoneinfo_timezones.py
index 1772321cbb..6c2dbe9cb8 100644
--- a/hypothesis-python/tests/datetime/test_zoneinfo_timezones.py
+++ b/hypothesis-python/tests/datetime/test_zoneinfo_timezones.py
@@ -17,12 +17,14 @@
 from hypothesis.errors import InvalidArgument
 
 from tests.common.debug import assert_no_examples, find_any, minimal
+from tests.common.utils import Why, xfail_on_crosshair
 
 
 def test_utc_is_minimal():
     assert minimal(st.timezones()) is zoneinfo.ZoneInfo("UTC")
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_can_generate_non_utc():
     find_any(
         st.datetimes(timezones=st.timezones()).filter(lambda d: d.tzinfo.key != "UTC")
diff --git a/hypothesis-python/tests/nocover/test_characters.py b/hypothesis-python/tests/nocover/test_characters.py
index f938f8d162..48f9c69770 100644
--- a/hypothesis-python/tests/nocover/test_characters.py
+++ b/hypothesis-python/tests/nocover/test_characters.py
@@ -15,6 +15,8 @@
 
 from hypothesis import given, settings, strategies as st
 
+from tests.common.utils import Why, xfail_on_crosshair
+
 IDENTIFIER_CHARS = string.ascii_letters + string.digits + "_"
 
 
@@ -23,6 +25,7 @@ def test_large_blacklist(c):
     assert c not in IDENTIFIER_CHARS
 
 
+@xfail_on_crosshair(Why.symbolic_outside_context)  # seems like a crosshair bug here
 @given(st.data())
 def test_arbitrary_blacklist(data):
     blacklist = data.draw(st.text(st.characters(max_codepoint=1000), min_size=1))
diff --git a/hypothesis-python/tests/nocover/test_database_usage.py b/hypothesis-python/tests/nocover/test_database_usage.py
index ba79bb7b9d..5548ba0564 100644
--- a/hypothesis-python/tests/nocover/test_database_usage.py
+++ b/hypothesis-python/tests/nocover/test_database_usage.py
@@ -32,6 +32,7 @@ def has_a_non_zero_byte(x):
     return any(bytes(x))
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_saves_incremental_steps_in_database():
     key = b"a database key"
     database = InMemoryExampleDatabase()
diff --git a/hypothesis-python/tests/nocover/test_duplication.py b/hypothesis-python/tests/nocover/test_duplication.py
index ff89a9978f..566abefda9 100644
--- a/hypothesis-python/tests/nocover/test_duplication.py
+++ b/hypothesis-python/tests/nocover/test_duplication.py
@@ -15,6 +15,8 @@
 from hypothesis import given, settings
 from hypothesis.strategies._internal import SearchStrategy
 
+from tests.common.utils import Why, xfail_on_crosshair
+
 
 class Blocks(SearchStrategy):
     def __init__(self, n):
@@ -37,6 +39,7 @@ def test(b):
     assert set(counts.values()) == {1}
 
 
+@xfail_on_crosshair(Why.other, strict=False)  # CrosshairInternal for n>0
 @pytest.mark.parametrize("n", range(1, 5))
 def test_mostly_does_not_duplicate_blocks_even_when_failing(n):
     counts = Counter()
diff --git a/hypothesis-python/tests/nocover/test_flatmap.py b/hypothesis-python/tests/nocover/test_flatmap.py
index 239d256a8c..aa238f1fc9 100644
--- a/hypothesis-python/tests/nocover/test_flatmap.py
+++ b/hypothesis-python/tests/nocover/test_flatmap.py
@@ -26,6 +26,7 @@
 )
 
 from tests.common.debug import find_any, minimal
+from tests.common.utils import Why, xfail_on_crosshair
 
 ConstantLists = integers().flatmap(lambda i: lists(just(i)))
 
@@ -97,6 +98,7 @@ def criterion(ls):
     assert set(result) == {False, ""}
 
 
+@xfail_on_crosshair(Why.undiscovered)  # seen for n >= 8; smaller n may fail too
 @pytest.mark.parametrize("n", range(1, 10))
 def test_can_shrink_through_a_binding(n):
     bool_lists = integers(0, 100).flatmap(
@@ -105,6 +107,7 @@ def test_can_shrink_through_a_binding(n):
     assert minimal(bool_lists, lambda x: x.count(True) >= n) == [True] * n
 
 
+@xfail_on_crosshair(Why.undiscovered)  # seen for n >= 8; smaller n may fail too
 @pytest.mark.parametrize("n", range(1, 10))
 def test_can_delete_in_middle_of_a_binding(n):
     bool_lists = integers(1, 100).flatmap(
diff --git a/hypothesis-python/tests/nocover/test_floating.py b/hypothesis-python/tests/nocover/test_floating.py
index fd679c3429..77ebfab28a 100644
--- a/hypothesis-python/tests/nocover/test_floating.py
+++ b/hypothesis-python/tests/nocover/test_floating.py
@@ -20,7 +20,7 @@
 from hypothesis.strategies import data, floats, lists
 
 from tests.common.debug import find_any
-from tests.common.utils import fails
+from tests.common.utils import Why, fails, xfail_on_crosshair
 
 TRY_HARDER = settings(
     max_examples=1000, suppress_health_check=[HealthCheck.filter_too_much]
@@ -93,6 +93,7 @@ def test_is_not_int(x):
     assert x != int(x)
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @fails
 @given(floats())
 @TRY_HARDER
@@ -128,6 +129,7 @@ def test_floats_are_in_range(x, y, data):
     assert x <= t <= y
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @pytest.mark.parametrize("neg", [False, True])
 @pytest.mark.parametrize("snan", [False, True])
 def test_can_find_negative_and_signaling_nans(neg, snan):
diff --git a/hypothesis-python/tests/nocover/test_recursive.py b/hypothesis-python/tests/nocover/test_recursive.py
index b47ec10b7e..c12a2ae105 100644
--- a/hypothesis-python/tests/nocover/test_recursive.py
+++ b/hypothesis-python/tests/nocover/test_recursive.py
@@ -17,7 +17,7 @@
 from hypothesis import HealthCheck, given, settings, strategies as st
 
 from tests.common.debug import find_any, minimal
-from tests.common.utils import flaky
+from tests.common.utils import Why, flaky, xfail_on_crosshair
 
 
 def test_can_generate_with_large_branching():
@@ -79,6 +79,7 @@ def test_drawing_many_near_boundary():
     assert len(ls) == size
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_can_use_recursive_data_in_sets():
     nested_sets = st.recursive(st.booleans(), st.frozensets, max_leaves=3)
     find_any(nested_sets, settings=settings(deadline=None))
diff --git a/hypothesis-python/tests/nocover/test_regressions.py b/hypothesis-python/tests/nocover/test_regressions.py
index 980be13891..0870daf75c 100644
--- a/hypothesis-python/tests/nocover/test_regressions.py
+++ b/hypothesis-python/tests/nocover/test_regressions.py
@@ -14,7 +14,10 @@
 from hypothesis._settings import note_deprecation
 from hypothesis.errors import HypothesisDeprecationWarning
 
+from tests.common.utils import Why, xfail_on_crosshair
 
+
+@xfail_on_crosshair(Why.other)
 def test_note_deprecation_blames_right_code_issue_652():
     msg = "this is an arbitrary deprecation warning message"
 
@@ -58,6 +61,8 @@ def test_unique_floats_with_nan_is_not_flaky_3926(ls):
 
 # this will take a while to find the regression, but will eventually trigger it.
 # min_value=0 is critical to trigger the probing behavior which exhausts our buffer.
+# https://github.com/pschanely/CrossHair/issues/285 for an upstream fix.
+@xfail_on_crosshair(Why.other, strict=False)
 @given(st.integers(min_value=0, max_value=1 << 25_000))
 def test_overrun_during_datatree_simulation_3874(n):
     pass
diff --git a/hypothesis-python/tests/nocover/test_sampled_from.py b/hypothesis-python/tests/nocover/test_sampled_from.py
index f72beef0ab..bf2bcb2389 100644
--- a/hypothesis-python/tests/nocover/test_sampled_from.py
+++ b/hypothesis-python/tests/nocover/test_sampled_from.py
@@ -21,7 +21,7 @@
 from hypothesis.strategies._internal.strategies import SampledFromStrategy
 
 from tests.common.debug import find_any, minimal
-from tests.common.utils import fails_with
+from tests.common.utils import Why, fails_with, xfail_on_crosshair
 
 
 @pytest.mark.parametrize("size", [100, 10**5, 10**6, 2**25])
@@ -101,6 +101,7 @@ def test_flag_enum_repr_uses_class_not_a_list():
     assert lazy_repr == "sampled_from(tests.nocover.test_sampled_from.AFlag)"
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_exhaustive_flags():
     # Generate powerset of flag combinations. There are only 2^3 of them, so
     # we can reasonably expect that they are all are found.
diff --git a/hypothesis-python/tests/nocover/test_simple_numbers.py b/hypothesis-python/tests/nocover/test_simple_numbers.py
index 8b66aaedd2..29cd20a2da 100644
--- a/hypothesis-python/tests/nocover/test_simple_numbers.py
+++ b/hypothesis-python/tests/nocover/test_simple_numbers.py
@@ -17,6 +17,7 @@
 from hypothesis.strategies import floats, integers, lists
 
 from tests.common.debug import minimal
+from tests.common.utils import Why, xfail_on_crosshair
 
 
 def test_minimize_negative_int():
@@ -116,6 +117,7 @@ def test_can_minimal_infinite_negative_float():
     assert minimal(floats(), lambda x: x < -sys.float_info.max)
 
 
+@xfail_on_crosshair(Why.undiscovered)  # intermittent: only fails sometimes
 def test_can_minimal_float_on_boundary_of_representable():
     minimal(floats(), lambda x: x + 1 == x and not math.isinf(x))
 
@@ -153,6 +155,7 @@ def test_minimal_fractional_float():
     assert minimal(floats(), lambda x: x >= 1.5) == 2
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_minimizes_lists_of_negative_ints_up_to_boundary():
     result = minimal(
         lists(integers(), min_size=10),