diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index d237633e7..ac507c58e 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -525,6 +525,15 @@ private static void propertywiseAlikeLine( } } + private static String stringAt(UnicodeSet set, int i) { + final int codePointsSize = set.size() - set.strings().size(); + if (i < codePointsSize) { + return Character.toString(set.charAt(i)); + } else { + return set.strings().stream().skip(i - codePointsSize).findFirst().get(); + } + } + private static void propertywiseCorrespondenceLine( Set ignoredProperties, UnicodeSet firstSet, @@ -538,13 +547,13 @@ private static void propertywiseCorrespondenceLine( final List sets = new ArrayList<>(); sets.add(firstSet); expectToken(":", pp, source); + + // Index of the first set of multi-character strings (and of the first multi-character + // reference string). + // This is `m` in the documentation in UnicodeInvariantTest.txt. 
+ int firstMultiCharacterIndex = -1; do { final var set = parseUnicodeSet(source, pp); - if (set.hasStrings()) { - throw new BackwardParseException( - "Set should contain only single code points for property comparison", - pp.getIndex()); - } if (set.size() != firstSet.size()) { throw new BackwardParseException( "Sets should have the same size for property correspondence (got " @@ -554,18 +563,41 @@ private static void propertywiseCorrespondenceLine( + ")", pp.getIndex()); } + if (set.hasStrings() && set.strings().size() != set.size()) { + throw new BackwardParseException( + "Sets should be all strings or all code points for property correspondence", + pp.getIndex()); + } + if (firstMultiCharacterIndex == -1) { + if (set.hasStrings()) { + firstMultiCharacterIndex = sets.size(); + } + } else if (!set.hasStrings()) { + throw new BackwardParseException( + "Code points should come before strings in property correspondence", + pp.getIndex()); + } sets.add(set); } while (Lookahead.oneToken(pp, source).accept(":")); - final List referenceCodePoints = new ArrayList<>(); + if (firstMultiCharacterIndex == -1) { + firstMultiCharacterIndex = sets.size(); + } + final List referenceCodePoints = new ArrayList<>(); expectToken("CorrespondTo", pp, source); do { final var referenceSet = parseUnicodeSet(source, pp); - if (referenceSet.hasStrings() || referenceSet.size() != 1) { + if (referenceSet.size() != 1) { + throw new BackwardParseException( + "reference should be a single code point or string for property correspondence", + pp.getIndex()); + } + if (referenceSet.hasStrings() + != (referenceCodePoints.size() >= firstMultiCharacterIndex)) { throw new BackwardParseException( - "reference should be a single code point for property correspondence", + "Strings should correspond to strings for property correspondence", pp.getIndex()); } - referenceCodePoints.add(referenceSet.charAt(0)); + referenceCodePoints.add(referenceSet.iterator().next()); } while (Lookahead.oneToken(pp, 
source).accept(":")); if (referenceCodePoints.size() != sets.size()) { throw new BackwardParseException( @@ -608,8 +640,8 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue expectedDifference = expectedPropertyDifferences.get(alias); } if (expectedDifference != null) { - for (int k = 0; k < sets.size(); ++k) { - final int rk = referenceCodePoints.get(k); + for (int k = 0; k < firstMultiCharacterIndex; ++k) { + final int rk = referenceCodePoints.get(k).codePointAt(0); final String pRk = property.getValue(rk); if (!Objects.equals(pRk, expectedDifference.referenceValueAlias)) { errorMessageLines.add( @@ -638,9 +670,9 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue } } } else { - for (int k = 0; k < sets.size(); ++k) { + for (int k = 0; k < firstMultiCharacterIndex; ++k) { final UnicodeSet set = sets.get(k); - final int rk = referenceCodePoints.get(k); + final int rk = referenceCodePoints.get(k).codePointAt(0); final String pRk = property.getValue(rk); loop_over_set: for (int i = 0; i < set.size(); ++i) { @@ -652,10 +684,9 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue Integer lMatchingForReference = null; for (int l = 0; l < sets.size(); ++l) { final boolean pCkEqualsCl = - Objects.equals(pCk, Character.toString(sets.get(l).charAt(i))); + Objects.equals(pCk, stringAt(sets.get(l), i)); final boolean pRkEqualsRl = - Objects.equals( - pRk, Character.toString(referenceCodePoints.get(l))); + Objects.equals(pRk, referenceCodePoints.get(l)); if (pRkEqualsRl) { lMatchingForReference = l; if (pCkEqualsCl) { @@ -685,8 +716,7 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue + ")\t=\t" + pCk + "\t≠\t" - + Character.toString( - sets.get(lMatchingForReference).charAt(i)) + + stringAt(sets.get(lMatchingForReference), i) + "\twhereas\t" + property.getName() + "(" diff --git 
a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 34c9de0d2..79bcdcbb4 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -147,16 +147,19 @@ # CorrespondTo : ... : # [ UpTo: ( vs ) {, ( vs ) }] # -# The Sₖ must be Unicode sets of equal size with no strings. They are considered in code -# point order for the correspondence check (item 2 below). -# The references Rₖ must be Unicode sets each containing a single code point; by a slight abuse of -# notation we refer to the code point as Rₖ in the explanation below. +# The Sₖ must be Unicode sets of equal size, either with no strings or only strings. +# They are considered in code point order for the correspondence check (item 2 below). +# The references Rₖ must be Unicode sets each containing a single code point or a single string; +# by a slight abuse of notation we refer to the code point or string as Rₖ in the explanation below. +# For some m in 1 .. n, the following must hold: +# a. Rₖ is a code point and Sₖ must contain only code points for k ≤ m, and +# b. Rₖ is a string and Sₖ must contain only strings for m < k ≤ n. # For every non-ignored property P that does not appear in the optional UpTo clause, -# checks that for each k in 1 .. n, for the ith character C in Sₖ, either: +# checks that for each k in 1 .. m, for the ith character C in Sₖ, either: # 1. P(C) = P(Rₖ), or # 2. for some l in 1 .. n, both: # — P(Rₖ) is equal to Rₗ, and -# — P(C) is equal to the ith character in Sₗ. +# — P(C) is equal to the ith character (or string, if l > m) in Sₗ. # For every non-ignored property P that appears in the UpTo clause, checks all characters in the # sets Sₖ have the SValue and all R characters have the RValue.
# @@ -174,9 +177,9 @@ Propertywise [[α-ω] - [ς]] : [[Α-Ω] - \p{gc=Cn}] CorrespondTo [g] : [G] UpTo: Block (Greek_And_Coptic vs Basic_Latin), - Script (Greek vs Latin), - Script_Extensions (Greek vs Latin), - East_Asian_Width (Ambiguous vs Narrow) + Script (Greek vs Latin), + Script_Extensions (Greek vs Latin), + East_Asian_Width (Ambiguous vs Narrow) # The modifier letters ʳʷʸ are related to their non-superscripted counterparts in the same way # that ʰ is related to h. The capitals must be part of the correspondence because they are # property values of the lowercase letters. @@ -1369,6 +1372,13 @@ Ignoring Unicode_1_Name Confusable_MA: CorrespondTo [ⁱ] : [i] : [I] end Ignoring; + Propertywise [ゟ] : [{より}] + CorrespondTo [ヿ] : [{コト}] + UpTo: Block (Hiragana vs Katakana), + Script (Hiragana vs Katakana), + Script_Extensions (Hiragana vs Katakana), + Word_Break (Other vs Katakana) + end Ignoring; end Ignoring; \ No newline at end of file