Skip to content

Commit

Permalink
Added support for comparing different ucdxml files
Browse files Browse the repository at this point in the history
  • Loading branch information
jowilco committed Aug 23, 2024
1 parent cb314e8 commit 776e00e
Show file tree
Hide file tree
Showing 8 changed files with 987 additions and 226 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -760,10 +760,23 @@ private static void parsePropertyValueFile(
assert propInfo.property.getType() == PropertyType.Binary;
value = "Yes";
} else {
value =
propInfo.property.getType() == PropertyType.Binary
? "Yes"
: line.getParts()[2];
if (propInfo.property.getType() == PropertyType.Binary) {
//Handle @missing values for binary attributes (see 13.0.0 emoji-data.txt)
if (line.getParts().length == 3) {
if (line.getParts()[2].equals("No")) {
value = "No";
}
else {
value = "Yes";
}
}
else {
value = "Yes";
}
}
else {
value = line.getParts()[2];
}
// The value should not be an empty string.
// Exception: NFKC_Casefold does remove some characters by mapping them to nothing.
assert !value.isEmpty()
Expand Down
47 changes: 30 additions & 17 deletions unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import com.ibm.icu.dev.util.UnicodeMap;
import java.util.*;

import com.ibm.icu.util.VersionInfo;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.props.*;

Expand Down Expand Up @@ -53,14 +55,14 @@ public class AttributeResolver {

// If there is a change in any of these properties between two adjacent characters, it will
// result in a new range.
private final UcdProperty[] rangeDefiningProperties = {
UcdProperty.Age,
UcdProperty.Bidi_Class,
UcdProperty.Block,
UcdProperty.Decomposition_Mapping,
UcdProperty.Numeric_Type,
UcdProperty.Numeric_Value,
UcdProperty.Vertical_Orientation
private final UcdPropertyDetail[] rangeDefiningPropertyDetails = {
UcdPropertyDetail.Age_Detail,
UcdPropertyDetail.Bidi_Class_Detail,
UcdPropertyDetail.Block_Detail,
UcdPropertyDetail.Decomposition_Mapping_Detail,
UcdPropertyDetail.Numeric_Type_Detail,
UcdPropertyDetail.Numeric_Value_Detail,
UcdPropertyDetail.Vertical_Orientation_Detail
};

public AttributeResolver(IndexUnicodeProperties iup) {
Expand Down Expand Up @@ -120,7 +122,8 @@ private enum AliasType {
ALTERNATE("alternate"),
CONTROL("control"),
CORRECTION("correction"),
FIGMENT("figment");
FIGMENT("figment"),
NONE("none");

private final String aliasType;

Expand Down Expand Up @@ -171,8 +174,13 @@ private HashMap<Integer, LinkedList<NameAlias>> loadNameAliases() {
for (UcdLineParser.UcdLine line : parser) {
String[] parts = line.getParts();
int codepoint = Integer.parseInt(parts[0], 16);
NameAlias nameAlias =
new NameAlias(parts[1], AliasType.valueOf(parts[2].toUpperCase(Locale.ROOT)));
NameAlias nameAlias;
if(parts.length < 3) {
nameAlias = new NameAlias(parts[1], AliasType.NONE);
}
else {
nameAlias = new NameAlias(parts[1], AliasType.valueOf(parts[2].toUpperCase(Locale.ROOT)));
}

if (nameAliasesByCodepoint.containsKey(codepoint)) {
LinkedList<NameAlias> nameAliases =
Expand Down Expand Up @@ -412,13 +420,18 @@ private String getMappingValue(
return sb.toString().trim();
}

public boolean isDifferentRange(int codepointA, int codepointB) {
public boolean isDifferentRange(VersionInfo ucdVersion, int codepointA, int codepointB) {
boolean isDifference = false;
for (UcdProperty property : rangeDefiningProperties) {
isDifference =
isDifference
|| !getAttributeValue(property, codepointA)
.equals(getAttributeValue(property, codepointB));
for (UcdPropertyDetail propDetail : rangeDefiningPropertyDetails) {
UcdProperty prop = propDetail.getUcdProperty();
if (ucdVersion.compareTo(propDetail.getMinVersion()) >= 0
&& (propDetail.getMaxVersion() == null
|| ucdVersion.compareTo(propDetail.getMaxVersion()) < 0)) {
isDifference =
isDifference
|| !getAttributeValue(prop, codepointA)
.equals(getAttributeValue(prop, codepointB));
}
}
return isDifference;
}
Expand Down
180 changes: 180 additions & 0 deletions unicodetools/src/main/java/org/unicode/xml/CompareUcdXML.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
package org.unicode.xml;

import com.ibm.icu.dev.tool.UOption;
import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.text.UnicodeSet;
import org.unicode.props.UcdProperty;

import java.io.*;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Objects;

/**
 * Command-line tool that compares the property values held in two UCDXML files.
 *
 * <p>Usage: {@code CompareUcdXML --fileA {file path} --fileB {file path}}
 *
 * <p>Every unexpected difference is printed to standard output. The process exit status is the
 * number of differences found, capped at {@link #MAX_EXIT_CODE}; {@code 0} means that no
 * unexpected difference was found.
 */
public class CompareUcdXML {

    private static final UOption[] options = {
        UOption.HELP_H(),
        UOption.create("fileA", 'a', UOption.REQUIRES_ARG),
        UOption.create("fileB", 'b', UOption.REQUIRES_ARG)
    };

    // Indexes into options; keep in sync with the array above.
    private static final int HELP = 0, FILE_A = 1, FILE_B = 2;

    // A POSIX exit status only keeps the low 8 bits, so an unclamped count of 256 would be
    // reported to the caller as 0 (success).
    private static final int MAX_EXIT_CODE = 255;

    // Properties whose maps may be keyed by code point sequences as well as single code points.
    private static final UcdProperty[] codepointSequenceProperties =
            new UcdProperty[] {
                UcdProperty.Named_Sequences,
                UcdProperty.Named_Sequences_Prov,
                UcdProperty.Standardized_Variant,
                UcdProperty.Emoji_DCM,
                UcdProperty.Emoji_KDDI,
                UcdProperty.Emoji_SB,
                UcdProperty.Do_Not_Emit_Preferred
            };

    // Code point -> the pair of values that are allowed to differ between the two files.
    // Note that the lookup is keyed by code point only, not by property.
    private static final HashMap<Integer, String[]> knownDifferences;

    static {
        knownDifferences = new HashMap<>();

        // https://github.com/unicode-org/properties/issues/296
        knownDifferences.put(0x31E4, new String[] {"Hani", "Zyyy"});
        knownDifferences.put(0x31E5, new String[] {"Hani", "Zyyy"});

        // https://github.com/unicode-org/unicodetools/issues/325
        knownDifferences.put(0x109F7, new String[] {"1/6", "2/12"});
        knownDifferences.put(0x109F8, new String[] {"1/4", "3/12"});
        knownDifferences.put(0x109F9, new String[] {"1/3", "4/12"});
        knownDifferences.put(0x109FB, new String[] {"1/2", "6/12"});
        knownDifferences.put(0x109FD, new String[] {"2/3", "8/12"});
        knownDifferences.put(0x109FE, new String[] {"3/4", "9/12"});
        knownDifferences.put(0x109FF, new String[] {"5/6", "10/12"});

        // https://github.com/unicode-org/properties/issues/172
        knownDifferences.put(0x5146, new String[] {"1000000", "1000000 1000000000000"});
        knownDifferences.put(0x79ED, new String[] {"1000000000", "1000000000 1000000000000"});
    }

    /**
     * Parses the command line, compares the two files and exits with the number of differences.
     *
     * @param args command-line arguments; {@code --fileA} and {@code --fileB} are required
     * @throws IllegalArgumentException if a required option is missing or names a file that does
     *     not exist
     * @throws Exception if loading either file fails
     */
    public static void main(String[] args) throws Exception {
        UOption.parseArgs(args, options);

        if (options[HELP].doesOccur) {
            System.out.println("CompareUcdXML --fileA {file path} --fileB {file path}");
            System.exit(0);
        }

        final File fileA = requireExistingFile(options[FILE_A], "fileA", 'a');
        final File fileB = requireExistingFile(options[FILE_B], "fileB", 'b');

        System.out.println("Comparing " + fileA + " and " + fileB);

        final XMLProperties xmlPropsA = new XMLProperties(fileA);
        final XMLProperties xmlPropsB = new XMLProperties(fileB);

        int errorCount = 0;

        // First, iterate through the UcdProperties on each codepoint.
        for (final UcdProperty prop : UcdProperty.values()) {
            errorCount +=
                    countCodepointDifferences(
                            prop, xmlPropsA.getMap(prop), xmlPropsB.getMap(prop), fileA, fileB);
        }

        // Now handle anything that contains codepoint sequences.
        for (final UcdProperty prop : codepointSequenceProperties) {
            errorCount +=
                    countSequenceDifferences(
                            prop, xmlPropsA.getMap(prop), xmlPropsB.getMap(prop), fileA, fileB);
        }

        System.exit(Math.min(errorCount, MAX_EXIT_CODE));
    }

    /** Returns the file named by a required option, or throws if it is missing or absent. */
    private static File requireExistingFile(UOption option, String longName, char shortName) {
        if (!option.doesOccur) {
            throw new IllegalArgumentException(
                    "Missing command line option: --" + longName + " (or -" + shortName + ")");
        }
        if (option.value == null) {
            throw new IllegalArgumentException("Could not find " + option.value);
        }
        final File file = new File(option.value);
        if (!file.exists()) {
            throw new IllegalArgumentException("Could not find " + option.value);
        }
        return file;
    }

    /** Compares one property for every single code point and returns the number of differences. */
    private static int countCodepointDifferences(
            UcdProperty prop,
            UnicodeMap<String> fileAMap,
            UnicodeMap<String> fileBMap,
            File fileA,
            File fileB) {
        if (fileAMap.equals(fileBMap)) {
            return 0;
        }
        int differences = 0;
        for (int codepoint = 0; codepoint <= 0x10FFFF; ++codepoint) {
            try {
                final String xmlValA = fileAMap.get(codepoint);
                final String xmlValB = fileBMap.get(codepoint);
                if (isReportableDifference(xmlValA, xmlValB)
                        && !isKnownDifference(codepoint, xmlValA, xmlValB)) {
                    differences++;
                    reportDifference(
                            prop, String.format("0x%04X", codepoint), fileA, fileB, xmlValA,
                            xmlValB);
                }
            } catch (Exception e) {
                // Best effort: report the failing code point and keep comparing the rest.
                System.out.println("Exception thrown for " + String.format("0x%04X", codepoint));
                System.out.println(e.getMessage());
            }
        }
        return differences;
    }

    /** Compares one property for every code point sequence key and returns the difference count. */
    private static int countSequenceDifferences(
            UcdProperty prop,
            UnicodeMap<String> fileAMap,
            UnicodeMap<String> fileBMap,
            File fileA,
            File fileB) {
        int differences = 0;
        final UnicodeSet keys = fileAMap.keySet().addAll(fileBMap.keySet());
        for (String key : keys) {
            // Keys that are a single code point were already compared (and counted) by the
            // per-code-point pass over all properties; reporting them again would double count.
            if (key.codePointCount(0, key.length()) == 1) {
                continue;
            }
            try {
                final String xmlValA = fileAMap.get(key);
                final String xmlValB = fileBMap.get(key);
                if (isReportableDifference(xmlValA, xmlValB)) {
                    differences++;
                    reportDifference(prop, key, fileA, fileB, xmlValA, xmlValB);
                }
            } catch (Exception e) {
                // The key is a String, so it cannot be passed to a %X conversion directly;
                // format each of its code points instead.
                System.out.println("Exception thrown for " + toHex(key));
                System.out.println(e.getMessage());
            }
        }
        return differences;
    }

    /** Formats every code point of {@code sequence} as {@code 0xHHHH}, separated by spaces. */
    private static String toHex(String sequence) {
        final StringBuilder hex = new StringBuilder();
        int index = 0;
        while (index < sequence.length()) {
            final int codepoint = sequence.codePointAt(index);
            if (hex.length() > 0) {
                hex.append(' ');
            }
            hex.append(String.format("0x%04X", codepoint));
            index += Character.charCount(codepoint);
        }
        return hex.toString();
    }

    /** Prints one difference in the tool's three-line report format. */
    private static void reportDifference(
            UcdProperty prop, String key, File fileA, File fileB, String xmlValA, String xmlValB) {
        System.out.println(
                "For UCDProperty " + prop.name() + " (" + prop.getShortName() + ") [" + key
                        + "], ");
        System.out.println("\t" + fileA + " = " + xmlValA);
        System.out.println("\t" + fileB + " = " + xmlValB);
    }

    /**
     * Returns whether two values differ in a way worth reporting. As far as the comparison
     * cares, {@code null}, the empty string and {@code "00000"} are all the same value.
     */
    private static boolean isReportableDifference(String xmlValA, String xmlValB) {
        if (Objects.equals(xmlValA, xmlValB)) {
            return false;
        }
        return !(isEffectivelyEmpty(xmlValA) && isEffectivelyEmpty(xmlValB));
    }

    /** Returns whether {@code value} is {@code null}, empty, or the placeholder {@code "00000"}. */
    private static boolean isEffectivelyEmpty(String value) {
        return value == null || value.isEmpty() || value.equals("00000");
    }

    /**
     * Returns whether the two values are the registered known difference for {@code codepoint},
     * in either order.
     */
    private static boolean isKnownDifference(int codepoint, String xmlValA, String xmlValB) {
        final String[] known = knownDifferences.get(codepoint);
        if (known == null) {
            return false;
        }
        return (known[0].equals(xmlValA) && known[1].equals(xmlValB))
                || (known[0].equals(xmlValB) && known[1].equals(xmlValA));
    }
}
18 changes: 18 additions & 0 deletions unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,24 @@ public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXExcep
}
}
break;
case PROVISIONALNAMEDSEQUENCES:
HashMap<String, String> provisionalNamedSequences = new HashMap<>();
for (UcdLineParser.UcdLine line : parser) {
String[] parts = line.getParts();
provisionalNamedSequences.put(parts[0], parts[1]);
}
List<String> psNames = new ArrayList<>(provisionalNamedSequences.keySet());
Collections.sort(psNames);
for (String name : psNames) {
AttributesImpl attributes =
getNamedSequenceAttributes(
namespace, name, provisionalNamedSequences);
writer.startElement(childTag, attributes);
{
writer.endElement(childTag);
}
}
break;
default:
for (UcdLineParser.UcdLine line : parser) {
AttributesImpl attributes =
Expand Down
Loading

0 comments on commit 776e00e

Please sign in to comment.