Skip to content

Commit

Permalink
feat(core): fixup markers as regex 🙀
Browse files Browse the repository at this point in the history
- needed to remove/re-add the “≈≈[\u0001-\uD7FE]” format  (aka LDML_MARKER_ANY_INDEX)

#9121
  • Loading branch information
srl295 committed Dec 31, 2023
1 parent 80ba5aa commit 02ba64d
Show file tree
Hide file tree
Showing 3 changed files with 219 additions and 13 deletions.
150 changes: 142 additions & 8 deletions core/src/ldml/ldml_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -518,8 +518,8 @@ transform_entry::init() {
}
// TODO-LDML: if we have mapFrom, may need to do other processing.
std::u16string patstr = km::core::kmx::u32string_to_u16string(fFrom);
// normalize, including markers
normalize_nfd_markers(patstr);
// normalize, including markers, for regex
normalize_nfd_markers(patstr, true);
UErrorCode status = U_ZERO_ERROR;
/* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length());
// add '$' to match to end
Expand Down Expand Up @@ -1048,7 +1048,48 @@ bool normalize_nfc(std::u16string &str) {
return normalize(nfc, str, status);
}

std::u32string remove_markers(const std::u32string &str, marker_map *markers, bool _kmn_unused(for_regex)) {
/**
 * Format the low 16 bits of `marker` as four uppercase hexadecimal digits,
 * most significant nibble first, and insert them at the front of `str`.
 *
 * @param str     string to prepend to; existing content follows the digits
 * @param marker  value to format; only the lowest four nibbles are used
 */
void
prepend_hex_quad(std::u32string &str, KMX_DWORD marker) {
  static constexpr char32_t hex_digits[] = U"0123456789ABCDEF";
  char32_t quad[4];
  // Fill right-to-left: the lowest nibble is the last digit of the quad.
  for (int pos = 3; pos >= 0; --pos) {
    quad[pos] = hex_digits[marker & 0xF];
    marker >>= 4;
  }
  str.insert(0, quad, 4);  // prepend all four digits at once
}

/**
 * Numeric value of a single hexadecimal digit.
 *
 * @param ch  code point to interpret; '0'-'9', 'a'-'f' and 'A'-'F' are accepted
 * @return    0..15 for a hex digit, or -1 if `ch` is not a hex digit
 */
inline int xdigitval(km_core_usv ch) {
  if (U'0' <= ch && ch <= U'9') {
    return ch - U'0';
  }
  if (U'a' <= ch && ch <= U'f') {
    return ch - U'a' + 0xA;
  }
  if (U'A' <= ch && ch <= U'F') {
    return ch - U'A' + 0xA;
  }
  return -1;  // not a hex digit
}

/**
 * Parse exactly four hexadecimal digits into a number.
 *
 * @param hex_str  array from which the first four elements are read
 * @return the parsed value, or 0 if any of the four elements is not a hex
 *         digit (so "0000" and a parse failure both yield 0)
 */
KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]) {
  KMX_DWORD value = 0;
  for (const km_core_usv *p = hex_str; p != hex_str + 4; ++p) {
    const int nibble = xdigitval(*p);
    if (nibble == -1) {
      return 0;  // invalid digit: report failure
    }
    // make room for the next nibble, then append it
    value = (value << 4) | nibble;
  }
  return value;
}

std::u32string remove_markers(const std::u32string &str, marker_map *markers, bool for_regex) {
std::u32string out;
auto i = str.begin();
auto last = i;
Expand All @@ -1074,14 +1115,107 @@ std::u32string remove_markers(const std::u32string &str, marker_map *markers, bo
break; // hit end
}

// #3 marker number
const KMX_DWORD marker_no = *i;
assert(marker_no >= LDML_MARKER_MIN_INDEX && marker_no <= LDML_MARKER_MAX_INDEX);
i++; // if end, we'll break out of the loop
KMX_DWORD marker_no;
if (!for_regex) {
// #3 marker number
marker_no = *i;
i++; // if end, we'll break out of the loop
} else {
// is it an escape or a range?
if (*i == U'\\') {
if (++i == str.end()) {
break;
}
assert(*i == U'u');
if (++i == str.end()) {
break;
}
km_core_usv markno[4];

markno[0] = *(i++);
if (i == str.end()) {
break;
}
markno[1] = *(i++);
if (i == str.end()) {
break;
}
markno[2] = *(i++);
if (i == str.end()) {
break;
}
markno[3] = *(i++);
marker_no = parse_hex_quad(markno);
assert (marker_no != 0); // illegal marker number
} else if (*i == U'[') {
if (++i == str.end()) {
break;
}
assert(*i == U'\\');
if (++i == str.end()) {
break;
}
assert(*i == U'u');
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(*i == U'-');
if (++i == str.end()) {
break;
}
assert(*i == U'\\');
if (++i == str.end()) {
break;
}
assert(*i == U'u');
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(*i == U']');
i++;
marker_no = LDML_MARKER_ANY_INDEX;
} else {
assert(*i == U'\\' || *i == U'['); // error.
marker_no = 0; // error, don't record
}
}
assert(marker_no >= LDML_MARKER_MIN_INDEX && marker_no <= LDML_MARKER_ANY_INDEX);
last = i;

// record the marker
if (markers != nullptr) {
if (marker_no >= LDML_MARKER_MIN_INDEX && markers != nullptr) {
if (i == str.end()) {
markers->emplace(MARKER_BEFORE_EOT, marker_no);
} else {
Expand Down
37 changes: 33 additions & 4 deletions core/src/ldml/ldml_transforms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -340,12 +340,41 @@ inline std::u32string remove_markers(const std::u32string &str, marker_map &mark
}

/** prepend the marker string in UC_SENTINEL format to the str */
inline static void prepend_marker(std::u32string &str, KMX_DWORD marker, bool for_regex = false);
inline void prepend_marker(std::u32string &str, KMX_DWORD marker, bool for_regex = false);

/** format 'marker' as 0001...FFFF and put it at the beginning of the string */
void prepend_hex_quad(std::u32string &str, KMX_DWORD marker);

/** parse 0001...FFFF into a KMX_DWORD. Returns 0 on failure */
KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]);

void
prepend_marker(std::u32string &str, KMX_DWORD marker, bool _kmn_unused(for_regex)) {
km_core_usv triple[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker};
str.insert(0, triple, 3);
prepend_marker(std::u32string &str, KMX_DWORD marker, bool for_regex) {
if (!for_regex) {
km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker};
str.insert(0, markstr, 3);
} else {
if (marker == LDML_MARKER_ANY_INDEX) {
// recreate the regex from back to front
str.insert(0, 1, U']');
prepend_hex_quad(str, LDML_MARKER_MAX_INDEX);
str.insert(0, 1, U'u');
str.insert(0, 1, U'\\');
str.insert(0, 1, U'-');
prepend_hex_quad(str, LDML_MARKER_MIN_INDEX);
str.insert(0, 1, U'u');
str.insert(0, 1, U'\\');
str.insert(0, 1, U'[');
str.insert(0, 1, LDML_MARKER_CODE);
str.insert(0, 1, LDML_UC_SENTINEL);
} else {
// add hex part
prepend_hex_quad(str, marker);
// add static part
km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, u'\\', u'u'};
str.insert(0, markstr, 4);
}
}
}

bool normalize_nfd_markers(std::u16string &str, bool for_regex) {
Expand Down
45 changes: 44 additions & 1 deletion core/tests/unit/ldml/test_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,32 @@ int test_strutils() {
assert_equal(map[0x0300], 0x3L);
assert_equal(map[MARKER_BEFORE_EOT], 0x4L);
}
{
std::cout << __FILE__ << ":" << __LINE__ << " - prepend hex quad" << std::endl;
{
std::u32string dst;
prepend_hex_quad(dst, 0x0001);
zassert_string_equal(dst, U"0001");
}
{
std::u32string dst;
prepend_hex_quad(dst, 0xCAFE);
zassert_string_equal(dst, U"CAFE");
}
{
std::u32string dst;
prepend_hex_quad(dst, 0xFFFF);
zassert_string_equal(dst, U"FFFF");
}
}
{
std::cout << __FILE__ << ":" << __LINE__ << " - parse hex quad" << std::endl;
assert_equal(parse_hex_quad(U"0001"), 0x0001);
assert_equal(parse_hex_quad(U"CAFE"), 0xCAFE);
assert_equal(parse_hex_quad(U"D00d"), 0xD00D);
assert_equal(parse_hex_quad(U"FFFF"), 0xFFFF);
assert_equal(parse_hex_quad(U"zzzz"), 0); // err
}
return EXIT_SUCCESS;
}

Expand Down Expand Up @@ -785,7 +811,7 @@ int test_normalize() {
const std::u32string src = U"9ce\u0300\uFFFF\b\\u0002\u0320\uFFFF\b\\u0001";
const std::u32string expect = U"9ce\uFFFF\b\\u0002\u0320\u0300\uFFFF\b\\u0001";
std::u32string dst = src;
assert(normalize_nfd_markers(dst, map)); // TODO-LDML: need regex flag
assert(normalize_nfd_markers(dst, map, true)); // TODO-LDML: need regex flag
if (dst != expect) {
std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl;
std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl;
Expand All @@ -795,6 +821,23 @@ int test_normalize() {
assert_equal(map[0x0320], 0x2L);
assert_equal(map[MARKER_BEFORE_EOT], 0x1L);
}
{
// from tests - regex edition
marker_map map;
std::cout << __FILE__ << ":" << __LINE__ << " - complex test \\m{.}" << std::endl;
const std::u32string src = U"9ce\u0300\uFFFF\b[\\u0001-\\uD7FE]\u0320\uFFFF\b\\u0001";
const std::u32string expect = U"9ce\uFFFF\b[\\u0001-\\uD7FE]\u0320\u0300\uFFFF\b\\u0001";
std::u32string dst = src;
assert(normalize_nfd_markers(dst, map, true)); // TODO-LDML: need regex flag
if (dst != expect) {
std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl;
std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl;
}
zassert_string_equal(dst, expect);
assert_equal(map.size(), 2);
assert_equal(map[0x0320], LDML_MARKER_ANY_INDEX);
assert_equal(map[MARKER_BEFORE_EOT], 0x1L);
}


return EXIT_SUCCESS;
Expand Down

0 comments on commit 02ba64d

Please sign in to comment.