Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(developer): escape bad markers 🙀 #10306

Merged
merged 6 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions common/web/types/src/ldml-keyboard/pattern-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,18 @@ export class MarkerParser {
/** Max count of markers */
public static readonly MAX_MARKER_COUNT = constants.marker_max_count;

/**
 * Format a number as a four-digit hexadecimal string, 0000 … FFFF.
 *
 * @param n value to format; expected to lie in [0x0000, 0xFFFF]
 * @returns n in base 16 (lowercase digits, as produced by toString(16)),
 *          left-padded with zeros and cut to the last four characters
 * @throws RangeError when n is below zero or above 0xFFFF
 */
private static hexQuad(n: number): string {
// NOTE(review): NaN and fractional values are not rejected by this range
// check (both comparisons are false for NaN), so they would be formatted
// as-is — confirm that callers only ever pass integers.
if (n < 0x000 || n > 0xFFFF) {
throw RangeError(`${n} not in [0x0000,0xFFFF]`);
}
// Prefix four zeros, then keep only the trailing four characters.
return (`0000` + (n).toString(16)).slice(-4);
srl295 marked this conversation as resolved.
Show resolved Hide resolved
}

private static anyMarkerMatch() : string {
const start = (`0000` + (this.MIN_MARKER_INDEX).toString(16)).slice(-4);
const end = (`0000` + (this.MAX_MARKER_INDEX).toString(16)).slice(-4);
return `${this.SENTINEL}${this.MARKER_CODE}[\\u${start}-\\u${end}]`;
const start = MarkerParser.hexQuad(this.MIN_MARKER_INDEX);
const end = MarkerParser.hexQuad(this.MAX_MARKER_INDEX);
return `${this.SENTINEL}${this.MARKER_CODE}[\\u${start}-\\u${end}]`; // TODO-LDML: #9121 wrong escape format
}

/** Expression that matches any marker */
Expand All @@ -91,12 +99,20 @@ export class MarkerParser {
return matchArray(str, this.REFERENCE);
}

/**
 * Render the index part of a marker.
 *
 * @param n marker index
 * @param forMatch when truthy, emit a textual `\uXXXX` escape (for use inside
 *                 a match pattern) instead of the raw character
 * @returns the escape text when forMatch is set, otherwise the single
 *          character whose code unit is n
 */
private static markerCodeToString(n: number, forMatch?: boolean): string {
  if (forMatch) {
    // Pattern form: backslash-u followed by four hex digits.
    return `\\u${MarkerParser.hexQuad(n)}`; // TODO-LDML: #9121 wrong escape format
  }
  // Plain form: the literal character itself.
  return String.fromCharCode(n);
}

/** @returns string for marker #n */
public static markerOutput(n: number): string {
public static markerOutput(n: number, forMatch?: boolean): string {
if (n < MarkerParser.MIN_MARKER_INDEX || n > MarkerParser.ANY_MARKER_INDEX) {
throw RangeError(`Internal Error: marker index out of range ${n}`);
}
return this.SENTINEL + this.MARKER_CODE + String.fromCharCode(n);
return this.SENTINEL + this.MARKER_CODE + this.markerCodeToString(n, forMatch);
}

/** @returns all marker strings as sentinel values */
Expand All @@ -118,7 +134,7 @@ export class MarkerParser {
} else if(order > MarkerParser.MAX_MARKER_INDEX) {
throw RangeError(`Internal Error: marker \\m{${arg}} has out of range index ${order}`);
} else {
return MarkerParser.markerOutput(order + 1);
return MarkerParser.markerOutput(order + 1, forMatch);
}
});
}
Expand Down
172 changes: 153 additions & 19 deletions core/src/ldml/ldml_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -518,8 +518,8 @@ transform_entry::init() {
}
// TODO-LDML: if we have mapFrom, may need to do other processing.
std::u16string patstr = km::core::kmx::u32string_to_u16string(fFrom);
// normalize, including markers
normalize_nfd_markers(patstr);
// normalize, including markers, for regex
normalize_nfd_markers(patstr, true);
UErrorCode status = U_ZERO_ERROR;
/* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length());
// add '$' to match to end
Expand Down Expand Up @@ -950,17 +950,17 @@ bool normalize_nfd(std::u16string &str) {
return normalize(nfd, str, status);
}

bool normalize_nfd_markers(std::u16string &str, marker_map &map) {
bool normalize_nfd_markers(std::u16string &str, marker_map &map, bool for_regex) {
srl295 marked this conversation as resolved.
Show resolved Hide resolved
std::u32string rstr = km::core::kmx::u16string_to_u32string(str);
if(!normalize_nfd_markers(rstr, map)) {
if(!normalize_nfd_markers(rstr, map, for_regex)) {
return false;
} else {
str = km::core::kmx::u32string_to_u16string(rstr);
return true;
}
}

void add_back_markers(std::u32string &str, const std::u32string &src, const marker_map &map) {
static void add_back_markers(std::u32string &str, const std::u32string &src, const marker_map &map, bool for_regex) {
// need to reconstitute.
marker_map map2(map); // make a copy of the map
// clear the string
Expand All @@ -970,7 +970,7 @@ void add_back_markers(std::u32string &str, const std::u32string &src, const mark
const auto ch = MARKER_BEFORE_EOT;
const auto m = map2.find(ch);
if (m != map2.end()) {
prepend_marker(str, m->second);
prepend_marker(str, m->second, for_regex);
map2.erase(ch); // remove it
}
}
Expand All @@ -981,7 +981,7 @@ void add_back_markers(std::u32string &str, const std::u32string &src, const mark

const auto m = map2.find(ch);
if (m != map2.end()) {
prepend_marker(str, m->second);
prepend_marker(str, m->second, for_regex);
map2.erase(ch); // remove it
}
}
Expand All @@ -992,9 +992,9 @@ void add_back_markers(std::u32string &str, const std::u32string &src, const mark
* - doesn't support >1 marker per char - may need a set instead of a map!
* - ideally this should be used on a normalization safe subsequence
*/
bool normalize_nfd_markers(std::u32string &str, marker_map &map) {
bool normalize_nfd_markers(std::u32string &str, marker_map &map, bool for_regex) {
/** original string, but no markers */
std::u32string str_unmarked = remove_markers(str, map);
std::u32string str_unmarked = remove_markers(str, map, for_regex);
/** original string, no markers, NFD */
std::u32string str_unmarked_nfd = str_unmarked;
if(!normalize_nfd(str_unmarked_nfd)) {
Expand All @@ -1006,14 +1006,14 @@ bool normalize_nfd_markers(std::u32string &str, marker_map &map) {
// Normalization produced no change when markers were removed.
// So, we'll call this a no-op.
} else {
add_back_markers(str, str_unmarked_nfd, map);
add_back_markers(str, str_unmarked_nfd, map, for_regex);
}
return true; // all OK
}

bool normalize_nfc_markers(std::u32string &str, marker_map &map) {
bool normalize_nfc_markers(std::u32string &str, marker_map &map, bool for_regex) {
/** original string, but no markers */
std::u32string str_unmarked = remove_markers(str, map);
std::u32string str_unmarked = remove_markers(str, map, for_regex);
/** original string, no markers, NFC */
std::u32string str_unmarked_nfc = str_unmarked;
if(!normalize_nfc(str_unmarked_nfc)) {
Expand All @@ -1025,7 +1025,7 @@ bool normalize_nfc_markers(std::u32string &str, marker_map &map) {
// Normalization produced no change when markers were removed.
// So, we'll call this a no-op.
} else {
add_back_markers(str, str_unmarked_nfc, map);
add_back_markers(str, str_unmarked_nfc, map, for_regex);
}
return true; // all OK
}
Expand All @@ -1048,7 +1048,48 @@ bool normalize_nfc(std::u16string &str) {
return normalize(nfc, str, status);
}

std::u32string remove_markers(const std::u32string &str, marker_map *markers) {
/**
 * Prepend the low 16 bits of a marker number to a string as exactly four
 * uppercase hexadecimal digits, most significant nibble first.
 *
 * @param str     string to modify; the four digits are inserted at index 0
 * @param marker  value to format; only the four lowest nibbles are used
 */
void
prepend_hex_quad(std::u32string &str, KMX_DWORD marker) {
  constexpr char32_t hex_digits[] = U"0123456789ABCDEF";
  char32_t quad[4];
  // Fill right-to-left so the least significant nibble ends up last.
  for (int pos = 3; pos >= 0; --pos) {
    quad[pos] = hex_digits[marker & 0xF];
    marker >>= 4;
  }
  str.insert(0, quad, 4);  // one insertion of all four digits at the front
}

/**
 * Numeric value of a single hexadecimal digit.
 *
 * @param ch  code point to interpret
 * @return 0..15 for '0'-'9', 'a'-'f' or 'A'-'F'; -1 for anything else
 */
inline int xdigitval(km_core_usv ch) {
  if (U'0' <= ch && ch <= U'9') {
    return static_cast<int>(ch - U'0');
  }
  if (U'a' <= ch && ch <= U'f') {
    return static_cast<int>(ch - U'a') + 0xA;
  }
  if (U'A' <= ch && ch <= U'F') {
    return static_cast<int>(ch - U'A') + 0xA;
  }
  return -1;  // not a hex digit
}

/**
 * Parse exactly four hexadecimal digits into a number.
 *
 * @param hex_str  array holding at least four code points; only the first
 *                 four are read
 * @return the parsed value (first digit is the most significant nibble), or
 *         0 as soon as any of the four is not a hex digit. Note that "0000"
 *         also yields 0, so 0 cannot be told apart from a parse failure.
 */
KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]) {
  KMX_DWORD value = 0;
  for (const km_core_usv *p = hex_str; p != hex_str + 4; ++p) {
    const int digit = xdigitval(*p);
    if (digit == -1) {
      return 0;  // invalid digit: give up
    }
    // make room for the next nibble, then merge it in
    value = (value << 4) | digit;
  }
  return value;
}

std::u32string remove_markers(const std::u32string &str, marker_map *markers, bool for_regex) {
std::u32string out;
auto i = str.begin();
auto last = i;
Expand All @@ -1074,14 +1115,107 @@ std::u32string remove_markers(const std::u32string &str, marker_map *markers) {
break; // hit end
}

// #3 marker number
const KMX_DWORD marker_no = *i;
assert(marker_no >= LDML_MARKER_MIN_INDEX && marker_no <= LDML_MARKER_MAX_INDEX);
i++; // if end, we'll break out of the loop
KMX_DWORD marker_no;
if (!for_regex) {
// #3 marker number
marker_no = *i;
i++; // if end, we'll break out of the loop
} else {
// is it an escape or a range?
if (*i == U'\\') {
if (++i == str.end()) {
break;
}
assert(*i == U'u');
if (++i == str.end()) {
break;
}
km_core_usv markno[4];

markno[0] = *(i++);
if (i == str.end()) {
break;
}
markno[1] = *(i++);
if (i == str.end()) {
break;
}
markno[2] = *(i++);
if (i == str.end()) {
break;
}
markno[3] = *(i++);
marker_no = parse_hex_quad(markno);
assert (marker_no != 0); // illegal marker number
} else if (*i == U'[') {
if (++i == str.end()) {
break;
}
assert(*i == U'\\');
if (++i == str.end()) {
break;
}
assert(*i == U'u');
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(*i == U'-');
if (++i == str.end()) {
break;
}
assert(*i == U'\\');
if (++i == str.end()) {
break;
}
assert(*i == U'u');
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(xdigitval(*i) != -1);
if (++i == str.end()) {
break;
}
assert(*i == U']');
i++;
marker_no = LDML_MARKER_ANY_INDEX;
} else {
assert(*i == U'\\' || *i == U'['); // error.
marker_no = 0; // error, don't record
}
}
assert(marker_no >= LDML_MARKER_MIN_INDEX && marker_no <= LDML_MARKER_ANY_INDEX);
last = i;

// record the marker
if (markers != nullptr) {
if (marker_no >= LDML_MARKER_MIN_INDEX && markers != nullptr) {
if (i == str.end()) {
markers->emplace(MARKER_BEFORE_EOT, marker_no);
} else {
Expand Down
Loading