Skip to content

Commit

Permalink
feat(core): ldml normalization 🙀
Browse files Browse the repository at this point in the history
- more progress in the normalization pipeline. Trying to keep it from leaking.
- normalize output to NFC.
- normalize json test data to NFC (both context and expected).
- we do NOT try to normalize the 'embedded' strings currently.

For: #9468
  • Loading branch information
srl295 committed Oct 12, 2023
1 parent 736af4b commit 426668b
Show file tree
Hide file tree
Showing 6 changed files with 182 additions and 80 deletions.
192 changes: 115 additions & 77 deletions core/src/ldml/ldml_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/

#include <fstream>
#include <algorithm>
#include "ldml/ldml_processor.hpp"
#include "state.hpp"
#include "kmx_file.h"
Expand Down Expand Up @@ -260,81 +261,117 @@ ldml_processor::process_event(

/**
 * Process the string produced by a key: convert it to UTF-32, combine it with
 * the current context, run `transforms` over it when present, and then update
 * the state through remove_text() / emit_text().
 *
 * NOTE(review): the lines below appear to interleave the removed and the added
 * side of a diff hunk (for example `ctxtstr` and `outputString` are each
 * declared twice, and both `if (!transforms)` and `if(transforms)` openers are
 * present), so the body is not compilable exactly as shown — confirm against
 * the real file before relying on any single statement.
 *
 * @param state   state whose context and actions are read and updated
 * @param key_str UTF-16 output of the key; the caller guarantees it is not empty
 */
void
ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const {
// found a string - push it into the context and actions
// we convert it here instead of using the emit_text() overload
// so that we don't have to reconvert it inside the transform code.
std::u32string str32 = kmx::u16string_to_u32string(key_str);
UErrorCode status = U_ZERO_ERROR;
// We know that key_str is not empty per the caller.

if (!transforms) {
// No transforms: just emit the string.
emit_text(state, str32);
// we convert the keys str to UTF-32 here instead of using the emit_text() overload
// so that we don't have to reconvert it inside the transform code.
std::u32string key_str32 = kmx::u16string_to_u32string(key_str);
// normalize the keystroke to NFD
ldml::normalize_nfd(key_str32, status);

// extract context string, in NFC
std::u32string old_ctxtstr_nfc;
(void)context_to_string(state, old_ctxtstr_nfc, false);
ldml::normalize_nfc(old_ctxtstr_nfc, status);
assert(U_SUCCESS(status));

// context string in NFD
std::u32string ctxtstr;
(void)context_to_string(state, ctxtstr, true); // with markers
// add the newly added key output to ctxtstr
ctxtstr.append(key_str32);
ldml::normalize_nfd(ctxtstr, status);
assert(U_SUCCESS(status));

/** transform output string */
std::u32string outputString;
/** how many chars of the ctxtstr to replace */
size_t matchedContext = 0; // zero if no transforms

// begin modifications to the string

if(transforms) {
matchedContext = transforms->apply(ctxtstr, outputString);
} else {
// Process transforms here
/**
* a copy of the current/changed context, for transform use.
*
*/
std::u32string ctxtstr;
(void)context_to_string(state, ctxtstr);
// add the newly added key output to ctxtstr
ctxtstr.append(str32);
// and normalize

/** the output buffer for transforms */
std::u32string outputString;

// apply the transform, get how much matched (at the end)
const size_t matchedContext = transforms->apply(ctxtstr, outputString);


if (matchedContext == 0) {
// No match, just emit the original string
emit_text(state, str32);
} else {
// We have a match.

ctxtstr.resize(ctxtstr.length() - str32.length());
/** how many chars of the context we need to clear */
auto charsToDelete = matchedContext - str32.length(); /* we don't need to clear the output of the current key */

/** how many context items need to be removed */
size_t contextRemoved = 0;
for (auto c = state->context().rbegin(); charsToDelete > 0 && c != state->context().rend(); c++, contextRemoved++) {
/** last char of context */
km_core_usv lastCtx = ctxtstr.back();
uint8_t type = c->type;
assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER);
if (type == KM_CORE_BT_CHAR) {
// single char, drop it
charsToDelete--;
assert(c->character == lastCtx);
ctxtstr.pop_back();
state->actions().push_backspace(KM_CORE_BT_CHAR, c->character); // Cause prior char to be removed
} else if (type == KM_CORE_BT_MARKER) {
// it's a marker, 'worth' 3 uchars
assert(charsToDelete >= 3);
assert(lastCtx == c->marker); // end of list
charsToDelete -= 3;
// pop off the three-part sentinel string
ctxtstr.pop_back();
ctxtstr.pop_back();
ctxtstr.pop_back();
// push a special backspace to delete the marker
state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker);
}
}
// now, pop the right number of context items
for (size_t i = 0; i < contextRemoved; i++) {
// we don't pop during the above loop because the iterator gets confused
state->context().pop_back();
}
// Now, add in the updated text. This will convert UC_SENTINEL, etc back to marker actions.
emit_text(state, outputString);
// If we needed it further. we could update ctxtstr here:
// ctxtstr.append(outputString);
// ... but it is no longer needed at this point.
} // end of transform match
} // end of processing transforms
// no transforms, no output
}

// drop last 'matchedContext':
ctxtstr.resize(ctxtstr.length() - matchedContext);
ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here.
ldml::normalize_nfd(ctxtstr, status);
assert(U_SUCCESS(status));

// Ok. We've done all the happy manipulations.

/** NFC and no markers */
std::u32string ctxtstr_cleanedup = ctxtstr;
// TODO-LDML: remove markers!
// NOTE(review): status is not checked after this normalize_nfc() call.
ldml::normalize_nfc(ctxtstr_cleanedup, status);

// NOTE(review): the return value of find_first_not_of() is discarded, so this
// statement has no visible effect; the common prefix is found by mismatch() below.
ctxtstr_cleanedup.find_first_not_of(old_ctxtstr_nfc);

// find common prefix
auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end());
/** the part of the old str that changed */
std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfc.end());
/** the part of the new str that differs from the old one */
std::u32string new_ctxtstr_changed(ctxt_prefix.second,ctxtstr_cleanedup.end());

// drop the old suffix. Note: this mutates old_ctxtstr_changed.
remove_text(state, old_ctxtstr_changed, old_ctxtstr_changed.length());
assert(old_ctxtstr_changed.length() == 0);
emit_text(state, new_ctxtstr_changed);

// OLD HAPPY PATH:
// if (matchedContext == 0) {
// // No match, just emit the original string
// emit_text(state, key_str32);
// } else {
// // We have a match.
// ctxtstr.resize(ctxtstr.length() - key_str32.length());
// /** how many chars of the context we need to clear */
// size_t charsToDelete = matchedContext - key_str32.length(); /* we don't need to clear the output of the current key */

// remove_text(state, ctxtstr, charsToDelete);
// // Now, add in the updated text. This will convert UC_SENTINEL, etc back to marker actions.
// }
// emit_text(state, outputString);
}

/**
 * Delete text from the end of the state's context, pushing one backspace
 * action for every context item that is removed.
 *
 * The context is walked from its last item backwards until `length` units have
 * been consumed or the context runs out. A character item consumes one unit; a
 * marker item consumes three, matching the three-part sentinel string that
 * stands for a marker inside `str`.
 *
 * @param state  state whose context and actions are updated
 * @param str    string with the text to remove; consumed units are popped from its end
 * @param length number of units to drop from the end of str
 */
void
ldml_processor::remove_text(km_core_state *state, std::u32string &str, size_t length) {
  /** number of units in str that represent a single marker */
  const size_t marker_units = 3;
  /** how many context items need to be removed */
  size_t contextRemoved = 0;
  for (auto c = state->context().rbegin(); length > 0 && c != state->context().rend(); ++c, ++contextRemoved) {
    const uint8_t type = c->type;
    assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER);
    if (type == KM_CORE_BT_CHAR) {
      // single char, drop it; str is expected to end with that same char
      assert(!str.empty() && c->character == static_cast<km_core_usv>(str.back()));
      length--;
      if (!str.empty()) {
        // guarded: pop_back() on an empty string is undefined behaviour
        str.pop_back();
      }
      state->actions().push_backspace(KM_CORE_BT_CHAR, c->character);  // Cause prior char to be removed
    } else if (type == KM_CORE_BT_MARKER) {
      // it's a marker, 'worth' 3 uchars; the last of them is the marker number
      assert(length >= marker_units);
      assert(str.length() >= marker_units && static_cast<km_core_usv>(str.back()) == c->marker);
      // clamp instead of letting the unsigned counter wrap around when asserts are compiled out
      length = (length >= marker_units) ? (length - marker_units) : 0;
      // pop off the three-part sentinel string (or whatever is left of it)
      for (size_t i = 0; i < marker_units && !str.empty(); i++) {
        str.pop_back();
      }
      // push a special backspace to delete the marker
      state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker);
    }
  }
  // now, pop the right number of context items; this is not done inside the
  // loop above because popping would disturb the reverse iterator
  for (size_t i = 0; i < contextRemoved; i++) {
    state->context().pop_back();
  }
}

km_core_attr const & ldml_processor::attributes() const {
Expand Down Expand Up @@ -402,25 +439,26 @@ ldml_processor::emit_marker(km_core_state *state, KMX_DWORD marker_no) {
}

/**
 * Rebuild the text of the state's context into `str`.
 *
 * `str` is cleared first, then the context is walked from its last item
 * backwards: character items are inserted at the front of `str`, marker items
 * are prepended through prepend_marker(), and the walk stops at the first item
 * that is neither a character nor a marker.
 *
 * NOTE(review): this listing appears to interleave the removed and added sides
 * of a diff (two signature lines, two `ctxlen` declarations, and both an
 * unconditional and an `include_markers`-guarded prepend_marker() call) —
 * confirm against the real file which lines are live.
 *
 * @param state           state whose context is read
 * @param str             output string; any previous content is discarded
 * @param include_markers when true, marker items are written into `str`
 * @return the number of context items visited by the loop
 */
size_t
ldml_processor::context_to_string(km_core_state *state, std::u32string &str) {
ldml_processor::context_to_string(km_core_state *state, std::u32string &str, bool include_markers) {
str.clear();
auto &cp = state->context();
size_t ctxlen = 0; // TODO-LDML: is this needed?
size_t ctxlen = 0; // TODO-LDML: not used by callers?
uint8_t last_type = KM_CORE_BT_UNKNOWN;
// walk backwards so that the most recent context item is handled first
for (auto c = cp.rbegin(); c != cp.rend(); c++, ctxlen++) {
last_type = c->type;
if (last_type == KM_CORE_BT_CHAR) {
// prepend, because the context is being read back to front
str.insert(0, 1, c->character);
} else if (last_type == KM_CORE_BT_MARKER) {
assert(km::kbp::kmx::is_valid_marker(c->marker));
prepend_marker(str, c->marker);
if (include_markers) {
prepend_marker(str, c->marker);
}
} else {
// any other item type ends the walk
break;
}
}
return ctxlen; // consumed the entire context buffer.
}


} // namespace kbp
} // namespace km
8 changes: 7 additions & 1 deletion core/src/ldml/ldml_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ namespace kbp {
static void emit_text(km_core_state *state, km_core_usv ch);
/** emit a marker */
static void emit_marker(km_core_state *state, KMX_DWORD marker);
/**
* Delete text from the state.
* @param str string with text to remove, from the end
* @param length number of chars from the end of str to drop
*/
static void remove_text(km_core_state *state, std::u32string &str, size_t length);

/** process a typed key */
void process_key_string(km_core_state *state, const std::u16string &key_str) const;
Expand All @@ -103,7 +109,7 @@ namespace kbp {
* Convert markers into the UC_SENTINEL format.
* @return the number of context items consumed
*/
static size_t context_to_string(km_core_state *state, std::u32string &str);
static size_t context_to_string(km_core_state *state, std::u32string &str, bool include_markers = true);

/** prepend the marker string in UC_SENTINEL format to the str */
inline static void prepend_marker(std::u32string &str, KMX_DWORD marker);
Expand Down
47 changes: 47 additions & 0 deletions core/src/ldml/ldml_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -846,6 +846,7 @@ transforms::load(

// string

// TODO-LDML: copypasta -> refactor
std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status) {
const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
if (U_FAILURE(status)) {
Expand Down Expand Up @@ -873,6 +874,7 @@ std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status) {
return str;
}

// TODO-LDML: copypasta -> refactor
std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status) {
const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
if (U_FAILURE(status)) {
Expand All @@ -888,6 +890,51 @@ std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status) {
return str;
}

// TODO-LDML: copypasta -> refactor
std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status) {
const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status);
if (U_FAILURE(status)) {
return str;
}
icu::UnicodeString dest;
const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str);
icu::UnicodeString src = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
nfc->normalize(src, dest, status);
if (U_FAILURE(status)) {
return str;
}

UErrorCode preflightStatus = U_ZERO_ERROR;
// calculate how big the buffer is
auto out32len = dest.toUTF32(nullptr, 0, preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes
// allocate
char32_t *s = new char32_t[out32len + 1];
assert(s != nullptr);
// convert
dest.toUTF32((UChar32 *)s, out32len + 1, status);
assert(U_SUCCESS(status));
str.assign(s, out32len);
delete [] s;
return str;
}

// TODO-LDML: copypasta -> refactor
std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status) {
const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status);
if (U_FAILURE(status)) {
return str;
}
icu::UnicodeString dest;
icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length());
nfc->normalize(src, dest, status);
if (U_FAILURE(status)) {
return str;
}
str.assign(dest.getBuffer(), dest.length());
return str;
}




} // namespace ldml
Expand Down
4 changes: 4 additions & 0 deletions core/src/ldml/ldml_transforms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ class transforms {
std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status);
/** Normalize a u16string inplace. Returns a reference to the same string. */
std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status);
/** Normalize a u32string to NFC in place. Returns a reference to the same string. */
std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status);
/** Normalize a u16string to NFC in place. Returns a reference to the same string. */
std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status);

} // namespace ldml
} // namespace kbp
Expand Down
2 changes: 1 addition & 1 deletion core/tests/unit/ldml/keyboards/k_003_transform.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboards.md#element-transform
@@keys: [K_Q][K_U][K_BKQUOTE][K_E]
@@expected: que\u0302
@@expected: qu\u00EA
-->
<!DOCTYPE keyboard3 SYSTEM "../../../../../resources/standards-data/ldml-keyboards/techpreview/dtd/ldmlKeyboard3.dtd">
Expand Down
9 changes: 8 additions & 1 deletion core/tests/unit/ldml/ldml_test_source.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <kmx/kmx_plus.h>
#include "ldml/keyboardprocessor_ldml.h"
#include "ldml/ldml_processor.hpp"
#include "ldml/ldml_transforms.hpp"

#include "path.hpp"
#include "state.hpp"
Expand Down Expand Up @@ -467,6 +468,9 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) {
if (as_check.is_string()) {
fillin.type = LDML_ACTION_CHECK_EXPECTED;
fillin.string = LdmlTestSource::parse_u8_source_string(as_check.get<std::string>());
UErrorCode status = U_ZERO_ERROR;
km::kbp::ldml::normalize_nfc(fillin.string, status);
assert(U_SUCCESS(status));
return;
}

Expand All @@ -485,6 +489,8 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) {
if (as_emit.is_string()) {
fillin.type = LDML_ACTION_EMIT_STRING;
fillin.string = LdmlTestSource::parse_u8_source_string(as_emit.get<std::string>());
UErrorCode status = U_ZERO_ERROR;
km::kbp::ldml::normalize_nfc(fillin.string, status);
return;
}

Expand All @@ -502,7 +508,8 @@ int LdmlJsonTestSource::load(const nlohmann::json &data) {
this->data = data; // TODO-LDML
auto startContext = data["/startContext/to"_json_pointer];
context = LdmlTestSource::parse_u8_source_string(startContext);

UErrorCode status = U_ZERO_ERROR;
km::kbp::ldml::normalize_nfc(context, status);
return 0;
}

Expand Down

0 comments on commit 426668b

Please sign in to comment.