Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(core): initial normalization 🙀 #9728

Merged
merged 23 commits into from
Oct 20, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
dcb6796
chore(core): Some failing normalization transform tests 🙀
srl295 Oct 9, 2023
8c4642a
Merge branch 'master' into feat/core/9468-normalization-epic-ldml
srl295 Oct 10, 2023
15e6fb6
chore(core): dx: improve test output 🙀
srl295 Oct 10, 2023
daa0523
feat(core): ldml normalization 🙀
srl295 Oct 11, 2023
522d2ef
feat(core): ldml normalization 🙀
srl295 Oct 12, 2023
e64f596
feat(core): ldml normalization 🙀
srl295 Oct 12, 2023
736af4b
feat(core): ldml normalization 🙀
srl295 Oct 12, 2023
a577bd2
feat(core): ldml normalization 🙀
srl295 Oct 12, 2023
84b4836
Merge branch 'master' into feat/core/9468-normalization-epic-ldml
srl295 Oct 12, 2023
b3de1ff
feat(core): ldml normalization 🙀
srl295 Oct 12, 2023
95ad1c4
feat(core): ldml normalization 🙀
srl295 Oct 12, 2023
bf15feb
feat(core): ldml normalization 🙀
srl295 Oct 13, 2023
79f37f7
Merge branch 'master' into feat/core/9468-normalization-epic-ldml
srl295 Oct 16, 2023
9ee183c
fix(core): ldml update to normalization per review 🙀
srl295 Oct 16, 2023
93e1c18
fix(core): ldml update to normalization per review 🙀
srl295 Oct 17, 2023
9dbf70c
fix(core): ldml more updates to normalization per review 🙀
srl295 Oct 18, 2023
f1b12a1
fix(core): ldml more updates to normalization per review 🙀
srl295 Oct 18, 2023
c4c7361
Merge branch 'master' into feat/core/9468-normalization-epic-ldml
srl295 Oct 18, 2023
3723e6b
fix(core): ldml more updates to normalization per review 🙀
srl295 Oct 18, 2023
4afe038
fix(core): ldml more updates to normalization per review 🙀
srl295 Oct 19, 2023
1d9820b
fix(core): ldml more updates to normalization per review 🙀
srl295 Oct 19, 2023
9292410
Merge branch 'master' into feat/core/9468-normalization-epic-ldml
srl295 Oct 19, 2023
4cdc2c3
fix(core): ldml more updates to normalization per review 🙀
srl295 Oct 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 107 additions & 79 deletions core/src/ldml/ldml_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/

#include <fstream>
#include <algorithm>
#include "ldml/ldml_processor.hpp"
#include "state.hpp"
#include "kmx_file.h"
Expand Down Expand Up @@ -236,88 +237,16 @@ ldml_processor::process_event(
// all other VKs
{
// Look up the key
const std::u16string str = keys.lookup(vk, modifier_state);
const std::u16string key_str = keys.lookup(vk, modifier_state);

if (str.empty()) {
if (key_str.empty()) {
// no key was found, so pass the keystroke on to the Engine
state->actions().push_invalidate_context();
state->actions().push_emit_keystroke();
break; // ----- commit and exit
}

// found a string - push it into the context and actions
// we convert it here instead of using the emit_text() overload
// so that we don't have to reconvert it inside the transform code.
const std::u32string str32 = kmx::u16string_to_u32string(str);

if (!transforms) {
// No transforms: just emit the string.
emit_text(state, str32);
} else {
// Process transforms here
/**
* a copy of the current/changed context, for transform use.
*
*/
std::u32string ctxtstr;
(void)context_to_string(state, ctxtstr);
// add the newly added key output to ctxtstr
ctxtstr.append(str32);

/** the output buffer for transforms */
std::u32string outputString;

// apply the transform, get how much matched (at the end)
const size_t matchedContext = transforms->apply(ctxtstr, outputString);

if (matchedContext == 0) {
// No match, just emit the original string
emit_text(state, str32);
} else {
// We have a match.

ctxtstr.resize(ctxtstr.length() - str32.length());
/** how many chars of the context we need to clear */
auto charsToDelete = matchedContext - str32.length(); /* we don't need to clear the output of the current key */

/** how many context items need to be removed */
size_t contextRemoved = 0;
for (auto c = state->context().rbegin(); charsToDelete > 0 && c != state->context().rend(); c++, contextRemoved++) {
/** last char of context */
km_core_usv lastCtx = ctxtstr.back();
uint8_t type = c->type;
assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER);
if (type == KM_CORE_BT_CHAR) {
// single char, drop it
charsToDelete--;
assert(c->character == lastCtx);
ctxtstr.pop_back();
state->actions().push_backspace(KM_CORE_BT_CHAR, lastCtx); // Cause prior char to be removed
} else if (type == KM_CORE_BT_MARKER) {
// it's a marker, 'worth' 3 uchars
assert(charsToDelete >= 3);
assert(lastCtx == c->marker); // end of list
charsToDelete -= 3;
// pop off the three-part sentinel string
ctxtstr.pop_back();
ctxtstr.pop_back();
ctxtstr.pop_back();
// push a special backspace to delete the marker
state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker);
}
}
// now, pop the right number of context items
for (size_t i = 0; i < contextRemoved; i++) {
// we don't pop during the above loop because the iterator gets confused
state->context().pop_back();
}
// Now, add in the updated text. This will convert UC_SENTINEL, etc back to marker actions.
emit_text(state, outputString);
// If we needed it further. we could update ctxtstr here:
// ctxtstr.append(outputString);
// ... but it is no longer needed at this point.
} // end of transform match
} // end of processing transforms
process_key_string(state, key_str);
} // end of processing a 'normal' vk
} // end of switch
// end of normal processing: commit and exit
Expand All @@ -330,6 +259,104 @@ ldml_processor::process_event(
return KM_CORE_STATUS_OK;
}

void
ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const {
UErrorCode status = U_ZERO_ERROR;
// We know that key_str is not empty per the caller.
mcdurdin marked this conversation as resolved.
Show resolved Hide resolved

// we convert the keys str to UTF-32 here instead of using the emit_text() overload
// so that we don't have to reconvert it inside the transform code.
std::u32string key_str32 = kmx::u16string_to_u32string(key_str);
// normalize the keystroke to NFD
ldml::normalize_nfd(key_str32, status);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ldml::normalize_nfd(key_str32, status);
ldml::normalize_nfd(key_str32, status);
assert(U_SUCCESS(status));


// extract context string, in NFC
std::u32string old_ctxtstr_nfc;
(void)context_to_string(state, old_ctxtstr_nfc, false);
ldml::normalize_nfc(old_ctxtstr_nfc, status);
assert(U_SUCCESS(status));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to return if !U_SUCCESS(status) when in release builds, because old_ctxtstr_nfc will be in an uncertain state?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It'll be in an unchanged state but yes.


// context string in NFD
std::u32string ctxtstr;
(void)context_to_string(state, ctxtstr, true); // with markers
// add the newly added key output to ctxtstr
ctxtstr.append(key_str32);
ldml::normalize_nfd(ctxtstr, status);
assert(U_SUCCESS(status));

/** transform output string */
std::u32string outputString;
/** how many chars of the ctxtstr to replace */
size_t matchedContext = 0; // zero if no transforms

// begin modifications to the string

if(transforms) {
matchedContext = transforms->apply(ctxtstr, outputString);
} else {
// no transforms, no output
}

// drop last 'matchedContext':
ctxtstr.resize(ctxtstr.length() - matchedContext);
ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here.
ldml::normalize_nfd(ctxtstr, status);
assert(U_SUCCESS(status));

// Ok. We've done all the happy manipulations.

/** NFC and no markers */
std::u32string ctxtstr_cleanedup = ctxtstr;
// TODO-LDML: remove markers!
ldml::normalize_nfc(ctxtstr_cleanedup, status);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ldml::normalize_nfc(ctxtstr_cleanedup, status);
ldml::normalize_nfc(ctxtstr_cleanedup, status);
assert(U_SUCCESS(status));

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see a pattern emerging with these assertions which we should potentially cleanup, e.g.

if(!ldml::normalize_nfc_asserted(ctxtstr_cleanedup)) {
  return;
}

and something like:

bool ldml::normalize_nfc_asserted(std::u32string &str) {
  UErrorCode status = U_ZERO_ERROR;  
  ldml::normalize_nfc(str, status);
  assert(U_SUCCESS(status));
  return U_SUCCESS(status);
}

The function name I've come up with could be better!

The fact that two assertions are missing in your PR highlights the need for the refactor 😁

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense- thanks


// find common prefix
auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end());
/** the part of the old str that changed */
std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfc.end());
std::u32string new_ctxtstr_changed(ctxt_prefix.second,ctxtstr_cleanedup.end());

// drop the old suffix. Note: this mutates old_ctxtstr_changed.
remove_text(state, old_ctxtstr_changed, old_ctxtstr_changed.length());
assert(old_ctxtstr_changed.length() == 0);
emit_text(state, new_ctxtstr_changed);
}

void
ldml_processor::remove_text(km_core_state *state, std::u32string &str, size_t length) {
Copy link
Contributor

@rc-swag rc-swag Oct 13, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having this remove_text is good because with the planned action object change, remove text will be able to more efficiently update the action object.

/** how many context items need to be removed */
size_t contextRemoved = 0;
for (auto c = state->context().rbegin(); length > 0 && c != state->context().rend(); c++, contextRemoved++) {
/** last char of context */
km_core_usv lastCtx = str.back();
uint8_t type = c->type;
assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER);
if (type == KM_CORE_BT_CHAR) {
// single char, drop it
length--;
assert(c->character == lastCtx);
str.pop_back();
state->actions().push_backspace(KM_CORE_BT_CHAR, c->character); // Cause prior char to be removed
} else if (type == KM_CORE_BT_MARKER) {
// it's a marker, 'worth' 3 uchars
assert(length >= 3);
assert(lastCtx == c->marker); // end of list
length -= 3;
// pop off the three-part sentinel string
str.pop_back();
str.pop_back();
str.pop_back();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should consider asserting the values of the UC_SENTINEL and CODE_DEADKEY?

// push a special backspace to delete the marker
state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker);
}
}
// now, pop the right number of context items
for (size_t i = 0; i < contextRemoved; i++) {
// we don't pop during the above loop because the iterator gets confused
state->context().pop_back();
}
}

km_core_attr const & ldml_processor::attributes() const {
return engine_attrs;
}
Expand Down Expand Up @@ -395,25 +422,26 @@ ldml_processor::emit_marker(km_core_state *state, KMX_DWORD marker_no) {
}

size_t
ldml_processor::context_to_string(km_core_state *state, std::u32string &str) {
ldml_processor::context_to_string(km_core_state *state, std::u32string &str, bool include_markers) {
str.clear();
auto &cp = state->context();
size_t ctxlen = 0; // TODO-LDML: is this needed?
size_t ctxlen = 0; // TODO-LDML: not used by callers?
uint8_t last_type = KM_CORE_BT_UNKNOWN;
for (auto c = cp.rbegin(); c != cp.rend(); c++, ctxlen++) {
last_type = c->type;
if (last_type == KM_CORE_BT_CHAR) {
str.insert(0, 1, c->character);
} else if (last_type == KM_CORE_BT_MARKER) {
assert(km::kbp::kmx::is_valid_marker(c->marker));
prepend_marker(str, c->marker);
if (include_markers) {
prepend_marker(str, c->marker);
}
} else {
break;
}
}
return ctxlen; // consumed the entire context buffer.
}


} // namespace kbp
} // namespace km
11 changes: 10 additions & 1 deletion core/src/ldml/ldml_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,23 @@ namespace kbp {
static void emit_text(km_core_state *state, km_core_usv ch);
/** emit a marker */
static void emit_marker(km_core_state *state, KMX_DWORD marker);
/**
* Delete text from the state.
* @param str string with text to remove, from the end
* @param length number of chars from the end of str to drop
*/
static void remove_text(km_core_state *state, std::u32string &str, size_t length);

/** process a typed key */
void process_key_string(km_core_state *state, const std::u16string &key_str) const;

/**
* add the string+marker portion of the context to the beginning of str.
* Stop when a non-string and non-marker is hit.
* Convert markers into the UC_SENTINEL format.
* @return the number of context items consumed
*/
static size_t context_to_string(km_core_state *state, std::u32string &str);
static size_t context_to_string(km_core_state *state, std::u32string &str, bool include_markers = true);

/** prepend the marker string in UC_SENTINEL format to the str */
inline static void prepend_marker(std::u32string &str, KMX_DWORD marker);
Expand Down
82 changes: 78 additions & 4 deletions core/src/ldml/ldml_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,9 +462,13 @@ transform_entry::init() {
// TODO-LDML: if we have mapFrom, may need to do other processing.
const std::u16string patstr = km::kbp::kmx::u32string_to_u16string(fFrom);
UErrorCode status = U_ZERO_ERROR;
/* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length());
/* const */ icu::UnicodeString patustr_raw = icu::UnicodeString(patstr.data(), (int32_t)patstr.length());
// add '$' to match to end
patustr.append(u'$');
patustr_raw.append(u'$');
icu::UnicodeString patustr;
const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
// NFD normalize on pattern creation
nfd->normalize(patustr_raw, patustr, status);
fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status));
mcdurdin marked this conversation as resolved.
Show resolved Hide resolved
assert(U_SUCCESS(status)); // TODO-LDML: may be best to propagate status up ^^
}
Expand Down Expand Up @@ -545,12 +549,21 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
// and we return to the regular code flow.
}
const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
icu::UnicodeString rustr2;
nfd->normalize(rustr, rustr2, status);
assert(U_SUCCESS(status));
// here we replace the match output.
icu::UnicodeString entireOutput = matcher->replaceFirst(rustr, status);
icu::UnicodeString entireOutput = matcher->replaceFirst(rustr2, status);
assert(U_SUCCESS(status)); // TODO-LDML: could fail here due to bad input (syntax err)

// entireOutput includes all of 'input', but modified. Need to substring it.
icu::UnicodeString outu = entireOutput.tempSubString(matchStart);
icu::UnicodeString outu_raw = entireOutput.tempSubString(matchStart);

// normalize the replaced string
icu::UnicodeString outu;
nfd->normalize(outu_raw, outu, status);
assert(U_SUCCESS(status));

// Special case if there's no output, save some allocs
if (outu.length() == 0) {
Expand Down Expand Up @@ -831,6 +844,67 @@ transforms::load(
return transforms;
}

// string

// TODO-LDML: copypasta -> refactor
std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status) {
std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str);

normalize_nfd(rstr, status);
if (U_SUCCESS(status)) {
// if failure, leave it alone
str = km::kbp::kmx::u16string_to_u32string(rstr);
}
return str;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'd prefer the failure cases in each function to be tested and return str just like the following functions do, for consistency.

Also, is there a good reason to return str as well as modify the input parameter? This seems likely to lead to unintentional wrong uses of the functions. Why not return bool success value instead (as suggested earlier in my review)?

}

// TODO-LDML: copypasta -> refactor
std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status) {
const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
if (U_FAILURE(status)) {
return str;
}
icu::UnicodeString dest;
icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length());
nfd->normalize(src, dest, status);
if (U_FAILURE(status)) {
return str;
}
str.assign(dest.getBuffer(), dest.length());
return str;
}

// TODO-LDML: copypasta -> refactor
std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status) {
std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str);

normalize_nfc(rstr, status);
if (U_SUCCESS(status)) {
// if failure, leave it alone
str = km::kbp::kmx::u16string_to_u32string(rstr);
}
return str;
}

// TODO-LDML: copypasta -> refactor
std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status) {
const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status);
if (U_FAILURE(status)) {
return str;
}
icu::UnicodeString dest;
icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length());
nfc->normalize(src, dest, status);
if (U_FAILURE(status)) {
return str;
}
str.assign(dest.getBuffer(), dest.length());
return str;
}




} // namespace ldml
} // namespace kbp
} // namespace km
12 changes: 12 additions & 0 deletions core/src/ldml/ldml_transforms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "unicode/unistr.h"
#include "unicode/regex.h"
#include "unicode/utext.h"
#include "unicode/normalizer2.h"

namespace km {
namespace kbp {
Expand Down Expand Up @@ -270,6 +271,17 @@ class transforms {
const kbp::kmx::COMP_KMXPLUS_TRAN_Helper &tranHelper);
};

// string routines

/** Normalize a u32string inplace. Returns a reference to the same string. */
std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status);
/** Normalize a u16string inplace. Returns a reference to the same string. */
std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status);
/** Normalize a u32string inplace. Returns a reference to the same string. */
std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status);
/** Normalize a u16string inplace. Returns a reference to the same string. */
std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status);
srl295 marked this conversation as resolved.
Show resolved Hide resolved

} // namespace ldml
} // namespace kbp
} // namespace km
Loading