diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index f4bb2648f1e..ff2fc60b731 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -196,42 +196,7 @@ ldml_processor::process_event( switch (vk) { // Special handling for backspace VK case KM_CORE_VKEY_BKSP: - { - if (!!bksp_transforms) { - // TODO-LDML: process bksp - // std::u16string outputString; - // // don't bother if no backspace transforms! - // // TODO-LDML: unroll ctxt into a str - // std::u16string ctxtstr; - // for (size_t i = 0; i < ctxt.size(); i++) { - // ctxtstr.append(ctxt[i]); - // } - // const size_t matchedContext = transforms->apply(ctxtstr, outputString); - } - KMX_DWORD last_char = 0UL; - // attempt to get the last char - auto end = state->context().rbegin(); - if(end != state->context().rend()) { - if((*end).type == KM_CORE_CT_CHAR) { - last_char = (*end).character; - // TODO-LDML: markers! - } - } - if (last_char == 0UL) { - /* - We couldn't find a character at end of context (context is empty), - so we'll pass the backspace keystroke on to the app to process; the - app might want to use backspace to move between contexts or delete - a text box, etc. Or it might be a legacy app and we've had our caret - dumped in somewhere unknown, so we will have to depend on the app to - be sensible about backspacing because we know nothing. - */ - state->actions().push_backspace(KM_CORE_BT_UNKNOWN); - } else { - state->actions().push_backspace(KM_CORE_BT_CHAR, last_char); - state->context().pop_back(); - } - } + process_backspace(state); break; default: // all other VKs @@ -259,6 +224,44 @@ ldml_processor::process_event( return KM_CORE_STATUS_OK; } +void +ldml_processor::process_backspace(km_core_state *state) const { + if (!!bksp_transforms) { + // TODO-LDML: process bksp + // std::u16string outputString; + // // don't bother if no backspace transforms! + // // TODO-LDML: unroll ctxt into a str + // std::u16string ctxtstr; + // for (size_t i = 0; i < ctxt.size(); i++) { + // ctxtstr.append(ctxt[i]); + // } + // const size_t matchedContext = transforms->apply(ctxtstr, outputString); + } + + // Find out what the last actual character was and remove it. + // attempt to get the last char + auto end = state->context().rbegin(); + if (end != state->context().rend()) { + if ((*end).type == KM_CORE_CT_CHAR) { + state->actions().push_backspace(KM_CORE_BT_CHAR, (*end).character); + state->context().pop_back(); + return; + } else if ((*end).type == KM_CORE_BT_MARKER) { + state->actions().push_backspace(KM_CORE_BT_MARKER, (*end).marker); + state->context().pop_back(); + } + } + /* + We couldn't find a character at end of context (context is empty), + so we'll pass the backspace keystroke on to the app to process; the + app might want to use backspace to move between contexts or delete + a text box, etc. Or it might be a legacy app and we've had our caret + dumped in somewhere unknown, so we will have to depend on the app to + be sensible about backspacing because we know nothing. + */ + state->actions().push_backspace(KM_CORE_BT_UNKNOWN); +} + void ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const { // We know that key_str is not empty per the caller. @@ -267,19 +270,18 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k // we convert the keys str to UTF-32 here instead of using the emit_text() overload // so that we don't have to reconvert it inside the transform code. std::u32string key_str32 = kmx::u16string_to_u32string(key_str); - assert(ldml::normalize_nfd(key_str32)); // TODO-LDML: else fail? - - // extract context string, in NFC - std::u32string old_ctxtstr_nfc; - (void)context_to_string(state, old_ctxtstr_nfc, false); - assert(ldml::normalize_nfc(old_ctxtstr_nfc)); // TODO-LDML: else fail? + assert(ldml::normalize_nfd_markers(key_str32)); // TODO-LDML: else fail? + // extract context string, in NFD + std::u32string old_ctxtstr_nfd; + (void)context_to_string(state, old_ctxtstr_nfd, false); + assert(ldml::normalize_nfd_markers(old_ctxtstr_nfd)); // TODO-LDML: else fail? // context string in NFD std::u32string ctxtstr; (void)context_to_string(state, ctxtstr, true); // with markers // add the newly added key output to ctxtstr ctxtstr.append(key_str32); - assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail? + assert(ldml::normalize_nfd_markers(ctxtstr)); // TODO-LDML: else fail? /** transform output string */ std::u32string outputString; @@ -297,25 +299,38 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k // drop last 'matchedContext': ctxtstr.resize(ctxtstr.length() - matchedContext); ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here. - assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail? + ldml::marker_map markers; + assert(ldml::normalize_nfd_markers(ctxtstr, markers)); // TODO-LDML: Need marker-safe normalize here. // Ok. We've done all the happy manipulations. - /** NFC and no markers */ - std::u32string ctxtstr_cleanedup = ctxtstr; - // TODO-LDML: remove markers! - assert(ldml::normalize_nfc(ctxtstr_cleanedup)); // TODO-LDML: else fail? - - // find common prefix - auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end()); - /** the part of the old str that changed */ - std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfc.end()); + /** NFD and no markers */ + std::u32string ctxtstr_cleanedup = ldml::remove_markers(ctxtstr); + assert(ldml::normalize_nfd_markers(ctxtstr_cleanedup)); + + // find common prefix. + // For example, if the context previously had "aaBBBBB" and it is changing to "aaCCC" then we will have: + // - old_ctxtstr_changed = "BBBBB" + // - new_ctxtstr_changed = "CCC" + // So the BBBBB needs to be removed and then CCC added. + auto ctxt_prefix = mismatch(old_ctxtstr_nfd.begin(), old_ctxtstr_nfd.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end()); + /** The part of the old string to be removed */ + std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfd.end()); + /** The new context to be added */ std::u32string new_ctxtstr_changed(ctxt_prefix.second,ctxtstr_cleanedup.end()); // drop the old suffix. Note: this mutates old_ctxtstr_changed. remove_text(state, old_ctxtstr_changed, old_ctxtstr_changed.length()); assert(old_ctxtstr_changed.length() == 0); + // old_ctxtstr_changed is now empty because it's been removed. + // context is "aa" in the above example. emit_text(state, new_ctxtstr_changed); + + // TODO-LDML: need to emit marker here - need to emit text w/ markers, and handle appropriately. + // // TODO-LDML: 1-marker hack! need to support a string with intermixed markers. + if (key_str32.length() == 3 && key_str32[0] == LDML_UC_SENTINEL && key_str32[1] == LDML_MARKER_CODE) { + emit_marker(state, key_str32[2]); + } } void @@ -334,18 +349,7 @@ ldml_processor::remove_text(km_core_state *state, std::u32string &str, size_t le str.pop_back(); state->actions().push_backspace(KM_CORE_BT_CHAR, c->character); // Cause prior char to be removed } else if (type == KM_CORE_BT_MARKER) { - // it's a marker, 'worth' 3 uchars - assert(length >= 3); - assert(lastCtx == c->marker); // end of list - length -= 3; - // pop off the three-part sentinel string (in reverse order of course) - assert(str.back() == c->marker); // marker # - str.pop_back(); - assert(str.back() == LDML_MARKER_CODE); - str.pop_back(); - assert(str.back() == LDML_UC_SENTINEL); - str.pop_back(); - // push a special backspace to delete the marker + // just pop off any markers. state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker); } } @@ -433,7 +437,7 @@ ldml_processor::context_to_string(km_core_state *state, std::u32string &str, boo } else if (last_type == KM_CORE_BT_MARKER) { assert(km::core::kmx::is_valid_marker(c->marker)); if (include_markers) { - prepend_marker(str, c->marker); + ldml::prepend_marker(str, c->marker); } } else { break; diff --git a/core/src/ldml/ldml_processor.hpp b/core/src/ldml/ldml_processor.hpp index 79c6ae4ac4b..92194ddad10 100644 --- a/core/src/ldml/ldml_processor.hpp +++ b/core/src/ldml/ldml_processor.hpp @@ -103,6 +103,9 @@ namespace core { /** process a typed key */ void process_key_string(km_core_state *state, const std::u16string &key_str) const; + /** process a backspace */ + void process_backspace(km_core_state *state) const; + /** * add the string+marker portion of the context to the beginning of str. * Stop when a non-string and non-marker is hit. @@ -111,15 +114,6 @@ namespace core { */ static size_t context_to_string(km_core_state *state, std::u32string &str, bool include_markers = true); - /** prepend the marker string in UC_SENTINEL format to the str */ - inline static void prepend_marker(std::u32string &str, KMX_DWORD marker); }; - - void - ldml_processor::prepend_marker(std::u32string &str, KMX_DWORD marker) { - km_core_usv triple[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker}; - str.insert(0, triple, 3); - } - } // namespace core } // namespace km diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 6ae8feafc87..5c4ff428cd9 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -10,10 +10,7 @@ #include #include #include "kmx/kmx_xstring.h" - -#ifndef assert -#define assert(x) ((void)0) -#endif +#include namespace km { namespace core { @@ -466,15 +463,13 @@ transform_entry::init() { return false; } // TODO-LDML: if we have mapFrom, may need to do other processing. - const std::u16string patstr = km::core::kmx::u32string_to_u16string(fFrom); + std::u16string patstr = km::core::kmx::u32string_to_u16string(fFrom); + // normalize, including markers + normalize_nfd_markers(patstr); UErrorCode status = U_ZERO_ERROR; - /* const */ icu::UnicodeString patustr_raw = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); + /* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); // add '$' to match to end - patustr_raw.append(u'$'); - icu::UnicodeString patustr; - const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); - // NFD normalize on pattern creation - nfd->normalize(patustr_raw, patustr, status); + patustr.append(u'$'); // TODO-LDML: may need to escape some markers. Marker #91 will look like a `[` to the pattern fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); return (UASSERT_SUCCESS(status)); } @@ -556,7 +551,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons } const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); icu::UnicodeString rustr2; - nfd->normalize(rustr, rustr2, status); + nfd->normalize(rustr, rustr2, status); // TODO-LDML: must be normalize with markers! UASSERT_SUCCESS(status); // here we replace the match output. icu::UnicodeString entireOutput = matcher->replaceFirst(rustr2, status); @@ -567,7 +562,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons // normalize the replaced string icu::UnicodeString outu; - nfd->normalize(outu_raw, outu, status); + nfd->normalize(outu_raw, outu, status); // TODO-LDML: must be normalize with markers! UASSERT_SUCCESS(status); // Special case if there's no output, save some allocs @@ -585,8 +580,6 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons outu.toUTF32((UChar32 *)s, out32len + 1, status); UASSERT_SUCCESS(status); output.assign(s, out32len); - // now, build a u32string - std::u32string out32(s, out32len); // clean up buffer delete [] s; } @@ -863,16 +856,6 @@ transforms::load( // string manipulation -bool normalize_nfd(std::u32string &str) { - std::u16string rstr = km::core::kmx::u32string_to_u16string(str); - if(!normalize_nfd(rstr)) { - return false; - } else { - str = km::core::kmx::u16string_to_u32string(rstr); - return true; - } -} - /** internal function to normalize with a specified mode */ static bool normalize(const icu::Normalizer2 *n, std::u16string &str, UErrorCode &status) { UASSERT_SUCCESS(status); @@ -886,6 +869,16 @@ static bool normalize(const icu::Normalizer2 *n, std::u16string &str, UErrorCode return U_SUCCESS(status); } +bool normalize_nfd(std::u32string &str) { + std::u16string rstr = km::core::kmx::u32string_to_u16string(str); + if(!normalize_nfd(rstr)) { + return false; + } else { + str = km::core::kmx::u16string_to_u32string(rstr); + return true; + } +} + bool normalize_nfd(std::u16string &str) { UErrorCode status = U_ZERO_ERROR; const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); @@ -893,6 +886,87 @@ bool normalize_nfd(std::u16string &str) { return normalize(nfd, str, status); } +bool normalize_nfd_markers(std::u16string &str, marker_map &map) { + std::u32string rstr = km::core::kmx::u16string_to_u32string(str); + if(!normalize_nfd_markers(rstr, map)) { + return false; + } else { + str = km::core::kmx::u32string_to_u16string(rstr); + return true; + } +} + +void add_back_markers(std::u32string &str, const std::u32string &src, const marker_map &map) { + // need to reconstitute. + marker_map map2(map); // make a copy of the map + // clear the string + str.clear(); + // add the end-of-text marker + { + const auto ch = MARKER_BEFORE_EOT; + const auto m = map2.find(ch); + if (m != map2.end()) { + prepend_marker(str, m->second); + map2.erase(ch); // remove it + } + } + // go from end to beginning of string + for (auto p = src.rbegin(); p != src.rend(); p++) { + const auto ch = *p; + str.insert(0, 1, ch); // prepend + + const auto m = map2.find(ch); + if (m != map2.end()) { + prepend_marker(str, m->second); + map2.erase(ch); // remove it + } + } +} + +/** + * TODO-LDML: + * - doesn't support >1 marker per char - may need a set instead of a map! + * - ideally this should be used on a normalization safe subsequence + */ +bool normalize_nfd_markers(std::u32string &str, marker_map &map) { + /** original string, but no markers */ + std::u32string str_unmarked = remove_markers(str, map); + /** original string, no markers, NFD */ + std::u32string str_unmarked_nfd = str_unmarked; + if(!normalize_nfd(str_unmarked_nfd)) { + return false; // normalize failed. + } else if (map.size() == 0) { + // no markers. Return the normalized unmarked str + str = str_unmarked_nfd; + } else if (str_unmarked_nfd == str_unmarked) { + // Normalization produced no change when markers were removed. + // So, we'll call this a no-op. + } else { + add_back_markers(str, str_unmarked_nfd, map); + } + return true; // all OK +} + +bool normalize_nfc_markers(std::u32string &str, marker_map &map) { + /** original string, but no markers */ + std::u32string str_unmarked = remove_markers(str, map); + /** original string, no markers, NFC */ + std::u32string str_unmarked_nfc = str_unmarked; + if(!normalize_nfc(str_unmarked_nfc)) { + return false; // normalize failed. + } else if (map.size() == 0) { + // no markers. Return the normalized unmarked str + str = str_unmarked_nfc; + } else if (str_unmarked_nfc == str_unmarked) { + // Normalization produced no change when markers were removed. + // So, we'll call this a no-op. + } else { + add_back_markers(str, str_unmarked_nfc, map); + } + return true; // all OK +} + + bool normalize_nfc(std::u32string &str) { std::u16string rstr = km::core::kmx::u32string_to_u16string(str); if(!normalize_nfc(rstr)) { @@ -910,6 +984,52 @@ bool normalize_nfc(std::u16string &str) { return normalize(nfc, str, status); } +std::u32string remove_markers(const std::u32string &str, marker_map *markers) { + std::u32string out; + auto i = str.begin(); + auto last = i; + for (i = find(i, str.end(), LDML_UC_SENTINEL); i != str.end(); i = find(i, str.end(), LDML_UC_SENTINEL)) { + // append any prefix (from prior pos'n to here) + out.append(last, i); + + // #1: LDML_UC_SENTINEL (what we searched for) + assert(*i == LDML_UC_SENTINEL); // assert that find() worked + i++; + last = i; + if (i == str.end()) { + break; // hit end + } + + // #2 LDML_MARKER_CODE + if (*i != LDML_MARKER_CODE) { + continue; // can't process this, get out + } + i++; + last = i; + if (i == str.end()) { + break; // hit end + } + + // #3 marker number + const KMX_DWORD marker_no = *i; + assert(marker_no >= LDML_MARKER_MIN_INDEX && marker_no <= LDML_MARKER_MAX_INDEX); + i++; // if end, we'll break out of the loop + last = i; + + // record the marker + if (markers != nullptr) { + if (i == str.end()) { + markers->emplace(MARKER_BEFORE_EOT, marker_no); + } else { + markers->emplace(*i, marker_no); + } + } + } + // get the suffix between the last marker and the end + out.append(last, str.end()); + return out; +} + } // namespace ldml } // namespace core } // namespace km diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 044520ade5e..4a2948613db 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -8,6 +8,7 @@ #pragma once #include "kmx/kmx_plus.h" +#include "kmx/kmx_xstring.h" #include #include #include @@ -287,14 +288,73 @@ class transforms { // string routines +/** indicates that the marker was before the end of text. */ +const char32_t MARKER_BEFORE_EOT = km::core::kmx::Uni_FFFE_NONCHARACTER; + +/** map from following-char to marker number. */ +typedef std::map marker_map; + /** Normalize a u32string inplace to NFD. @return false on failure */ bool normalize_nfd(std::u32string &str); /** Normalize a u16string inplace to NFD. @return false on failure */ bool normalize_nfd(std::u16string &str); +/** Normalize a u32string inplace to NFD, retaining markers. + * @param markers will be populated with marker chars + * @return false on failure + **/ +bool normalize_nfd_markers(std::u32string &str, marker_map &markers); +bool normalize_nfd_markers(std::u16string &str, marker_map &markers); +inline bool normalize_nfd_markers(std::u32string &str); +inline bool normalize_nfd_markers(std::u16string &str); + +/** Normalize a u32string inplace to NFC, retaining markers. + * @param markers will be populated with marker chars + * @return false on failure + **/ +bool normalize_nfc_markers(std::u32string &str, marker_map &markers); +bool normalize_nfc_markers(std::u16string &str, marker_map &markers); +inline bool normalize_nfc_markers(std::u32string &str); +inline bool normalize_nfc_markers(std::u16string &str); + /** Normalize a u32string inplace to NFC. @return false on failure */ bool normalize_nfc(std::u32string &str); /** Normalize a u16string inplace to NFC. @return false on failure */ bool normalize_nfc(std::u16string &str); +/** Remove markers and optionally note their glue characters in the map */ +std::u32string remove_markers(const std::u32string &str, marker_map *markers = nullptr); +/** same but with a reference */ +inline std::u32string remove_markers(const std::u32string &str, marker_map &markers) { + return remove_markers(str, &markers); +} + +/** prepend the marker string in UC_SENTINEL format to the str */ +inline static void prepend_marker(std::u32string &str, KMX_DWORD marker); + +void +prepend_marker(std::u32string &str, KMX_DWORD marker) { + km_core_usv triple[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker}; + str.insert(0, triple, 3); +} + +bool normalize_nfd_markers(std::u16string &str) { + marker_map m; + return normalize_nfd_markers(str, m); +} + +bool normalize_nfc_markers(std::u16string &str) { + marker_map m; + return normalize_nfc_markers(str, m); +} +bool normalize_nfd_markers(std::u32string &str) { + marker_map m; + return normalize_nfd_markers(str, m); +} + +bool normalize_nfc_markers(std::u32string &str) { + marker_map m; + return normalize_nfc_markers(str, m); +} + } // namespace ldml } // namespace core diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml index 025fe36a483..2b96de271f1 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml @@ -133,4 +133,114 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml index 1162a7769e5..06ad0695f7d 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml @@ -15,6 +15,7 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke + @@ -26,7 +27,7 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke - + @@ -43,6 +44,29 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/tests/unit/ldml/ldml.cpp b/core/tests/unit/ldml/ldml.cpp index eaf764e18eb..e22367a7f96 100644 --- a/core/tests/unit/ldml/ldml.cpp +++ b/core/tests/unit/ldml/ldml.cpp @@ -27,6 +27,9 @@ #include // for surrogate pair macros #include "ldml_test_source.hpp" +#include "debuglog.h" + +#include "ldml/ldml_transforms.hpp" namespace { @@ -105,6 +108,7 @@ apply_action( assert(!context.empty()); assert(context.back().type == KM_CORE_CT_MARKER); context.pop_back(); + // no change to text store. } else if (text_store.length() > 0) { assert(!context.empty() && !text_store.empty()); km_core_usv ch = text_store.back(); @@ -118,11 +122,16 @@ apply_action( text_store.pop_back(); } } - assert(ch == act.backspace.expected_value); - - assert(context.back().type == KM_CORE_CT_CHAR); - assert(context.back().character == ch); - context.pop_back(); + if (act.backspace.expected_type == KM_CORE_BT_CHAR) { + assert(ch == act.backspace.expected_value); + assert(context.back().type == KM_CORE_CT_CHAR); + assert(context.back().character == ch); + context.pop_back(); + } else { + // assume it's otherwise KM-coRE_BT_UNKNOWN + assert(act.backspace.expected_type == KM_CORE_BT_UNKNOWN); + assert(context.empty()); // if KM_CORE_BT_UNKNOWN, context should be empty. + } } break; case KM_CORE_IT_PERSIST_OPT: @@ -156,6 +165,21 @@ verify_context(std::u16string& text_store, km_core_state* &test_state, std::vect km_core_cp *buf = new km_core_cp[n]; try_status(km_core_context_items_to_utf16(citems, buf, &n)); std::cout << "context : " << string_to_hex(buf) << " [" << buf << "]" << std::endl; + std::cout << "testcontext "; + for (auto i = test_context.begin(); i < test_context.end(); i++) { + switch(i->type) { + case KM_CORE_CT_CHAR: + std::cout << "U+" << std::hex << i->character << std::dec << " "; + break; + case KM_CORE_CT_MARKER: + std::cout << "\\m{" << i->character << "} "; + break; + default: + std::cout << "type#" << i->type << " "; + } + } + std::cout << std::endl; + std::cout << "context : " << string_to_hex(buf) << " [" << buf << "]" << std::endl; // Verify that both our local test_context and the core's test_state.context have // not diverged @@ -216,7 +240,8 @@ run_test(const km::core::path &source, const km::core::path &compiled, km::tests // handle backspace here if (action.type == km::tests::LDML_ACTION_KEY_EVENT) { auto &p = action.k; - std::cout << "- key action: 0x" << std::hex << p.vk << "/modifier 0x" << p.modifier_state << std::dec << std::endl; + std::cout << "- key action: " << km::core::kmx::Debug_VirtualKey(p.vk) << "/modifier " << km::core::kmx::Debug_ModifierName(p.modifier_state) << " 0x" << p.modifier_state + << std::dec << std::endl; // Because a normal system tracks caps lock state itself, // we mimic that in the tests. We assume caps lock state is // updated on key_down before the processor receives the @@ -250,6 +275,7 @@ run_test(const km::core::path &source, const km::core::path &compiled, km::tests verify_context(text_store, test_state, test_context); } else if (action.type == km::tests::LDML_ACTION_CHECK_EXPECTED) { + assert(km::core::ldml::normalize_nfd(action.string)); // TODO-LDML: should be NFC std::cout << "- check expected" << std::endl; std::cout << "expected : " << string_to_hex(action.string) << " [" << action.string << "]" << std::endl; std::cout << "text store: " << string_to_hex(text_store) << " [" << text_store << "]" << std::endl; diff --git a/core/tests/unit/ldml/ldml_test_source.cpp b/core/tests/unit/ldml/ldml_test_source.cpp index 9171f25163f..93ddff9f7c9 100644 --- a/core/tests/unit/ldml/ldml_test_source.cpp +++ b/core/tests/unit/ldml/ldml_test_source.cpp @@ -472,7 +472,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { if (type == "check") { fillin.type = LDML_ACTION_CHECK_EXPECTED; fillin.string = LdmlTestSource::parse_u8_source_string(result.get()); - assert(km::core::ldml::normalize_nfc(fillin.string)); + assert(km::core::ldml::normalize_nfd(fillin.string)); // TODO-LDML: should be NFC return; } else if (type == "keystroke") { fillin.type = LDML_ACTION_KEY_EVENT; @@ -483,7 +483,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { } else if (type == "emit") { fillin.type = LDML_ACTION_EMIT_STRING; fillin.string = LdmlTestSource::parse_u8_source_string(to.get()); - assert(km::core::ldml::normalize_nfc(fillin.string)); + assert(km::core::ldml::normalize_nfd(fillin.string)); // TODO-LDML: should be NFC return; } else if (type == "backspace") { // backspace is handled as a key event @@ -507,7 +507,7 @@ int LdmlJsonTestSource::load(const nlohmann::json &data) { this->data = data; // TODO-LDML auto startContext = data["/startContext/to"_json_pointer]; context = LdmlTestSource::parse_u8_source_string(startContext); - assert(km::core::ldml::normalize_nfc(context)); + assert(km::core::ldml::normalize_nfd(context)); // TODO-LDML: should be NFC return 0; } diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index cb0db12bfe4..6960be0ff3d 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -4,6 +4,7 @@ #include #include #include +#include "test_color.h" // TODO-LDML: normal asserts wern't working, so using some hacks. // #include "ldml_test_utils.hpp" @@ -11,13 +12,13 @@ // #include "debuglog.h" #ifndef zassert_string_equal -#define zassert_string_equal(actual, expected) \ - { \ - if (actual != expected) { \ - std::cerr << __FILE__ << ":" << __LINE__ << ": " \ - << "got: " << actual << " expected " << expected << std::endl; \ - return EXIT_FAILURE; \ - } \ +#define zassert_string_equal(actual, expected) \ + { \ + if (actual != expected) { \ + std::wcerr << __FILE__ << ":" << __LINE__ << ": " << console_color::fg(console_color::BRIGHT_RED) << "got: " << actual \ + << " expected " << expected << console_color::reset() << std::endl; \ + return EXIT_FAILURE; \ + } \ } #endif @@ -427,10 +428,209 @@ int test_map() { return EXIT_SUCCESS; } +int test_strutils() { + std::cout << "== " << __FUNCTION__ << std::endl; + + std::cout << __FILE__ << ":" << __LINE__ << " * remove_markers" << std::endl; + + + { + std::cout << __FILE__ << ":" << __LINE__ << " - basic test0" << std::endl; + const std::u32string src = U"abc"; + const std::u32string dst = remove_markers(src); + zassert_string_equal(dst, src); // unchanged + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - basic test" << std::endl; + const std::u32string src = U"abc"; + const std::u32string dst = remove_markers(src, map); + zassert_string_equal(dst, src); // unchanged + assert_equal(map.size(), 0); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - marker test" << std::endl; + const std::u32string src = U"6\U0000ffff\U00000008\U00000001e"; + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6e"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 1); + assert_equal(map[U'e'], 0x1L); // marker 1 @ e + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - bad0" << std::endl; + const std::u32string src = U"6\U0000ffff\U00000008"; // missing trailing marker # + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 0); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - bad1" << std::endl; + const std::u32string src = U"6\U0000ffffq"; // missing code + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6q"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 0); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - bad1" << std::endl; + const std::u32string src = U"6\U0000ffff"; // missing code + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 0); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - marker end test" << std::endl; + const std::u32string src = U"6\U0000ffff\U00000008\U00000001"; + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 1); + assert_equal(map[MARKER_BEFORE_EOT], 0x1L); // marker 1 @ e + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - complex test" << std::endl; + const std::u32string src = U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300\U0000ffff\U00000008\U00000004"; + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6e\U00000320\U00000300"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 4); + assert_equal(map[U'e'], 0x1L); + assert_equal(map[0x0320], 0x2L); + assert_equal(map[0x0300], 0x3L); + assert_equal(map[MARKER_BEFORE_EOT], 0x4L); + } + return EXIT_SUCCESS; +} + +int test_normalize() { + std::cout << "== " << __FUNCTION__ << std::endl; + + std::cout << __FILE__ << ":" << __LINE__ << " * normalize_nfd_markers" << std::endl; + + + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - noop test" << std::endl; + const std::u32string src = U"6e\U00000320\U00000300"; // already NFD + const std::u32string expect = src; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + zassert_string_equal(dst, expect); + assert_equal(map.size(), 0); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - medium test" << std::endl; + const std::u32string src = U"6e\U00000300\U00000320"; // swapped + const std::u32string expect = U"6e\U00000320\U00000300"; // correct NFD + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + zassert_string_equal(dst, expect); + assert_equal(map.size(), 0); + } + + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - noop test w markers" << std::endl; + const std::u32string src = + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300" + U"\U0000ffff\U00000008\U00000004"; + const std::u32string expect = src; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + zassert_string_equal(dst, expect); + assert_equal(map.size(), 4); + assert_equal(map[U'e'], 0x1L); + assert_equal(map[0x0320], 0x2L); + assert_equal(map[0x0300], 0x3L); + assert_equal(map[MARKER_BEFORE_EOT], 0x4L); + } + + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - complex test" << std::endl; + const std::u32string src = // already in order: 320+300 + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300\U0000ffff\U00000008\U00000004"; + const std::u32string expect = src; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + zassert_string_equal(dst, expect); + assert_equal(map.size(), 4); + assert_equal(map[U'e'], 0x1L); + assert_equal(map[0x0320], 0x2L); + assert_equal(map[0x0300], 0x3L); + assert_equal(map[MARKER_BEFORE_EOT], 0x4L); + + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - complex test2" << std::endl; + const std::u32string src = // out of order, 300-320 + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000300\U0000ffff\U00000008\U00000003\U00000320\U0000ffff\U00000008\U00000004"; + const std::u32string expect = + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000003\U00000320\U0000ffff\U00000008\U00000002\U00000300\U0000ffff\U00000008\U00000004"; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + if (dst != expect) { + std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; + std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; + } + zassert_string_equal(dst, expect); + assert_equal(map.size(), 4); + assert_equal(map[U'e'], 0x1L); + assert_equal(map[0x0320], 0x3L); + assert_equal(map[0x0300], 0x2L); + assert_equal(map[MARKER_BEFORE_EOT], 0x4L); + + } + + { + // u"4è\U0000ffff\b\U00000001̠" + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - complex test 4a" << std::endl; + const std::u32string src = U"4e\u0300\uFFFF\b\u0001\u0320"; + const std::u32string expect = U"4e\uFFFF\b\u0001\u0320\u0300"; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + if (dst != expect) { + std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; + std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; + } + zassert_string_equal(dst, expect); + assert_equal(map.size(), 1); + assert_equal(map[0x0320], 0x1L); + } + + return EXIT_SUCCESS; +} + + int main(int argc, const char *argv[]) { int rc = EXIT_SUCCESS; + bool arg_color = false; + + int first_arg = 1; + + if (first_arg < argc) { + arg_color = std::string(argv[first_arg]) == "--color"; + if(arg_color) { + first_arg++; + } + } + + console_color::enabled = console_color::isaterminal() || arg_color; + if (test_transforms() != EXIT_SUCCESS) { rc = EXIT_FAILURE; } @@ -443,6 +643,13 @@ main(int argc, const char *argv[]) { rc = EXIT_FAILURE; } + if (test_strutils() != EXIT_SUCCESS) { + rc = EXIT_FAILURE; + } + + if (test_normalize() != EXIT_SUCCESS) { + rc = EXIT_FAILURE; + } if (rc == EXIT_FAILURE) { std::cout << "== FAILURE" << std::endl; diff --git a/resources/standards-data/ldml-keyboards/techpreview/test/bn-test.xml b/resources/standards-data/ldml-keyboards/techpreview/test/bn-test.xml index ba0d7d84302..ad95223ca6e 100644 --- a/resources/standards-data/ldml-keyboards/techpreview/test/bn-test.xml +++ b/resources/standards-data/ldml-keyboards/techpreview/test/bn-test.xml @@ -2,7 +2,18 @@ - + + + + + + + + + + + +