diff --git a/core/src/debuglog.h b/core/src/debuglog.h index 1ecbaca6124..944695d669f 100644 --- a/core/src/debuglog.h +++ b/core/src/debuglog.h @@ -1,5 +1,7 @@ /* Debugging */ +#pragma once + #include namespace km { @@ -18,10 +20,12 @@ extern const char *s_key_names[]; #ifdef _MSC_VER #define DebugLog(msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(__FILE__, __LINE__, __FUNCTION__, (msg),__VA_ARGS__) : 0) +#define DebugLog2(file,line,function,msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(file, line, function, (msg),__VA_ARGS__) : 0) #define console_error(msg,...) write_console(TRUE, (msg), __VA_ARGS__) #define console_log(msg,...) write_console(FALSE, (msg), __VA_ARGS__) #else #define DebugLog(msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(__FILE__, __LINE__, __FUNCTION__, (msg), ##__VA_ARGS__) : 0) +#define DebugLog2(file,line,function,msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(file, line, function, (msg), ##__VA_ARGS__) : 0) #define console_error(msg,...) write_console(TRUE, (msg), ##__VA_ARGS__) #define console_log(msg,...) write_console(FALSE, (msg), ##__VA_ARGS__) #endif diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index 40b263a3532..5d995be4184 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -6,6 +6,7 @@ */ #include +#include #include "ldml/ldml_processor.hpp" #include "state.hpp" #include "kmx_file.h" @@ -236,88 +237,16 @@ ldml_processor::process_event( // all other VKs { // Look up the key - const std::u16string str = keys.lookup(vk, modifier_state); + const std::u16string key_str = keys.lookup(vk, modifier_state); - if (str.empty()) { + if (key_str.empty()) { // no key was found, so pass the keystroke on to the Engine state->actions().push_invalidate_context(); state->actions().push_emit_keystroke(); break; // ----- commit and exit } - // found a string - push it into the context and actions - // we convert it here instead of using the emit_text() overload - // so that we don't have to reconvert it inside the transform code. - const std::u32string str32 = kmx::u16string_to_u32string(str); - - if (!transforms) { - // No transforms: just emit the string. - emit_text(state, str32); - } else { - // Process transforms here - /** - * a copy of the current/changed context, for transform use. - * - */ - std::u32string ctxtstr; - (void)context_to_string(state, ctxtstr); - // add the newly added key output to ctxtstr - ctxtstr.append(str32); - - /** the output buffer for transforms */ - std::u32string outputString; - - // apply the transform, get how much matched (at the end) - const size_t matchedContext = transforms->apply(ctxtstr, outputString); - - if (matchedContext == 0) { - // No match, just emit the original string - emit_text(state, str32); - } else { - // We have a match. - - ctxtstr.resize(ctxtstr.length() - str32.length()); - /** how many chars of the context we need to clear */ - auto charsToDelete = matchedContext - str32.length(); /* we don't need to clear the output of the current key */ - - /** how many context items need to be removed */ - size_t contextRemoved = 0; - for (auto c = state->context().rbegin(); charsToDelete > 0 && c != state->context().rend(); c++, contextRemoved++) { - /** last char of context */ - km_core_usv lastCtx = ctxtstr.back(); - uint8_t type = c->type; - assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER); - if (type == KM_CORE_BT_CHAR) { - // single char, drop it - charsToDelete--; - assert(c->character == lastCtx); - ctxtstr.pop_back(); - state->actions().push_backspace(KM_CORE_BT_CHAR, lastCtx); // Cause prior char to be removed - } else if (type == KM_CORE_BT_MARKER) { - // it's a marker, 'worth' 3 uchars - assert(charsToDelete >= 3); - assert(lastCtx == c->marker); // end of list - charsToDelete -= 3; - // pop off the three-part sentinel string - ctxtstr.pop_back(); - ctxtstr.pop_back(); - ctxtstr.pop_back(); - // push a special backspace to delete the marker - state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker); - } - } - // now, pop the right number of context items - for (size_t i = 0; i < contextRemoved; i++) { - // we don't pop during the above loop because the iterator gets confused - state->context().pop_back(); - } - // Now, add in the updated text. This will convert UC_SENTINEL, etc back to marker actions. - emit_text(state, outputString); - // If we needed it further. we could update ctxtstr here: - // ctxtstr.append(outputString); - // ... but it is no longer needed at this point. - } // end of transform match - } // end of processing transforms + process_key_string(state, key_str); } // end of processing a 'normal' vk } // end of switch // end of normal processing: commit and exit @@ -330,6 +259,103 @@ ldml_processor::process_event( return KM_CORE_STATUS_OK; } +void +ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const { + // We know that key_str is not empty per the caller. + assert(!key_str.empty()); + + // we convert the keys str to UTF-32 here instead of using the emit_text() overload + // so that we don't have to reconvert it inside the transform code. + std::u32string key_str32 = kmx::u16string_to_u32string(key_str); + assert(ldml::normalize_nfd(key_str32)); // TODO-LDML: else fail? + + // extract context string, in NFC + std::u32string old_ctxtstr_nfc; + (void)context_to_string(state, old_ctxtstr_nfc, false); + assert(ldml::normalize_nfc(old_ctxtstr_nfc)); // TODO-LDML: else fail? + + // context string in NFD + std::u32string ctxtstr; + (void)context_to_string(state, ctxtstr, true); // with markers + // add the newly added key output to ctxtstr + ctxtstr.append(key_str32); + assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail? + + /** transform output string */ + std::u32string outputString; + /** how many chars of the ctxtstr to replace */ + size_t matchedContext = 0; // zero if no transforms + + // begin modifications to the string + + if(transforms) { + matchedContext = transforms->apply(ctxtstr, outputString); + } else { + // no transforms, no output + } + + // drop last 'matchedContext': + ctxtstr.resize(ctxtstr.length() - matchedContext); + ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here. + assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail? + + // Ok. We've done all the happy manipulations. + + /** NFC and no markers */ + std::u32string ctxtstr_cleanedup = ctxtstr; + // TODO-LDML: remove markers! + assert(ldml::normalize_nfc(ctxtstr_cleanedup)); // TODO-LDML: else fail? + + // find common prefix + auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end()); + /** the part of the old str that changed */ + std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfc.end()); + std::u32string new_ctxtstr_changed(ctxt_prefix.second,ctxtstr_cleanedup.end()); + + // drop the old suffix. Note: this mutates old_ctxtstr_changed. + remove_text(state, old_ctxtstr_changed, old_ctxtstr_changed.length()); + assert(old_ctxtstr_changed.length() == 0); + emit_text(state, new_ctxtstr_changed); +} + +void +ldml_processor::remove_text(km_core_state *state, std::u32string &str, size_t length) { + /** how many context items need to be removed */ + size_t contextRemoved = 0; + for (auto c = state->context().rbegin(); length > 0 && c != state->context().rend(); c++, contextRemoved++) { + /** last char of context */ + km_core_usv lastCtx = str.back(); + uint8_t type = c->type; + assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER); + if (type == KM_CORE_BT_CHAR) { + // single char, drop it + length--; + assert(c->character == lastCtx); + str.pop_back(); + state->actions().push_backspace(KM_CORE_BT_CHAR, c->character); // Cause prior char to be removed + } else if (type == KM_CORE_BT_MARKER) { + // it's a marker, 'worth' 3 uchars + assert(length >= 3); + assert(lastCtx == c->marker); // end of list + length -= 3; + // pop off the three-part sentinel string (in reverse order of course) + assert(str.back() == c->marker); // marker # + str.pop_back(); + assert(str.back() == LDML_MARKER_CODE); + str.pop_back(); + assert(str.back() == LDML_UC_SENTINEL); + str.pop_back(); + // push a special backspace to delete the marker + state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker); + } + } + // now, pop the right number of context items + for (size_t i = 0; i < contextRemoved; i++) { + // we don't pop during the above loop because the iterator gets confused + state->context().pop_back(); + } +} + km_core_attr const & ldml_processor::attributes() const { return engine_attrs; } @@ -395,10 +421,10 @@ ldml_processor::emit_marker(km_core_state *state, KMX_DWORD marker_no) { } size_t -ldml_processor::context_to_string(km_core_state *state, std::u32string &str) { +ldml_processor::context_to_string(km_core_state *state, std::u32string &str, bool include_markers) { str.clear(); auto &cp = state->context(); - size_t ctxlen = 0; // TODO-LDML: is this needed? + size_t ctxlen = 0; // TODO-LDML: not used by callers? uint8_t last_type = KM_CORE_BT_UNKNOWN; for (auto c = cp.rbegin(); c != cp.rend(); c++, ctxlen++) { last_type = c->type; @@ -406,7 +432,9 @@ ldml_processor::context_to_string(km_core_state *state, std::u32string &str) { str.insert(0, 1, c->character); } else if (last_type == KM_CORE_BT_MARKER) { assert(km::kbp::kmx::is_valid_marker(c->marker)); - prepend_marker(str, c->marker); + if (include_markers) { + prepend_marker(str, c->marker); + } } else { break; } @@ -414,6 +442,5 @@ ldml_processor::context_to_string(km_core_state *state, std::u32string &str) { return ctxlen; // consumed the entire context buffer. } - } // namespace kbp } // namespace km diff --git a/core/src/ldml/ldml_processor.hpp b/core/src/ldml/ldml_processor.hpp index e0129115b21..0845aa70ac9 100644 --- a/core/src/ldml/ldml_processor.hpp +++ b/core/src/ldml/ldml_processor.hpp @@ -93,6 +93,15 @@ namespace kbp { static void emit_text(km_core_state *state, km_core_usv ch); /** emit a marker */ static void emit_marker(km_core_state *state, KMX_DWORD marker); + /** + * Delete text from the state. + * @param str string with text to remove, from the end + * @param length number of chars from the end of str to drop + */ + static void remove_text(km_core_state *state, std::u32string &str, size_t length); + + /** process a typed key */ + void process_key_string(km_core_state *state, const std::u16string &key_str) const; /** * add the string+marker portion of the context to the beginning of str. @@ -100,7 +109,7 @@ namespace kbp { * Convert markers into the UC_SENTINEL format. * @return the number of context items consumed */ - static size_t context_to_string(km_core_state *state, std::u32string &str); + static size_t context_to_string(km_core_state *state, std::u32string &str, bool include_markers = true); /** prepend the marker string in UC_SENTINEL format to the str */ inline static void prepend_marker(std::u32string &str, KMX_DWORD marker); diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 5507c6ed86d..56a63278764 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -12,7 +12,7 @@ #include "kmx/kmx_xstring.h" #ifndef assert -#define assert(x) // TODO-LDML +#define assert(x) ((void)0) #endif namespace km { @@ -403,25 +403,29 @@ transform_entry::transform_entry(const transform_entry &other) transform_entry::transform_entry(const std::u32string &from, const std::u32string &to) : fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(), fMapToStrId(), fMapFromList(), fMapToList() { - assert(!fFrom.empty()); // TODO-LDML: should not happen? + assert(!fFrom.empty()); init(); } -// TODO-LDML: How do we return errors from here? transform_entry::transform_entry( const std::u32string &from, const std::u32string &to, KMX_DWORD mapFrom, KMX_DWORD mapTo, - const kmx::kmx_plus &kplus) + const kmx::kmx_plus &kplus, + bool &valid) : fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(mapFrom), fMapToStrId(mapTo) { + if (!valid) + return; // exit early assert(!fFrom.empty()); // TODO-LDML: should not happen? assert((fMapFromStrId == 0) == (fMapToStrId == 0)); // we have both or we have neither. assert(kplus.strs != nullptr); assert(kplus.vars != nullptr); assert(kplus.elem != nullptr); - init(); + if(!init()) { + valid = false; + } // setup mapFrom if (fMapFromStrId != 0) { @@ -456,18 +460,23 @@ transform_entry::transform_entry( } } -void +bool transform_entry::init() { - if (!fFrom.empty()) { - // TODO-LDML: if we have mapFrom, may need to do other processing. - const std::u16string patstr = km::kbp::kmx::u32string_to_u16string(fFrom); - UErrorCode status = U_ZERO_ERROR; - /* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); - // add '$' to match to end - patustr.append(u'$'); - fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); - assert(U_SUCCESS(status)); // TODO-LDML: may be best to propagate status up ^^ + if (fFrom.empty()) { + return false; } + // TODO-LDML: if we have mapFrom, may need to do other processing. + const std::u16string patstr = km::kbp::kmx::u32string_to_u16string(fFrom); + UErrorCode status = U_ZERO_ERROR; + /* const */ icu::UnicodeString patustr_raw = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); + // add '$' to match to end + patustr_raw.append(u'$'); + icu::UnicodeString patustr; + const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); + // NFD normalize on pattern creation + nfd->normalize(patustr_raw, patustr, status); + fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); + return (UASSERT_SUCCESS(status)); } size_t @@ -480,7 +489,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length()); // TODO-LDML: create a new Matcher every time. These could be cached and reset. std::unique_ptr matcher(fFromPattern->matcher(matchustr, status)); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); if (!matcher->find(status)) { // i.e. matches somewhere, in this case at end of str return 0; // no match @@ -490,7 +499,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons // TODO-LDML: if we had an underlying UText this would be simpler. int32_t matchStart = matcher->start(status); int32_t matchEnd = matcher->end(status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); // extract.. const icu::UnicodeString substr = matchustr.tempSubStringBetween(matchStart, matchEnd); // preflight to UTF-32 to get length @@ -517,7 +526,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons // we actually need the group(1) string here. // this is only the content in parenthesis () icu::UnicodeString group1 = matcher->group(1, status); - assert(U_SUCCESS(status)); // TODO-LDML: could be a malformed from pattern + UASSERT_SUCCESS(status); // TODO-LDML: could be a malformed from pattern // now, how long is group1 in UTF-32, hmm? UErrorCode preflightStatus = U_ZERO_ERROR; // throwaway status auto group1Len = group1.toUTF32(nullptr, 0, preflightStatus); @@ -525,7 +534,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons assert(s != nullptr); // TODO-LDML: OOM // convert substr.toUTF32((UChar32 *)s, group1Len + 1, status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); std::u32string match32(s, group1Len); // taken from just group1 // clean up buffer delete [] s; @@ -545,12 +554,21 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length()); // and we return to the regular code flow. } + const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); + icu::UnicodeString rustr2; + nfd->normalize(rustr, rustr2, status); + UASSERT_SUCCESS(status); // here we replace the match output. - icu::UnicodeString entireOutput = matcher->replaceFirst(rustr, status); - assert(U_SUCCESS(status)); // TODO-LDML: could fail here due to bad input (syntax err) + icu::UnicodeString entireOutput = matcher->replaceFirst(rustr2, status); + UASSERT_SUCCESS(status); // TODO-LDML: could fail here due to bad input (syntax err) // entireOutput includes all of 'input', but modified. Need to substring it. - icu::UnicodeString outu = entireOutput.tempSubString(matchStart); + icu::UnicodeString outu_raw = entireOutput.tempSubString(matchStart); + + // normalize the replaced string + icu::UnicodeString outu; + nfd->normalize(outu_raw, outu, status); + UASSERT_SUCCESS(status); // Special case if there's no output, save some allocs if (outu.length() == 0) { @@ -565,7 +583,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons assert(s != nullptr); // convert outu.toUTF32((UChar32 *)s, out32len + 1, status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); output.assign(s, out32len); // now, build a u32string std::u32string out32(s, out32len); @@ -754,35 +772,34 @@ transforms::load( const kmx::kmx_plus &kplus, const kbp::kmx::COMP_KMXPLUS_TRAN *tran, const kbp::kmx::COMP_KMXPLUS_TRAN_Helper &tranHelper) { + bool valid = true; if (tran == nullptr) { DebugLog("for tran: tran is null"); - assert(false); - return nullptr; - } - if (!tranHelper.valid()) { + valid = false; + } else if (!tranHelper.valid()) { DebugLog("for tran: tranHelper is invalid"); - assert(false); - return nullptr; - } - if (nullptr == kplus.elem) { + valid = false; + } else if (nullptr == kplus.elem) { DebugLog("for tran: kplus.elem == nullptr"); - assert(false); - return nullptr; - } - if (nullptr == kplus.strs) { + valid = false; + } else if (nullptr == kplus.strs) { DebugLog("for tran: kplus.strs == nullptr"); // need a string table to get strings - assert(false); - return nullptr; - } - if (nullptr == kplus.vars) { + valid = false; + } else if (nullptr == kplus.vars) { DebugLog("for tran: kplus.vars == nullptr"); // need a vars table to get maps - assert(false); + valid = false; + } + + assert(valid); + if (!valid) { return nullptr; } // with that out of the way, let's set it up - transforms *transforms = new ldml::transforms(); + std::unique_ptr transforms; + + transforms.reset(new ldml::transforms()); for (KMX_DWORD groupNumber = 0; groupNumber < tran->groupCount; groupNumber++) { const kmx::COMP_KMXPLUS_TRAN_GROUP *group = tranHelper.getGroup(groupNumber); @@ -798,7 +815,15 @@ transforms::load( const std::u32string toStr = kmx::u16string_to_u32string(kplus.strs->get(element->to)); KMX_DWORD mapFrom = element->mapFrom; // copy, because of alignment KMX_DWORD mapTo = element->mapTo; // copy, because of alignment - newGroup.emplace_back(fromStr, toStr, mapFrom, mapTo, kplus); // creating a transform_entry + assert(!fromStr.empty()); + if (fromStr.empty()) { + valid = false; + } + newGroup.emplace_back(fromStr, toStr, mapFrom, mapTo, kplus, valid); // creating a transform_entry + assert(valid); + if(!valid) { + return nullptr; + } } transforms->addGroup(newGroup); } else if (group->type == LDML_TRAN_GROUP_TYPE_REORDER) { @@ -828,7 +853,61 @@ transforms::load( return nullptr; } } - return transforms; + assert(valid); + if (!valid) { + return nullptr; + } else { + return transforms.release(); + } +} + +// string manipulation + +bool normalize_nfd(std::u32string &str) { + std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); + if(!normalize_nfd(rstr)) { + return false; + } else { + str = km::kbp::kmx::u16string_to_u32string(rstr); + return true; + } +} + +/** internal function to normalize with a specified mode */ +static bool normalize(const icu::Normalizer2 *n, std::u16string &str, UErrorCode &status) { + UASSERT_SUCCESS(status); + assert(n != nullptr); + icu::UnicodeString dest; + icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length()); + n->normalize(src, dest, status); + if (UASSERT_SUCCESS(status)) { + str.assign(dest.getBuffer(), dest.length()); + } + return U_SUCCESS(status); +} + +bool normalize_nfd(std::u16string &str) { + UErrorCode status = U_ZERO_ERROR; + const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); + UASSERT_SUCCESS(status); + return normalize(nfd, str, status); +} + +bool normalize_nfc(std::u32string &str) { + std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); + if(!normalize_nfc(rstr)) { + return false; + } else { + str = km::kbp::kmx::u16string_to_u32string(rstr); + return true; + } +} + +bool normalize_nfc(std::u16string &str) { + UErrorCode status = U_ZERO_ERROR; + const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status); + UASSERT_SUCCESS(status); + return normalize(nfc, str, status); } } // namespace ldml diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 9d231bfe323..56cac24181c 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -13,6 +13,7 @@ #include #include #include +#include "debuglog.h" #if !defined(HAVE_ICU4C) #error icu4c is required for this code @@ -25,11 +26,24 @@ #include "unicode/unistr.h" #include "unicode/regex.h" #include "unicode/utext.h" +#include "unicode/normalizer2.h" namespace km { namespace kbp { namespace ldml { +/** @returns true on success */ +inline bool uassert_success(const char *file, int line, const char *function, UErrorCode status) { + if (U_FAILURE(status)) { + DebugLog2(file, line, function, "U_FAILURE(%s)", u_errorName(status)); + return false; + } else { + return true; + } +} + +#define UASSERT_SUCCESS(status) assert(U_SUCCESS(status)), uassert_success(__FILE__, __LINE__, __FUNCTION__, status) + using km::kbp::kmx::SimpleUSet; /** @@ -94,7 +108,8 @@ class transform_entry { const std::u32string &to, KMX_DWORD mapFrom, KMX_DWORD mapTo, - const kmx::kmx_plus &kplus); + const kmx::kmx_plus &kplus, + bool &valid); /** * If matching, apply the match to the output string @@ -113,8 +128,8 @@ class transform_entry { const KMX_DWORD fMapToStrId; std::deque fMapFromList; std::deque fMapToList; - /** Internal function to setup pattern string */ - void init(); + /** Internal function to setup pattern string @returns true on success */ + bool init(); /** @returns the index of the item in the fMapFromList list, or -1 */ int32_t findIndexFrom(const std::u32string &match) const; public: @@ -270,6 +285,17 @@ class transforms { const kbp::kmx::COMP_KMXPLUS_TRAN_Helper &tranHelper); }; +// string routines + +/** Normalize a u32string inplace to NFD. @return false on failure */ +bool normalize_nfd(std::u32string &str); +/** Normalize a u16string inplace to NFD. @return false on failure */ +bool normalize_nfd(std::u16string &str); +/** Normalize a u32string inplace to NFC. @return false on failure */ +bool normalize_nfc(std::u32string &str); +/** Normalize a u16string inplace to NFC. @return false on failure */ +bool normalize_nfc(std::u16string &str); + } // namespace ldml } // namespace kbp } // namespace km diff --git a/core/subprojects/packagefiles/icu/meson.build b/core/subprojects/packagefiles/icu/meson.build index 128fc9fbf97..7b329e27df7 100644 --- a/core/subprojects/packagefiles/icu/meson.build +++ b/core/subprojects/packagefiles/icu/meson.build @@ -35,7 +35,7 @@ uconfig.set('U_ENABLE_DYLOAD', 0) # no DLL uconfig.set('U_CHECK_DYLOAD', 0) # no DLL uconfig.set('UCONFIG_NO_FILE_IO', 1) uconfig.set('UCONFIG_NO_LEGACY_CONVERSION', 1) # turn off file based codepage conversion -uconfig.set('UCONFIG_NO_NORMALIZATION', 1) # TODO-LDML: may want this +uconfig.set('UCONFIG_NO_NORMALIZATION', 0) uconfig.set('UCONFIG_NO_BREAK_ITERATION', 1) # TODO-LDML: may want this uconfig.set('UCONFIG_NO_IDNA', 1) uconfig.set('UCONFIG_NO_COLLATION', 1) diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml new file mode 100644 index 00000000000..025fe36a483 --- /dev/null +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml @@ -0,0 +1,136 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml new file mode 100644 index 00000000000..b66ff1a4b6a --- /dev/null +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml @@ -0,0 +1,52 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/tests/unit/ldml/keyboards/meson.build b/core/tests/unit/ldml/keyboards/meson.build index eea0b519ac6..9dc2efc263a 100644 --- a/core/tests/unit/ldml/keyboards/meson.build +++ b/core/tests/unit/ldml/keyboards/meson.build @@ -34,6 +34,7 @@ tests_without_testdata = [ tests_with_testdata = [ 'k_001_tiny', 'k_007_transform_rgx', + 'k_008_transform_norm', 'k_020_fr', # TODO-LDML: move to cldr above (fix vkey) 'k_200_reorder_nod_Lana', 'k_210_marker', diff --git a/core/tests/unit/ldml/ldml.cpp b/core/tests/unit/ldml/ldml.cpp index 31a4cded731..85ba3ae19c3 100644 --- a/core/tests/unit/ldml/ldml.cpp +++ b/core/tests/unit/ldml/ldml.cpp @@ -295,8 +295,8 @@ run_test(const km::kbp::path &source, const km::kbp::path &compiled, km::tests:: * Run all tests for this keyboard */ int run_all_tests(const km::kbp::path &source, const km::kbp::path &compiled) { - std::cout << "source file = " << source << std::endl - << "compiled file = " << compiled << std::endl; + std::wcout << console_color::fg(console_color::BLUE) << "source file = " << source << std::endl + << "compiled file = " << compiled << console_color::reset() << std::endl; km::tests::LdmlEmbeddedTestSource embedded_test_source; @@ -306,7 +306,8 @@ int run_all_tests(const km::kbp::path &source, const km::kbp::path &compiled) { if (embedded_result == 0) { // embedded loaded OK, try it - std::cout << "TEST: " << source.name() << " (embedded)" << std::endl; + std::wcout << console_color::fg(console_color::BLUE) << console_color::bold() << "TEST: " << source.name() << " (embedded)" + << console_color::reset() << std::endl; embedded_result = run_test(source, compiled, embedded_test_source); if (embedded_result != 0) { failures.push_back("in-XML (@@ comment) embedded test failed"); @@ -327,26 +328,40 @@ int run_all_tests(const km::kbp::path &source, const km::kbp::path &compiled) { assert(json_tests.size() > 0); // Loop over all tests for (const auto& n : json_tests) { - std::cout << "TEST: " << json_path.stem() << "/" << n.first << std::endl; + std::wcout << console_color::fg(console_color::BLUE) << console_color::bold() << "TEST: " << json_path.stem().c_str() << "/" << n.first.c_str() << console_color::reset() << std::endl; int sub_test = run_test(source, compiled, *n.second); if (sub_test != 0) { - std::cout << " FAIL: " << json_path.stem() << "/" << n.first << std::endl; + std::wcout << console_color::fg(console_color::BRIGHT_RED) << "FAIL: " << json_path.stem() << "/" << n.first.c_str() + << console_color::reset() << std::endl; failures.push_back(json_path.stem() + "/" + n.first); json_result = sub_test; // set to last failure } else { - std::cout << " PASS: " << json_path.stem() << "/" << n.first << std::endl; + std::wcout << console_color::fg(console_color::GREEN) << " PASS: " << console_color::reset() << json_path.stem() + << "/" << n.first.c_str() << std::endl; } } - std::cout << " " << json_tests.size() << " JSON test(s) in " << json_path.stem() << std::endl; + auto all_count = json_tests.size(); + auto fail_count = failures.size(); + auto pass_count = all_count - fail_count; + if (pass_count > 0) { + std::wcout << console_color::fg(console_color::GREEN) << " +" << pass_count; + } + if (fail_count > 0) { + std::wcout << console_color::fg(console_color::BRIGHT_RED) << + " -" << fail_count; + } + std::wcout << console_color::reset() << " of " << all_count << " JSON tests in " + << json_path.stem() << std::endl; } // OK. + std::wcout << console_color::fg(console_color::YELLOW) << "---- Summary of " << source.name() << " ----" << console_color::reset() << std::endl; if (embedded_result == -1) { - std::cout << "Note: No embedded test." << std::endl; + std::wcout << console_color::fg(console_color::YELLOW) << "Note: No embedded test." << console_color::reset() << std::endl; } if (json_result == -1) { - std::cout << "Note: No json test." << std::endl; + std::wcout << console_color::fg(console_color::YELLOW) << "Note: No json test." << console_color::reset() << std::endl; } // if both are missing, that's an error in itself. @@ -358,7 +373,7 @@ int run_all_tests(const km::kbp::path &source, const km::kbp::path &compiled) { // recap the failures if (failures.size() > 0) { for (const auto& f : failures) { - std::cerr << "failure summary: " << f << std::endl; + std::wcerr << console_color::fg(console_color::RED) << "failed: " << f.c_str() << console_color::reset() << std::endl; } return -1; } else { @@ -402,7 +417,7 @@ int main(int argc, char *argv[]) { int rc = run_all_tests(argv[first_arg], argv[first_arg + 1]); if (rc != EXIT_SUCCESS) { - std::cerr << "FAILED" << std::endl; + std::wcerr << console_color::fg(console_color::BRIGHT_RED) << "FAILED" << console_color::reset() << std::endl; rc = EXIT_FAILURE; } return rc; diff --git a/core/tests/unit/ldml/ldml_test_source.cpp b/core/tests/unit/ldml/ldml_test_source.cpp index 1a5905d1db3..ab8423f7648 100644 --- a/core/tests/unit/ldml/ldml_test_source.cpp +++ b/core/tests/unit/ldml/ldml_test_source.cpp @@ -24,6 +24,7 @@ #include #include "ldml/keyboardprocessor_ldml.h" #include "ldml/ldml_processor.hpp" +#include "ldml/ldml_transforms.hpp" #include "path.hpp" #include "state.hpp" @@ -467,6 +468,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { if (as_check.is_string()) { fillin.type = LDML_ACTION_CHECK_EXPECTED; fillin.string = LdmlTestSource::parse_u8_source_string(as_check.get()); + assert(km::kbp::ldml::normalize_nfc(fillin.string)); return; } @@ -485,6 +487,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { if (as_emit.is_string()) { fillin.type = LDML_ACTION_EMIT_STRING; fillin.string = LdmlTestSource::parse_u8_source_string(as_emit.get()); + assert(km::kbp::ldml::normalize_nfc(fillin.string)); return; } @@ -502,7 +505,7 @@ int LdmlJsonTestSource::load(const nlohmann::json &data) { this->data = data; // TODO-LDML auto startContext = data["/startContext/to"_json_pointer]; context = LdmlTestSource::parse_u8_source_string(startContext); - + assert(km::kbp::ldml::normalize_nfc(context)); return 0; }