From dcb6796d5bf6dbc174cf4bcce19dfa4ba4cb3a5b Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Mon, 9 Oct 2023 18:45:17 -0500 Subject: [PATCH 01/18] =?UTF-8?q?chore(core):=20Some=20failing=20normaliza?= =?UTF-8?q?tion=20transform=20tests=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For: #9468 --- .../keyboards/k_008_transform_norm-test.xml | 136 ++++++++++++++++++ .../ldml/keyboards/k_008_transform_norm.xml | 74 ++++++++++ core/tests/unit/ldml/keyboards/meson.build | 1 + 3 files changed, 211 insertions(+) create mode 100644 core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml create mode 100644 core/tests/unit/ldml/keyboards/k_008_transform_norm.xml diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml new file mode 100644 index 00000000000..21f815c457a --- /dev/null +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml @@ -0,0 +1,136 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml new file mode 100644 index 00000000000..e54c3f6e6e8 --- /dev/null +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml @@ -0,0 +1,74 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/tests/unit/ldml/keyboards/meson.build b/core/tests/unit/ldml/keyboards/meson.build index b83c74d1414..536e6e06e6e 100644 --- a/core/tests/unit/ldml/keyboards/meson.build +++ b/core/tests/unit/ldml/keyboards/meson.build @@ -34,6 +34,7 @@ tests_without_testdata = [ tests_with_testdata = [ 'k_001_tiny', 'k_007_transform_rgx', + 'k_008_transform_norm', 'k_020_fr', # TODO-LDML: move to cldr above (fix vkey) 'k_200_reorder_nod_Lana', 'k_210_marker', From 15e6fb60dbf3bec894e7ae178cd1595b5986b35e Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Tue, 10 Oct 2023 17:51:12 -0500 Subject: [PATCH 02/18] =?UTF-8?q?chore(core):=20dx:=20improve=20test=20out?= =?UTF-8?q?put=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - colorize ldml test For: #9468 --- core/tests/unit/ldml/ldml.cpp | 37 ++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/core/tests/unit/ldml/ldml.cpp b/core/tests/unit/ldml/ldml.cpp index 31a4cded731..de0c9e5556c 100644 --- a/core/tests/unit/ldml/ldml.cpp +++ b/core/tests/unit/ldml/ldml.cpp @@ -295,8 +295,8 @@ run_test(const km::kbp::path &source, const km::kbp::path &compiled, km::tests:: * Run all tests for this keyboard */ int run_all_tests(const km::kbp::path &source, const km::kbp::path &compiled) { - std::cout << "source file = " << source << std::endl - << "compiled file = " << compiled << std::endl; + std::wcout << console_color::fg(console_color::BLUE) << "source file = " << source << std::endl + << "compiled file = " << compiled << console_color::reset() << std::endl; km::tests::LdmlEmbeddedTestSource embedded_test_source; @@ -306,7 +306,8 @@ int run_all_tests(const km::kbp::path &source, const km::kbp::path &compiled) { if (embedded_result == 0) { // embedded loaded OK, try it - std::cout << "TEST: " << source.name() << " (embedded)" << std::endl; + std::wcout << console_color::fg(console_color::BLUE) << console_color::bold() << "TEST: " << source.name() << " (embedded)" + << console_color::reset() << std::endl; embedded_result = run_test(source, compiled, embedded_test_source); if (embedded_result != 0) { failures.push_back("in-XML (@@ comment) embedded test failed"); @@ -327,26 +328,40 @@ int run_all_tests(const km::kbp::path &source, const km::kbp::path &compiled) { assert(json_tests.size() > 0); // Loop over all tests for (const auto& n : json_tests) { - std::cout << "TEST: " << json_path.stem() << "/" << n.first << std::endl; + std::wcout << console_color::fg(console_color::BLUE) << console_color::bold() << "TEST: " << json_path.stem().c_str() << "/" << n.first.c_str() << console_color::reset() << std::endl; int sub_test = run_test(source, compiled, *n.second); if (sub_test != 0) { - std::cout << " FAIL: " << json_path.stem() << "/" << n.first << std::endl; + std::wcout << console_color::fg(console_color::BRIGHT_RED) << "FAIL: " << json_path.stem() << "/" << n.first.c_str() + << console_color::reset() << std::endl; failures.push_back(json_path.stem() + "/" + n.first); json_result = sub_test; // set to last failure } else { - std::cout << " PASS: " << json_path.stem() << "/" << n.first << std::endl; + std::wcout << console_color::fg(console_color::GREEN) << " PASS: " << console_color::reset() << json_path.stem() + << "/" << n.first.c_str() << std::endl; } } - std::cout << " " << json_tests.size() << " JSON test(s) in " << json_path.stem() << std::endl; + int all_count = json_tests.size(); + int fail_count = failures.size(); + int pass_count = all_count - fail_count; + if (pass_count > 0) { + std::wcout << console_color::fg(console_color::GREEN) << " +" << pass_count; + } + if (fail_count > 0) { + std::wcout << console_color::fg(console_color::BRIGHT_RED) << + " -" << fail_count; + } + std::wcout << console_color::reset() << " of " << all_count << " JSON tests in " + << json_path.stem() << std::endl; } // OK. + std::wcout << console_color::fg(console_color::YELLOW) << "---- Summary of " << source.name() << " ----" << console_color::reset() << std::endl; if (embedded_result == -1) { - std::cout << "Note: No embedded test." << std::endl; + std::wcout << console_color::fg(console_color::YELLOW) << "Note: No embedded test." << console_color::reset() << std::endl; } if (json_result == -1) { - std::cout << "Note: No json test." << std::endl; + std::wcout << console_color::fg(console_color::YELLOW) << "Note: No json test." << console_color::reset() << std::endl; } // if both are missing, that's an error in itself. @@ -358,7 +373,7 @@ int run_all_tests(const km::kbp::path &source, const km::kbp::path &compiled) { // recap the failures if (failures.size() > 0) { for (const auto& f : failures) { - std::cerr << "failure summary: " << f << std::endl; + std::wcerr << console_color::fg(console_color::RED) << "failed: " << f.c_str() << console_color::reset() << std::endl; } return -1; } else { @@ -402,7 +417,7 @@ int main(int argc, char *argv[]) { int rc = run_all_tests(argv[first_arg], argv[first_arg + 1]); if (rc != EXIT_SUCCESS) { - std::cerr << "FAILED" << std::endl; + std::wcerr << console_color::fg(console_color::BRIGHT_RED) << "FAILED" << console_color::reset() << std::endl; rc = EXIT_FAILURE; } return rc; From daa0523dde797d179c983809481c3bac17f44417 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Tue, 10 Oct 2023 19:01:14 -0500 Subject: [PATCH 03/18] =?UTF-8?q?feat(core):=20ldml=20normalization=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 0 steps forward, 4294967294 steps back For: #9468 --- core/src/ldml/ldml_processor.cpp | 13 +++- core/src/ldml/ldml_transforms.cpp | 67 +++++++++++++++++-- core/src/ldml/ldml_transforms.hpp | 8 +++ .../keyboards/k_008_transform_norm-test.xml | 6 +- .../ldml/keyboards/k_008_transform_norm.xml | 22 ------ 5 files changed, 85 insertions(+), 31 deletions(-) diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index 40b263a3532..ec8d1fcb089 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -235,9 +235,12 @@ ldml_processor::process_event( default: // all other VKs { + UErrorCode status = U_ZERO_ERROR; // Look up the key - const std::u16string str = keys.lookup(vk, modifier_state); + std::u16string str = keys.lookup(vk, modifier_state); + ldml::normalize_nfd(str, status); + assert(U_SUCCESS(status)); if (str.empty()) { // no key was found, so pass the keystroke on to the Engine state->actions().push_invalidate_context(); @@ -248,7 +251,7 @@ ldml_processor::process_event( // found a string - push it into the context and actions // we convert it here instead of using the emit_text() overload // so that we don't have to reconvert it inside the transform code. - const std::u32string str32 = kmx::u16string_to_u32string(str); + std::u32string str32 = kmx::u16string_to_u32string(str); if (!transforms) { // No transforms: just emit the string. @@ -263,6 +266,9 @@ ldml_processor::process_event( (void)context_to_string(state, ctxtstr); // add the newly added key output to ctxtstr ctxtstr.append(str32); + // and normalize + ldml::normalize_nfd(ctxtstr, status); + assert(U_SUCCESS(status)); /** the output buffer for transforms */ std::u32string outputString; @@ -270,6 +276,9 @@ ldml_processor::process_event( // apply the transform, get how much matched (at the end) const size_t matchedContext = transforms->apply(ctxtstr, outputString); + ldml::normalize_nfd(outputString, status); + assert(U_SUCCESS(status)); + if (matchedContext == 0) { // No match, just emit the original string emit_text(state, str32); diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 5507c6ed86d..842b6e6a638 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -462,9 +462,13 @@ transform_entry::init() { // TODO-LDML: if we have mapFrom, may need to do other processing. const std::u16string patstr = km::kbp::kmx::u32string_to_u16string(fFrom); UErrorCode status = U_ZERO_ERROR; - /* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); + /* const */ icu::UnicodeString patustr_raw = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); // add '$' to match to end - patustr.append(u'$'); + patustr_raw.append(u'$'); + icu::UnicodeString patustr; + const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); + // NFD normalize on pattern creation + nfd->normalize(patustr_raw, patustr, status); fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); assert(U_SUCCESS(status)); // TODO-LDML: may be best to propagate status up ^^ } @@ -545,12 +549,21 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length()); // and we return to the regular code flow. } + const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); + icu::UnicodeString rustr2; + nfd->normalize(rustr, rustr2, status); + assert(U_SUCCESS(status)); // here we replace the match output. - icu::UnicodeString entireOutput = matcher->replaceFirst(rustr, status); + icu::UnicodeString entireOutput = matcher->replaceFirst(rustr2, status); assert(U_SUCCESS(status)); // TODO-LDML: could fail here due to bad input (syntax err) // entireOutput includes all of 'input', but modified. Need to substring it. - icu::UnicodeString outu = entireOutput.tempSubString(matchStart); + icu::UnicodeString outu_raw = entireOutput.tempSubString(matchStart); + + // normalize the replaced string + icu::UnicodeString outu; + nfd->normalize(outu_raw, outu, status); + assert(U_SUCCESS(status)); // Special case if there's no output, save some allocs if (outu.length() == 0) { @@ -831,6 +844,52 @@ transforms::load( return transforms; } +// string + +std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status) { + const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); + if (U_FAILURE(status)) { + return str; + } + icu::UnicodeString dest; + const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); + icu::UnicodeString src = icu::UnicodeString(rstr.data(), (int32_t)rstr.length()); + nfd->normalize(src, dest, status); + if (U_FAILURE(status)) { + return str; + } + + UErrorCode preflightStatus = U_ZERO_ERROR; + // calculate how big the buffer is + auto out32len = dest.toUTF32(nullptr, 0, preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes + // allocate + char32_t *s = new char32_t[out32len + 1]; + assert(s != nullptr); + // convert + dest.toUTF32((UChar32 *)s, out32len + 1, status); + assert(U_SUCCESS(status)); + str.assign(s, out32len); + delete [] s; + return str; +} + +std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status) { + const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); + if (U_FAILURE(status)) { + return str; + } + icu::UnicodeString dest; + icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length()); + nfd->normalize(src, dest, status); + if (U_FAILURE(status)) { + return str; + } + str.assign(dest.getBuffer(), dest.length()); + return str; +} + + + } // namespace ldml } // namespace kbp } // namespace km diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 9d231bfe323..27449f2db87 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -25,6 +25,7 @@ #include "unicode/unistr.h" #include "unicode/regex.h" #include "unicode/utext.h" +#include "unicode/normalizer2.h" namespace km { namespace kbp { @@ -270,6 +271,13 @@ class transforms { const kbp::kmx::COMP_KMXPLUS_TRAN_Helper &tranHelper); }; +// string routines + +/** Normalize a u32string inplace. Returns a reference to the same string. */ +std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status); +/** Normalize a u16string inplace. Returns a reference to the same string. */ +std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status); + } // namespace ldml } // namespace kbp } // namespace km diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml index 21f815c457a..8fd842ca4a8 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml @@ -9,20 +9,20 @@ - + - + - + diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml index e54c3f6e6e8..cb4cdb533ad 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml @@ -40,17 +40,6 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke - - - - - - - - - - - @@ -59,16 +48,5 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke - - - - - - - - - - - From 522d2ef5835e7ce7d9a496f468a24939e5be841e Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 12 Oct 2023 10:10:37 -0500 Subject: [PATCH 04/18] =?UTF-8?q?feat(core):=20ldml=20normalization=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - change tests to NFD for now For: #9468 --- core/tests/unit/ldml/keyboards/k_003_transform.xml | 2 +- core/tests/unit/ldml/keyboards/k_210_marker-test.xml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/tests/unit/ldml/keyboards/k_003_transform.xml b/core/tests/unit/ldml/keyboards/k_003_transform.xml index 814b6829d8e..5575bd01dcc 100644 --- a/core/tests/unit/ldml/keyboards/k_003_transform.xml +++ b/core/tests/unit/ldml/keyboards/k_003_transform.xml @@ -4,7 +4,7 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboards.md#element-transform @@keys: [K_Q][K_U][K_BKQUOTE][K_E] -@@expected: qu\u00ea +@@expected: que\u0302 --> diff --git a/core/tests/unit/ldml/keyboards/k_210_marker-test.xml b/core/tests/unit/ldml/keyboards/k_210_marker-test.xml index 24f104f4d71..dd7d7be0092 100644 --- a/core/tests/unit/ldml/keyboards/k_210_marker-test.xml +++ b/core/tests/unit/ldml/keyboards/k_210_marker-test.xml @@ -29,12 +29,12 @@ - + - + From e64f59613ae0cf3277ef9ba700fca7777ed813ef Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 12 Oct 2023 10:10:51 -0500 Subject: [PATCH 05/18] =?UTF-8?q?feat(core):=20ldml=20normalization=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - split out 'process_key_string' For: #9468 --- core/src/ldml/ldml_processor.cpp | 166 +++++++++++++++---------------- core/src/ldml/ldml_processor.hpp | 3 + 2 files changed, 85 insertions(+), 84 deletions(-) diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index ec8d1fcb089..6074722ef8b 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -235,98 +235,17 @@ ldml_processor::process_event( default: // all other VKs { - UErrorCode status = U_ZERO_ERROR; // Look up the key - std::u16string str = keys.lookup(vk, modifier_state); + const std::u16string key_str = keys.lookup(vk, modifier_state); - ldml::normalize_nfd(str, status); - assert(U_SUCCESS(status)); - if (str.empty()) { + if (key_str.empty()) { // no key was found, so pass the keystroke on to the Engine state->actions().push_invalidate_context(); state->actions().push_emit_keystroke(); break; // ----- commit and exit } - // found a string - push it into the context and actions - // we convert it here instead of using the emit_text() overload - // so that we don't have to reconvert it inside the transform code. - std::u32string str32 = kmx::u16string_to_u32string(str); - - if (!transforms) { - // No transforms: just emit the string. - emit_text(state, str32); - } else { - // Process transforms here - /** - * a copy of the current/changed context, for transform use. - * - */ - std::u32string ctxtstr; - (void)context_to_string(state, ctxtstr); - // add the newly added key output to ctxtstr - ctxtstr.append(str32); - // and normalize - ldml::normalize_nfd(ctxtstr, status); - assert(U_SUCCESS(status)); - - /** the output buffer for transforms */ - std::u32string outputString; - - // apply the transform, get how much matched (at the end) - const size_t matchedContext = transforms->apply(ctxtstr, outputString); - - ldml::normalize_nfd(outputString, status); - assert(U_SUCCESS(status)); - - if (matchedContext == 0) { - // No match, just emit the original string - emit_text(state, str32); - } else { - // We have a match. - - ctxtstr.resize(ctxtstr.length() - str32.length()); - /** how many chars of the context we need to clear */ - auto charsToDelete = matchedContext - str32.length(); /* we don't need to clear the output of the current key */ - - /** how many context items need to be removed */ - size_t contextRemoved = 0; - for (auto c = state->context().rbegin(); charsToDelete > 0 && c != state->context().rend(); c++, contextRemoved++) { - /** last char of context */ - km_core_usv lastCtx = ctxtstr.back(); - uint8_t type = c->type; - assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER); - if (type == KM_CORE_BT_CHAR) { - // single char, drop it - charsToDelete--; - assert(c->character == lastCtx); - ctxtstr.pop_back(); - state->actions().push_backspace(KM_CORE_BT_CHAR, lastCtx); // Cause prior char to be removed - } else if (type == KM_CORE_BT_MARKER) { - // it's a marker, 'worth' 3 uchars - assert(charsToDelete >= 3); - assert(lastCtx == c->marker); // end of list - charsToDelete -= 3; - // pop off the three-part sentinel string - ctxtstr.pop_back(); - ctxtstr.pop_back(); - ctxtstr.pop_back(); - // push a special backspace to delete the marker - state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker); - } - } - // now, pop the right number of context items - for (size_t i = 0; i < contextRemoved; i++) { - // we don't pop during the above loop because the iterator gets confused - state->context().pop_back(); - } - // Now, add in the updated text. This will convert UC_SENTINEL, etc back to marker actions. - emit_text(state, outputString); - // If we needed it further. we could update ctxtstr here: - // ctxtstr.append(outputString); - // ... but it is no longer needed at this point. - } // end of transform match - } // end of processing transforms + process_key_string(state, key_str); } // end of processing a 'normal' vk } // end of switch // end of normal processing: commit and exit @@ -339,6 +258,85 @@ ldml_processor::process_event( return KM_CORE_STATUS_OK; } +void +ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const { + // found a string - push it into the context and actions + // we convert it here instead of using the emit_text() overload + // so that we don't have to reconvert it inside the transform code. + std::u32string str32 = kmx::u16string_to_u32string(key_str); + + if (!transforms) { + // No transforms: just emit the string. + emit_text(state, str32); + } else { + // Process transforms here + /** + * a copy of the current/changed context, for transform use. + * + */ + std::u32string ctxtstr; + (void)context_to_string(state, ctxtstr); + // add the newly added key output to ctxtstr + ctxtstr.append(str32); + // and normalize + + /** the output buffer for transforms */ + std::u32string outputString; + + // apply the transform, get how much matched (at the end) + const size_t matchedContext = transforms->apply(ctxtstr, outputString); + + + if (matchedContext == 0) { + // No match, just emit the original string + emit_text(state, str32); + } else { + // We have a match. + + ctxtstr.resize(ctxtstr.length() - str32.length()); + /** how many chars of the context we need to clear */ + auto charsToDelete = matchedContext - str32.length(); /* we don't need to clear the output of the current key */ + + /** how many context items need to be removed */ + size_t contextRemoved = 0; + for (auto c = state->context().rbegin(); charsToDelete > 0 && c != state->context().rend(); c++, contextRemoved++) { + /** last char of context */ + km_core_usv lastCtx = ctxtstr.back(); + uint8_t type = c->type; + assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER); + if (type == KM_CORE_BT_CHAR) { + // single char, drop it + charsToDelete--; + assert(c->character == lastCtx); + ctxtstr.pop_back(); + state->actions().push_backspace(KM_CORE_BT_CHAR, c->character); // Cause prior char to be removed + } else if (type == KM_CORE_BT_MARKER) { + // it's a marker, 'worth' 3 uchars + assert(charsToDelete >= 3); + assert(lastCtx == c->marker); // end of list + charsToDelete -= 3; + // pop off the three-part sentinel string + ctxtstr.pop_back(); + ctxtstr.pop_back(); + ctxtstr.pop_back(); + // push a special backspace to delete the marker + state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker); + } + } + // now, pop the right number of context items + for (size_t i = 0; i < contextRemoved; i++) { + // we don't pop during the above loop because the iterator gets confused + state->context().pop_back(); + } + // Now, add in the updated text. This will convert UC_SENTINEL, etc back to marker actions. + emit_text(state, outputString); + // If we needed it further. we could update ctxtstr here: + // ctxtstr.append(outputString); + // ... but it is no longer needed at this point. + } // end of transform match + } // end of processing transforms +} + km_core_attr const & ldml_processor::attributes() const { return engine_attrs; } diff --git a/core/src/ldml/ldml_processor.hpp b/core/src/ldml/ldml_processor.hpp index e0129115b21..f061ca6e43f 100644 --- a/core/src/ldml/ldml_processor.hpp +++ b/core/src/ldml/ldml_processor.hpp @@ -94,6 +94,9 @@ namespace kbp { /** emit a marker */ static void emit_marker(km_core_state *state, KMX_DWORD marker); + /** process a typed key */ + void process_key_string(km_core_state *state, const std::u16string &key_str) const; + /** * add the string+marker portion of the context to the beginning of str. * Stop when a non-string and non-marker is hit. From 736af4b957e462e05cd3f7f012cc4c9573d15f91 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 12 Oct 2023 15:12:25 -0500 Subject: [PATCH 06/18] =?UTF-8?q?feat(core):=20ldml=20normalization=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - change more tests to NFD for now For: #9468 --- core/tests/unit/ldml/keyboards/k_210_marker-test.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/tests/unit/ldml/keyboards/k_210_marker-test.xml b/core/tests/unit/ldml/keyboards/k_210_marker-test.xml index dd7d7be0092..91fd7d4520f 100644 --- a/core/tests/unit/ldml/keyboards/k_210_marker-test.xml +++ b/core/tests/unit/ldml/keyboards/k_210_marker-test.xml @@ -7,13 +7,13 @@ - + - + From a577bd2f5394aa64f72d224cbbb03983c2b094f8 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 12 Oct 2023 16:57:37 -0500 Subject: [PATCH 07/18] =?UTF-8?q?feat(core):=20ldml=20normalization=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - more progress in the normalization pipeline. Trying to keep it from leaking. - normalize output to NFC. - normalize json test data to NFC (both context and expected). - we do NOT try to normalize the 'embedded' strings currently. - yet more test data fixes (turns out u+00e0 ≠ u+00e8) For: #9468 --- core/src/ldml/ldml_processor.cpp | 175 ++++++++++-------- core/src/ldml/ldml_processor.hpp | 8 +- core/src/ldml/ldml_transforms.cpp | 47 +++++ core/src/ldml/ldml_transforms.hpp | 4 + .../unit/ldml/keyboards/k_003_transform.xml | 2 +- .../keyboards/k_008_transform_norm-test.xml | 2 +- .../ldml/keyboards/k_008_transform_norm.xml | 12 +- core/tests/unit/ldml/ldml_test_source.cpp | 9 +- 8 files changed, 172 insertions(+), 87 deletions(-) diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index 6074722ef8b..64237e5b2f8 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -6,6 +6,7 @@ */ #include +#include #include "ldml/ldml_processor.hpp" #include "state.hpp" #include "kmx_file.h" @@ -260,81 +261,100 @@ ldml_processor::process_event( void ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const { - // found a string - push it into the context and actions - // we convert it here instead of using the emit_text() overload - // so that we don't have to reconvert it inside the transform code. - std::u32string str32 = kmx::u16string_to_u32string(key_str); + UErrorCode status = U_ZERO_ERROR; + // We know that key_str is not empty per the caller. - if (!transforms) { - // No transforms: just emit the string. - emit_text(state, str32); + // we convert the keys str to UTF-32 here instead of using the emit_text() overload + // so that we don't have to reconvert it inside the transform code. + std::u32string key_str32 = kmx::u16string_to_u32string(key_str); + // normalize the keystroke to NFD + ldml::normalize_nfd(key_str32, status); + + // extract context string, in NFC + std::u32string old_ctxtstr_nfc; + (void)context_to_string(state, old_ctxtstr_nfc, false); + ldml::normalize_nfc(old_ctxtstr_nfc, status); + assert(U_SUCCESS(status)); + + // context string in NFD + std::u32string ctxtstr; + (void)context_to_string(state, ctxtstr, true); // with markers + // add the newly added key output to ctxtstr + ctxtstr.append(key_str32); + ldml::normalize_nfd(ctxtstr, status); + assert(U_SUCCESS(status)); + + /** transform output string */ + std::u32string outputString; + /** how many chars of the ctxtstr to replace */ + size_t matchedContext = 0; // zero if no transforms + + // begin modifications to the string + + if(transforms) { + matchedContext = transforms->apply(ctxtstr, outputString); } else { - // Process transforms here - /** - * a copy of the current/changed context, for transform use. - * - */ - std::u32string ctxtstr; - (void)context_to_string(state, ctxtstr); - // add the newly added key output to ctxtstr - ctxtstr.append(str32); - // and normalize - - /** the output buffer for transforms */ - std::u32string outputString; - - // apply the transform, get how much matched (at the end) - const size_t matchedContext = transforms->apply(ctxtstr, outputString); - - - if (matchedContext == 0) { - // No match, just emit the original string - emit_text(state, str32); - } else { - // We have a match. - - ctxtstr.resize(ctxtstr.length() - str32.length()); - /** how many chars of the context we need to clear */ - auto charsToDelete = matchedContext - str32.length(); /* we don't need to clear the output of the current key */ - - /** how many context items need to be removed */ - size_t contextRemoved = 0; - for (auto c = state->context().rbegin(); charsToDelete > 0 && c != state->context().rend(); c++, contextRemoved++) { - /** last char of context */ - km_core_usv lastCtx = ctxtstr.back(); - uint8_t type = c->type; - assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER); - if (type == KM_CORE_BT_CHAR) { - // single char, drop it - charsToDelete--; - assert(c->character == lastCtx); - ctxtstr.pop_back(); - state->actions().push_backspace(KM_CORE_BT_CHAR, c->character); // Cause prior char to be removed - } else if (type == KM_CORE_BT_MARKER) { - // it's a marker, 'worth' 3 uchars - assert(charsToDelete >= 3); - assert(lastCtx == c->marker); // end of list - charsToDelete -= 3; - // pop off the three-part sentinel string - ctxtstr.pop_back(); - ctxtstr.pop_back(); - ctxtstr.pop_back(); - // push a special backspace to delete the marker - state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker); - } - } - // now, pop the right number of context items - for (size_t i = 0; i < contextRemoved; i++) { - // we don't pop during the above loop because the iterator gets confused - state->context().pop_back(); - } - // Now, add in the updated text. This will convert UC_SENTINEL, etc back to marker actions. - emit_text(state, outputString); - // If we needed it further. we could update ctxtstr here: - // ctxtstr.append(outputString); - // ... but it is no longer needed at this point. - } // end of transform match - } // end of processing transforms + // no transforms, no output + } + + // drop last 'matchedContext': + ctxtstr.resize(ctxtstr.length() - matchedContext); + ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here. + ldml::normalize_nfd(ctxtstr, status); + assert(U_SUCCESS(status)); + + // Ok. We've done all the happy manipulations. + + /** NFC and no markers */ + std::u32string ctxtstr_cleanedup = ctxtstr; + // TODO-LDML: remove markers! + ldml::normalize_nfc(ctxtstr_cleanedup, status); + + // find common prefix + auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end()); + /** the part of the old str that changed */ + std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfc.end()); + std::u32string new_ctxtstr_changed(ctxt_prefix.second,ctxtstr_cleanedup.end()); + + // drop the old suffix. Note: this mutates old_ctxtstr_changed. + remove_text(state, old_ctxtstr_changed, old_ctxtstr_changed.length()); + assert(old_ctxtstr_changed.length() == 0); + emit_text(state, new_ctxtstr_changed); +} + +void +ldml_processor::remove_text(km_core_state *state, std::u32string &str, size_t length) { + /** how many context items need to be removed */ + size_t contextRemoved = 0; + for (auto c = state->context().rbegin(); length > 0 && c != state->context().rend(); c++, contextRemoved++) { + /** last char of context */ + km_core_usv lastCtx = str.back(); + uint8_t type = c->type; + assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER); + if (type == KM_CORE_BT_CHAR) { + // single char, drop it + length--; + assert(c->character == lastCtx); + str.pop_back(); + state->actions().push_backspace(KM_CORE_BT_CHAR, c->character); // Cause prior char to be removed + } else if (type == KM_CORE_BT_MARKER) { + // it's a marker, 'worth' 3 uchars + assert(length >= 3); + assert(lastCtx == c->marker); // end of list + length -= 3; + // pop off the three-part sentinel string + str.pop_back(); + str.pop_back(); + str.pop_back(); + // push a special backspace to delete the marker + state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker); + } + } + // now, pop the right number of context items + for (size_t i = 0; i < contextRemoved; i++) { + // we don't pop during the above loop because the iterator gets confused + state->context().pop_back(); + } } km_core_attr const & ldml_processor::attributes() const { @@ -402,10 +422,10 @@ ldml_processor::emit_marker(km_core_state *state, KMX_DWORD marker_no) { } size_t -ldml_processor::context_to_string(km_core_state *state, std::u32string &str) { +ldml_processor::context_to_string(km_core_state *state, std::u32string &str, bool include_markers) { str.clear(); auto &cp = state->context(); - size_t ctxlen = 0; // TODO-LDML: is this needed? + size_t ctxlen = 0; // TODO-LDML: not used by callers? uint8_t last_type = KM_CORE_BT_UNKNOWN; for (auto c = cp.rbegin(); c != cp.rend(); c++, ctxlen++) { last_type = c->type; @@ -413,7 +433,9 @@ ldml_processor::context_to_string(km_core_state *state, std::u32string &str) { str.insert(0, 1, c->character); } else if (last_type == KM_CORE_BT_MARKER) { assert(km::kbp::kmx::is_valid_marker(c->marker)); - prepend_marker(str, c->marker); + if (include_markers) { + prepend_marker(str, c->marker); + } } else { break; } @@ -421,6 +443,5 @@ ldml_processor::context_to_string(km_core_state *state, std::u32string &str) { return ctxlen; // consumed the entire context buffer. } - } // namespace kbp } // namespace km diff --git a/core/src/ldml/ldml_processor.hpp b/core/src/ldml/ldml_processor.hpp index f061ca6e43f..0845aa70ac9 100644 --- a/core/src/ldml/ldml_processor.hpp +++ b/core/src/ldml/ldml_processor.hpp @@ -93,6 +93,12 @@ namespace kbp { static void emit_text(km_core_state *state, km_core_usv ch); /** emit a marker */ static void emit_marker(km_core_state *state, KMX_DWORD marker); + /** + * Delete text from the state. + * @param str string with text to remove, from the end + * @param length number of chars from the end of str to drop + */ + static void remove_text(km_core_state *state, std::u32string &str, size_t length); /** process a typed key */ void process_key_string(km_core_state *state, const std::u16string &key_str) const; @@ -103,7 +109,7 @@ namespace kbp { * Convert markers into the UC_SENTINEL format. * @return the number of context items consumed */ - static size_t context_to_string(km_core_state *state, std::u32string &str); + static size_t context_to_string(km_core_state *state, std::u32string &str, bool include_markers = true); /** prepend the marker string in UC_SENTINEL format to the str */ inline static void prepend_marker(std::u32string &str, KMX_DWORD marker); diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 842b6e6a638..651919312bf 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -846,6 +846,7 @@ transforms::load( // string +// TODO-LDML: copypasta -> refactor std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status) { const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); if (U_FAILURE(status)) { @@ -873,6 +874,7 @@ std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status) { return str; } +// TODO-LDML: copypasta -> refactor std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status) { const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); if (U_FAILURE(status)) { @@ -888,6 +890,51 @@ std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status) { return str; } +// TODO-LDML: copypasta -> refactor +std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status) { + const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status); + if (U_FAILURE(status)) { + return str; + } + icu::UnicodeString dest; + const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); + icu::UnicodeString src = icu::UnicodeString(rstr.data(), (int32_t)rstr.length()); + nfc->normalize(src, dest, status); + if (U_FAILURE(status)) { + return str; + } + + UErrorCode preflightStatus = U_ZERO_ERROR; + // calculate how big the buffer is + auto out32len = dest.toUTF32(nullptr, 0, preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes + // allocate + char32_t *s = new char32_t[out32len + 1]; + assert(s != nullptr); + // convert + dest.toUTF32((UChar32 *)s, out32len + 1, status); + assert(U_SUCCESS(status)); + str.assign(s, out32len); + delete [] s; + return str; +} + +// TODO-LDML: copypasta -> refactor +std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status) { + const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status); + if (U_FAILURE(status)) { + return str; + } + icu::UnicodeString dest; + icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length()); + nfc->normalize(src, dest, status); + if (U_FAILURE(status)) { + return str; + } + str.assign(dest.getBuffer(), dest.length()); + return str; +} + + } // namespace ldml diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 27449f2db87..3a8e3cca428 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -277,6 +277,10 @@ class transforms { std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status); /** Normalize a u16string inplace. Returns a reference to the same string. */ std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status); +/** Normalize a u32string inplace. Returns a reference to the same string. */ +std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status); +/** Normalize a u16string inplace. Returns a reference to the same string. */ +std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status); } // namespace ldml } // namespace kbp diff --git a/core/tests/unit/ldml/keyboards/k_003_transform.xml b/core/tests/unit/ldml/keyboards/k_003_transform.xml index 5575bd01dcc..97984340122 100644 --- a/core/tests/unit/ldml/keyboards/k_003_transform.xml +++ b/core/tests/unit/ldml/keyboards/k_003_transform.xml @@ -4,7 +4,7 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboards.md#element-transform @@keys: [K_Q][K_U][K_BKQUOTE][K_E] -@@expected: que\u0302 +@@expected: qu\u00EA --> diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml index 8fd842ca4a8..025fe36a483 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml @@ -20,7 +20,7 @@ - + diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml index cb4cdb533ad..b66ff1a4b6a 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml @@ -13,11 +13,11 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke - - - + + + - + @@ -30,7 +30,7 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke - + @@ -46,7 +46,7 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke - + diff --git a/core/tests/unit/ldml/ldml_test_source.cpp b/core/tests/unit/ldml/ldml_test_source.cpp index 1a5905d1db3..4ad5d5a3a72 100644 --- a/core/tests/unit/ldml/ldml_test_source.cpp +++ b/core/tests/unit/ldml/ldml_test_source.cpp @@ -24,6 +24,7 @@ #include #include "ldml/keyboardprocessor_ldml.h" #include "ldml/ldml_processor.hpp" +#include "ldml/ldml_transforms.hpp" #include "path.hpp" #include "state.hpp" @@ -467,6 +468,9 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { if (as_check.is_string()) { fillin.type = LDML_ACTION_CHECK_EXPECTED; fillin.string = LdmlTestSource::parse_u8_source_string(as_check.get()); + UErrorCode status = U_ZERO_ERROR; + km::kbp::ldml::normalize_nfc(fillin.string, status); + assert(U_SUCCESS(status)); return; } @@ -485,6 +489,8 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { if (as_emit.is_string()) { fillin.type = LDML_ACTION_EMIT_STRING; fillin.string = LdmlTestSource::parse_u8_source_string(as_emit.get()); + UErrorCode status = U_ZERO_ERROR; + km::kbp::ldml::normalize_nfc(fillin.string, status); return; } @@ -502,7 +508,8 @@ int LdmlJsonTestSource::load(const nlohmann::json &data) { this->data = data; // TODO-LDML auto startContext = data["/startContext/to"_json_pointer]; context = LdmlTestSource::parse_u8_source_string(startContext); - + UErrorCode status = U_ZERO_ERROR; + km::kbp::ldml::normalize_nfc(context, status); return 0; } From b3de1ffc4abef1bb3dd8a5f69477a0604979adec Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 12 Oct 2023 17:38:38 -0500 Subject: [PATCH 08/18] =?UTF-8?q?feat(core):=20ldml=20normalization=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fix windows build For: #9468 --- core/subprojects/packagefiles/icu/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/subprojects/packagefiles/icu/meson.build b/core/subprojects/packagefiles/icu/meson.build index 128fc9fbf97..7b329e27df7 100644 --- a/core/subprojects/packagefiles/icu/meson.build +++ b/core/subprojects/packagefiles/icu/meson.build @@ -35,7 +35,7 @@ uconfig.set('U_ENABLE_DYLOAD', 0) # no DLL uconfig.set('U_CHECK_DYLOAD', 0) # no DLL uconfig.set('UCONFIG_NO_FILE_IO', 1) uconfig.set('UCONFIG_NO_LEGACY_CONVERSION', 1) # turn off file based codepage conversion -uconfig.set('UCONFIG_NO_NORMALIZATION', 1) # TODO-LDML: may want this +uconfig.set('UCONFIG_NO_NORMALIZATION', 0) uconfig.set('UCONFIG_NO_BREAK_ITERATION', 1) # TODO-LDML: may want this uconfig.set('UCONFIG_NO_IDNA', 1) uconfig.set('UCONFIG_NO_COLLATION', 1) From 95ad1c47e426f03c746f5111bd96b3738ed95ead Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 12 Oct 2023 17:46:41 -0500 Subject: [PATCH 09/18] =?UTF-8?q?feat(core):=20ldml=20normalization=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fix unnecessary test churn. For: #9468 --- core/tests/unit/ldml/keyboards/k_003_transform.xml | 2 +- core/tests/unit/ldml/keyboards/k_210_marker-test.xml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/core/tests/unit/ldml/keyboards/k_003_transform.xml b/core/tests/unit/ldml/keyboards/k_003_transform.xml index 97984340122..814b6829d8e 100644 --- a/core/tests/unit/ldml/keyboards/k_003_transform.xml +++ b/core/tests/unit/ldml/keyboards/k_003_transform.xml @@ -4,7 +4,7 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboards.md#element-transform @@keys: [K_Q][K_U][K_BKQUOTE][K_E] -@@expected: qu\u00EA +@@expected: qu\u00ea --> diff --git a/core/tests/unit/ldml/keyboards/k_210_marker-test.xml b/core/tests/unit/ldml/keyboards/k_210_marker-test.xml index 91fd7d4520f..24f104f4d71 100644 --- a/core/tests/unit/ldml/keyboards/k_210_marker-test.xml +++ b/core/tests/unit/ldml/keyboards/k_210_marker-test.xml @@ -7,13 +7,13 @@ - + - + @@ -29,12 +29,12 @@ - + - + From bf15feb1160a0d9994076eda87874f3d6e32e319 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 12 Oct 2023 19:13:52 -0500 Subject: [PATCH 10/18] =?UTF-8?q?feat(core):=20ldml=20normalization=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fix int -> auto For: #9468 --- core/tests/unit/ldml/ldml.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/tests/unit/ldml/ldml.cpp b/core/tests/unit/ldml/ldml.cpp index de0c9e5556c..85ba3ae19c3 100644 --- a/core/tests/unit/ldml/ldml.cpp +++ b/core/tests/unit/ldml/ldml.cpp @@ -340,9 +340,9 @@ int run_all_tests(const km::kbp::path &source, const km::kbp::path &compiled) { << "/" << n.first.c_str() << std::endl; } } - int all_count = json_tests.size(); - int fail_count = failures.size(); - int pass_count = all_count - fail_count; + auto all_count = json_tests.size(); + auto fail_count = failures.size(); + auto pass_count = all_count - fail_count; if (pass_count > 0) { std::wcout << console_color::fg(console_color::GREEN) << " +" << pass_count; } From 9ee183c0eec9f4c12b65c74239250827778b7da4 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Mon, 16 Oct 2023 17:21:39 -0500 Subject: [PATCH 11/18] =?UTF-8?q?fix(core):=20ldml=20update=20to=20normali?= =?UTF-8?q?zation=20per=20review=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - simplify normalize_nfc and normalize_nfd functions For: #9468 --- core/src/ldml/ldml_transforms.cpp | 56 +++++++------------------------ 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 651919312bf..f9027755044 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -848,29 +848,13 @@ transforms::load( // TODO-LDML: copypasta -> refactor std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status) { - const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); - if (U_FAILURE(status)) { - return str; - } - icu::UnicodeString dest; - const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); - icu::UnicodeString src = icu::UnicodeString(rstr.data(), (int32_t)rstr.length()); - nfd->normalize(src, dest, status); - if (U_FAILURE(status)) { - return str; - } + std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); - UErrorCode preflightStatus = U_ZERO_ERROR; - // calculate how big the buffer is - auto out32len = dest.toUTF32(nullptr, 0, preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes - // allocate - char32_t *s = new char32_t[out32len + 1]; - assert(s != nullptr); - // convert - dest.toUTF32((UChar32 *)s, out32len + 1, status); - assert(U_SUCCESS(status)); - str.assign(s, out32len); - delete [] s; + normalize_nfd(rstr, status); + if (U_SUCCESS(status)) { + // if failure, leave it alone + str = km::kbp::kmx::u16string_to_u32string(rstr); + } return str; } @@ -892,29 +876,13 @@ std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status) { // TODO-LDML: copypasta -> refactor std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status) { - const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status); - if (U_FAILURE(status)) { - return str; - } - icu::UnicodeString dest; - const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); - icu::UnicodeString src = icu::UnicodeString(rstr.data(), (int32_t)rstr.length()); - nfc->normalize(src, dest, status); - if (U_FAILURE(status)) { - return str; - } + std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); - UErrorCode preflightStatus = U_ZERO_ERROR; - // calculate how big the buffer is - auto out32len = dest.toUTF32(nullptr, 0, preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes - // allocate - char32_t *s = new char32_t[out32len + 1]; - assert(s != nullptr); - // convert - dest.toUTF32((UChar32 *)s, out32len + 1, status); - assert(U_SUCCESS(status)); - str.assign(s, out32len); - delete [] s; + normalize_nfc(rstr, status); + if (U_SUCCESS(status)) { + // if failure, leave it alone + str = km::kbp::kmx::u16string_to_u32string(rstr); + } return str; } From 93e1c18af2b080f1e698af3697caf6ea0f0d28fb Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Mon, 16 Oct 2023 20:22:52 -0500 Subject: [PATCH 12/18] =?UTF-8?q?fix(core):=20ldml=20update=20to=20normali?= =?UTF-8?q?zation=20per=20review=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For: #9468 Co-authored-by: Marc Durdin --- core/src/ldml/ldml_processor.cpp | 4 +++- core/src/ldml/ldml_transforms.hpp | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index 64237e5b2f8..cbb9d9f57be 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -263,11 +263,13 @@ void ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const { UErrorCode status = U_ZERO_ERROR; // We know that key_str is not empty per the caller. + assert(!key_str.empty()); // we convert the keys str to UTF-32 here instead of using the emit_text() overload // so that we don't have to reconvert it inside the transform code. std::u32string key_str32 = kmx::u16string_to_u32string(key_str); - // normalize the keystroke to NFD + ldml::normalize_nfd(key_str32, status); + assert(U_SUCCESS(status)); ldml::normalize_nfd(key_str32, status); // extract context string, in NFC diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 3a8e3cca428..f482e56a992 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -273,13 +273,13 @@ class transforms { // string routines -/** Normalize a u32string inplace. Returns a reference to the same string. */ +/** Normalize a u32string inplace to NFD. Returns a reference to the same string. */ std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status); -/** Normalize a u16string inplace. Returns a reference to the same string. */ +/** Normalize a u16string inplace to NFD. Returns a reference to the same string. */ std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status); -/** Normalize a u32string inplace. Returns a reference to the same string. */ +/** Normalize a u32string inplace to NFC. Returns a reference to the same string. */ std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status); -/** Normalize a u16string inplace. Returns a reference to the same string. */ +/** Normalize a u16string inplace to NFC. Returns a reference to the same string. */ std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status); } // namespace ldml From 9dbf70c94ea71c3313b80d31f903778c20182bbd Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 18 Oct 2023 17:47:56 -0500 Subject: [PATCH 13/18] =?UTF-8?q?fix(core):=20ldml=20more=20updates=20to?= =?UTF-8?q?=20normalization=20per=20review=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - verify marker string as it's popped off - add a missing pragma to debuglog.h - move UErrorCode out of ldml_processor, change normalization functions to return true on success For: #9468 --- core/src/debuglog.h | 2 + core/src/ldml/ldml_processor.cpp | 21 +++---- core/src/ldml/ldml_transforms.cpp | 77 ++++++++++------------- core/src/ldml/ldml_transforms.hpp | 24 ++++--- core/tests/unit/ldml/ldml_test_source.cpp | 10 +-- 5 files changed, 63 insertions(+), 71 deletions(-) diff --git a/core/src/debuglog.h b/core/src/debuglog.h index 1ecbaca6124..5b547132e7f 100644 --- a/core/src/debuglog.h +++ b/core/src/debuglog.h @@ -1,5 +1,7 @@ /* Debugging */ +#pragma once + #include namespace km { diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index cbb9d9f57be..5d995be4184 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -261,30 +261,25 @@ ldml_processor::process_event( void ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const { - UErrorCode status = U_ZERO_ERROR; // We know that key_str is not empty per the caller. assert(!key_str.empty()); // we convert the keys str to UTF-32 here instead of using the emit_text() overload // so that we don't have to reconvert it inside the transform code. std::u32string key_str32 = kmx::u16string_to_u32string(key_str); - ldml::normalize_nfd(key_str32, status); - assert(U_SUCCESS(status)); - ldml::normalize_nfd(key_str32, status); + assert(ldml::normalize_nfd(key_str32)); // TODO-LDML: else fail? // extract context string, in NFC std::u32string old_ctxtstr_nfc; (void)context_to_string(state, old_ctxtstr_nfc, false); - ldml::normalize_nfc(old_ctxtstr_nfc, status); - assert(U_SUCCESS(status)); + assert(ldml::normalize_nfc(old_ctxtstr_nfc)); // TODO-LDML: else fail? // context string in NFD std::u32string ctxtstr; (void)context_to_string(state, ctxtstr, true); // with markers // add the newly added key output to ctxtstr ctxtstr.append(key_str32); - ldml::normalize_nfd(ctxtstr, status); - assert(U_SUCCESS(status)); + assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail? /** transform output string */ std::u32string outputString; @@ -302,15 +297,14 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k // drop last 'matchedContext': ctxtstr.resize(ctxtstr.length() - matchedContext); ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here. - ldml::normalize_nfd(ctxtstr, status); - assert(U_SUCCESS(status)); + assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail? // Ok. We've done all the happy manipulations. /** NFC and no markers */ std::u32string ctxtstr_cleanedup = ctxtstr; // TODO-LDML: remove markers! - ldml::normalize_nfc(ctxtstr_cleanedup, status); + assert(ldml::normalize_nfc(ctxtstr_cleanedup)); // TODO-LDML: else fail? // find common prefix auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end()); @@ -344,9 +338,12 @@ ldml_processor::remove_text(km_core_state *state, std::u32string &str, size_t le assert(length >= 3); assert(lastCtx == c->marker); // end of list length -= 3; - // pop off the three-part sentinel string + // pop off the three-part sentinel string (in reverse order of course) + assert(str.back() == c->marker); // marker # str.pop_back(); + assert(str.back() == LDML_MARKER_CODE); str.pop_back(); + assert(str.back() == LDML_UC_SENTINEL); str.pop_back(); // push a special backspace to delete the marker state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker); diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index f9027755044..0ee01204658 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -470,7 +470,7 @@ transform_entry::init() { // NFD normalize on pattern creation nfd->normalize(patustr_raw, patustr, status); fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); - assert(U_SUCCESS(status)); // TODO-LDML: may be best to propagate status up ^^ + UASSERT_SUCCESS(status); } } @@ -484,7 +484,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length()); // TODO-LDML: create a new Matcher every time. These could be cached and reset. std::unique_ptr matcher(fFromPattern->matcher(matchustr, status)); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); if (!matcher->find(status)) { // i.e. matches somewhere, in this case at end of str return 0; // no match @@ -494,7 +494,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons // TODO-LDML: if we had an underlying UText this would be simpler. int32_t matchStart = matcher->start(status); int32_t matchEnd = matcher->end(status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); // extract.. const icu::UnicodeString substr = matchustr.tempSubStringBetween(matchStart, matchEnd); // preflight to UTF-32 to get length @@ -521,7 +521,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons // we actually need the group(1) string here. // this is only the content in parenthesis () icu::UnicodeString group1 = matcher->group(1, status); - assert(U_SUCCESS(status)); // TODO-LDML: could be a malformed from pattern + UASSERT_SUCCESS(status); // TODO-LDML: could be a malformed from pattern // now, how long is group1 in UTF-32, hmm? UErrorCode preflightStatus = U_ZERO_ERROR; // throwaway status auto group1Len = group1.toUTF32(nullptr, 0, preflightStatus); @@ -529,7 +529,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons assert(s != nullptr); // TODO-LDML: OOM // convert substr.toUTF32((UChar32 *)s, group1Len + 1, status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); std::u32string match32(s, group1Len); // taken from just group1 // clean up buffer delete [] s; @@ -552,10 +552,10 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); icu::UnicodeString rustr2; nfd->normalize(rustr, rustr2, status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); // here we replace the match output. icu::UnicodeString entireOutput = matcher->replaceFirst(rustr2, status); - assert(U_SUCCESS(status)); // TODO-LDML: could fail here due to bad input (syntax err) + UASSERT_SUCCESS(status); // TODO-LDML: could fail here due to bad input (syntax err) // entireOutput includes all of 'input', but modified. Need to substring it. icu::UnicodeString outu_raw = entireOutput.tempSubString(matchStart); @@ -563,7 +563,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons // normalize the replaced string icu::UnicodeString outu; nfd->normalize(outu_raw, outu, status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); // Special case if there's no output, save some allocs if (outu.length() == 0) { @@ -578,7 +578,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons assert(s != nullptr); // convert outu.toUTF32((UChar32 *)s, out32len + 1, status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); output.assign(s, out32len); // now, build a u32string std::u32string out32(s, out32len); @@ -844,67 +844,56 @@ transforms::load( return transforms; } -// string +// string manipulation -// TODO-LDML: copypasta -> refactor -std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status) { +bool normalize_nfd(std::u32string &str) { std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); - normalize_nfd(rstr, status); - if (U_SUCCESS(status)) { - // if failure, leave it alone + if(!normalize_nfd(rstr)) { + return false; + } else { str = km::kbp::kmx::u16string_to_u32string(rstr); + return true; } - return str; } -// TODO-LDML: copypasta -> refactor -std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status) { +bool normalize_nfd(std::u16string &str) { + UErrorCode status = U_ZERO_ERROR; const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); - if (U_FAILURE(status)) { - return str; - } + UASSERT_SUCCESS(status); + if (U_FAILURE(status)) return false; // exit early since normalizer pointer could be nullptr icu::UnicodeString dest; icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length()); nfd->normalize(src, dest, status); - if (U_FAILURE(status)) { - return str; - } + UASSERT_SUCCESS(status); str.assign(dest.getBuffer(), dest.length()); - return str; + return U_SUCCESS(status); } -// TODO-LDML: copypasta -> refactor -std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status) { +bool normalize_nfc(std::u32string &str) { std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); - - normalize_nfc(rstr, status); - if (U_SUCCESS(status)) { - // if failure, leave it alone + if(!normalize_nfc(rstr)) { + return false; + } else { str = km::kbp::kmx::u16string_to_u32string(rstr); + return true; } - return str; } -// TODO-LDML: copypasta -> refactor -std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status) { +bool normalize_nfc(std::u16string &str) { + UErrorCode status = U_ZERO_ERROR; const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status); - if (U_FAILURE(status)) { - return str; - } + UASSERT_SUCCESS(status); + if (U_FAILURE(status)) return false; // exit early since normalizer pointer could be nullptr + icu::UnicodeString dest; icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length()); nfc->normalize(src, dest, status); - if (U_FAILURE(status)) { - return str; - } + UASSERT_SUCCESS(status); str.assign(dest.getBuffer(), dest.length()); - return str; + return U_SUCCESS(status); } - - - } // namespace ldml } // namespace kbp } // namespace km diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index f482e56a992..d7c5f2c6bf7 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -13,6 +13,7 @@ #include #include #include +#include "debuglog.h" #if !defined(HAVE_ICU4C) #error icu4c is required for this code @@ -31,6 +32,13 @@ namespace km { namespace kbp { namespace ldml { +// macro for working with ICU calls +#define UASSERT_SUCCESS(status) \ + if (U_FAILURE(status)) { \ + DebugLog("U_FAILURE(%s)", u_errorName(status)); \ + assert(U_SUCCESS(status)); \ + } + using km::kbp::kmx::SimpleUSet; /** @@ -273,14 +281,14 @@ class transforms { // string routines -/** Normalize a u32string inplace to NFD. Returns a reference to the same string. */ -std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status); -/** Normalize a u16string inplace to NFD. Returns a reference to the same string. */ -std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status); -/** Normalize a u32string inplace to NFC. Returns a reference to the same string. */ -std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status); -/** Normalize a u16string inplace to NFC. Returns a reference to the same string. */ -std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status); +/** Normalize a u32string inplace to NFD. @return false on failure */ +bool normalize_nfd(std::u32string &str); +/** Normalize a u16string inplace to NFD. @return false on failure */ +bool normalize_nfd(std::u16string &str); +/** Normalize a u32string inplace to NFC. @return false on failure */ +bool normalize_nfc(std::u32string &str); +/** Normalize a u16string inplace to NFC. @return false on failure */ +bool normalize_nfc(std::u16string &str); } // namespace ldml } // namespace kbp diff --git a/core/tests/unit/ldml/ldml_test_source.cpp b/core/tests/unit/ldml/ldml_test_source.cpp index 4ad5d5a3a72..ab8423f7648 100644 --- a/core/tests/unit/ldml/ldml_test_source.cpp +++ b/core/tests/unit/ldml/ldml_test_source.cpp @@ -468,9 +468,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { if (as_check.is_string()) { fillin.type = LDML_ACTION_CHECK_EXPECTED; fillin.string = LdmlTestSource::parse_u8_source_string(as_check.get()); - UErrorCode status = U_ZERO_ERROR; - km::kbp::ldml::normalize_nfc(fillin.string, status); - assert(U_SUCCESS(status)); + assert(km::kbp::ldml::normalize_nfc(fillin.string)); return; } @@ -489,8 +487,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { if (as_emit.is_string()) { fillin.type = LDML_ACTION_EMIT_STRING; fillin.string = LdmlTestSource::parse_u8_source_string(as_emit.get()); - UErrorCode status = U_ZERO_ERROR; - km::kbp::ldml::normalize_nfc(fillin.string, status); + assert(km::kbp::ldml::normalize_nfc(fillin.string)); return; } @@ -508,8 +505,7 @@ int LdmlJsonTestSource::load(const nlohmann::json &data) { this->data = data; // TODO-LDML auto startContext = data["/startContext/to"_json_pointer]; context = LdmlTestSource::parse_u8_source_string(startContext); - UErrorCode status = U_ZERO_ERROR; - km::kbp::ldml::normalize_nfc(context, status); + assert(km::kbp::ldml::normalize_nfc(context)); return 0; } From f1b12a18482d27c8a2292884e4bd7c78367bd618 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 18 Oct 2023 17:55:08 -0500 Subject: [PATCH 14/18] =?UTF-8?q?fix(core):=20ldml=20more=20updates=20to?= =?UTF-8?q?=20normalization=20per=20review=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - refactor internal normalization call For: #9468 --- core/src/ldml/ldml_transforms.cpp | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 0ee01204658..b3669661ce9 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -857,19 +857,25 @@ bool normalize_nfd(std::u32string &str) { } } -bool normalize_nfd(std::u16string &str) { - UErrorCode status = U_ZERO_ERROR; - const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); +/** internal function to normalize with a specified mode */ +static bool normalize(const icu::Normalizer2 *n, std::u16string &str, UErrorCode &status) { UASSERT_SUCCESS(status); - if (U_FAILURE(status)) return false; // exit early since normalizer pointer could be nullptr + assert(n != nullptr); icu::UnicodeString dest; icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length()); - nfd->normalize(src, dest, status); + n->normalize(src, dest, status); UASSERT_SUCCESS(status); str.assign(dest.getBuffer(), dest.length()); return U_SUCCESS(status); } +bool normalize_nfd(std::u16string &str) { + UErrorCode status = U_ZERO_ERROR; + const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); + UASSERT_SUCCESS(status); + return normalize(nfd, str, status); +} + bool normalize_nfc(std::u32string &str) { std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); if(!normalize_nfc(rstr)) { @@ -884,14 +890,7 @@ bool normalize_nfc(std::u16string &str) { UErrorCode status = U_ZERO_ERROR; const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status); UASSERT_SUCCESS(status); - if (U_FAILURE(status)) return false; // exit early since normalizer pointer could be nullptr - - icu::UnicodeString dest; - icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length()); - nfc->normalize(src, dest, status); - UASSERT_SUCCESS(status); - str.assign(dest.getBuffer(), dest.length()); - return U_SUCCESS(status); + return normalize(nfc, str, status); } } // namespace ldml From 3723e6b3d756c452f8d799404dcd10756e610683 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 18 Oct 2023 17:58:05 -0500 Subject: [PATCH 15/18] =?UTF-8?q?fix(core):=20ldml=20more=20updates=20to?= =?UTF-8?q?=20normalization=20per=20review=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - consistency For: #9468 --- core/src/ldml/ldml_transforms.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index b3669661ce9..9becb93ba78 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -865,7 +865,9 @@ static bool normalize(const icu::Normalizer2 *n, std::u16string &str, UErrorCode icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length()); n->normalize(src, dest, status); UASSERT_SUCCESS(status); - str.assign(dest.getBuffer(), dest.length()); + if (U_SUCCESS(status)) { + str.assign(dest.getBuffer(), dest.length()); + } return U_SUCCESS(status); } From 4afe0387d63ba99f03a0faff6bb93bfcfe0de81b Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 19 Oct 2023 13:16:03 -0500 Subject: [PATCH 16/18] =?UTF-8?q?fix(core):=20ldml=20more=20updates=20to?= =?UTF-8?q?=20normalization=20per=20review=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - move meat of uassert_success() into an inline For: #9468 --- core/src/debuglog.h | 2 ++ core/src/ldml/ldml_transforms.hpp | 16 +++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/core/src/debuglog.h b/core/src/debuglog.h index 5b547132e7f..944695d669f 100644 --- a/core/src/debuglog.h +++ b/core/src/debuglog.h @@ -20,10 +20,12 @@ extern const char *s_key_names[]; #ifdef _MSC_VER #define DebugLog(msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(__FILE__, __LINE__, __FUNCTION__, (msg),__VA_ARGS__) : 0) +#define DebugLog2(file,line,function,msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(file, line, function, (msg),__VA_ARGS__) : 0) #define console_error(msg,...) write_console(TRUE, (msg), __VA_ARGS__) #define console_log(msg,...) write_console(FALSE, (msg), __VA_ARGS__) #else #define DebugLog(msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(__FILE__, __LINE__, __FUNCTION__, (msg), ##__VA_ARGS__) : 0) +#define DebugLog2(file,line,function,msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(file, line, function, (msg), ##__VA_ARGS__) : 0) #define console_error(msg,...) write_console(TRUE, (msg), ##__VA_ARGS__) #define console_log(msg,...) write_console(FALSE, (msg), ##__VA_ARGS__) #endif diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index d7c5f2c6bf7..f48ab8a47d8 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -32,12 +32,18 @@ namespace km { namespace kbp { namespace ldml { -// macro for working with ICU calls -#define UASSERT_SUCCESS(status) \ - if (U_FAILURE(status)) { \ - DebugLog("U_FAILURE(%s)", u_errorName(status)); \ - assert(U_SUCCESS(status)); \ +/** @returns true on success */ +inline bool uassert_success(const char *file, int line, const char *function, UErrorCode status) { + if (U_FAILURE(status)) { + DebugLog2(file, line, function, "U_FAILURE(%s)", u_errorName(status)); + return false; + } else { + return true; } +} + +#define UASSERT_SUCCESS(status) assert(U_SUCCESS(status)); \ + uassert_success(__FILE__, __LINE__, __FUNCTION__, status) using km::kbp::kmx::SimpleUSet; From 1d9820b02096402778b2d2c4aad0de92d1abc29d Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 19 Oct 2023 13:33:55 -0500 Subject: [PATCH 17/18] =?UTF-8?q?fix(core):=20ldml=20more=20updates=20to?= =?UTF-8?q?=20normalization=20per=20review=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - improve assert failure handling- fix a potential heap leak For: #9468 --- core/src/ldml/ldml_transforms.cpp | 67 ++++++++++++++++++------------- core/src/ldml/ldml_transforms.hpp | 10 ++--- 2 files changed, 43 insertions(+), 34 deletions(-) diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 9becb93ba78..b2bfdaaa040 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -12,7 +12,7 @@ #include "kmx/kmx_xstring.h" #ifndef assert -#define assert(x) // TODO-LDML +#define assert(x) ((void)0) #endif namespace km { @@ -403,7 +403,7 @@ transform_entry::transform_entry(const transform_entry &other) transform_entry::transform_entry(const std::u32string &from, const std::u32string &to) : fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(), fMapToStrId(), fMapFromList(), fMapToList() { - assert(!fFrom.empty()); // TODO-LDML: should not happen? + assert(!fFrom.empty()); init(); } @@ -414,14 +414,16 @@ transform_entry::transform_entry( const std::u32string &to, KMX_DWORD mapFrom, KMX_DWORD mapTo, - const kmx::kmx_plus &kplus) + const kmx::kmx_plus &kplus, + bool &valid) : fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(mapFrom), fMapToStrId(mapTo) { + valid = false; assert(!fFrom.empty()); // TODO-LDML: should not happen? assert((fMapFromStrId == 0) == (fMapToStrId == 0)); // we have both or we have neither. assert(kplus.strs != nullptr); assert(kplus.vars != nullptr); assert(kplus.elem != nullptr); - init(); + valid = init(); // setup mapFrom if (fMapFromStrId != 0) { @@ -456,7 +458,7 @@ transform_entry::transform_entry( } } -void +bool transform_entry::init() { if (!fFrom.empty()) { // TODO-LDML: if we have mapFrom, may need to do other processing. @@ -470,8 +472,9 @@ transform_entry::init() { // NFD normalize on pattern creation nfd->normalize(patustr_raw, patustr, status); fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); - UASSERT_SUCCESS(status); + return (UASSERT_SUCCESS(status)); } + return false; // fFrom should not be empty. } size_t @@ -767,35 +770,34 @@ transforms::load( const kmx::kmx_plus &kplus, const kbp::kmx::COMP_KMXPLUS_TRAN *tran, const kbp::kmx::COMP_KMXPLUS_TRAN_Helper &tranHelper) { + bool valid = true; if (tran == nullptr) { DebugLog("for tran: tran is null"); - assert(false); - return nullptr; - } - if (!tranHelper.valid()) { + valid = false; + } else if (!tranHelper.valid()) { DebugLog("for tran: tranHelper is invalid"); - assert(false); - return nullptr; - } - if (nullptr == kplus.elem) { + valid = false; + } else if (nullptr == kplus.elem) { DebugLog("for tran: kplus.elem == nullptr"); - assert(false); - return nullptr; - } - if (nullptr == kplus.strs) { + valid = false; + } else if (nullptr == kplus.strs) { DebugLog("for tran: kplus.strs == nullptr"); // need a string table to get strings - assert(false); - return nullptr; - } - if (nullptr == kplus.vars) { + valid = false; + } else if (nullptr == kplus.vars) { DebugLog("for tran: kplus.vars == nullptr"); // need a vars table to get maps - assert(false); + valid = false; + } + + assert(valid); + if (!valid) { return nullptr; } // with that out of the way, let's set it up - transforms *transforms = new ldml::transforms(); + std::unique_ptr transforms; + + transforms.reset(new ldml::transforms()); for (KMX_DWORD groupNumber = 0; groupNumber < tran->groupCount; groupNumber++) { const kmx::COMP_KMXPLUS_TRAN_GROUP *group = tranHelper.getGroup(groupNumber); @@ -811,7 +813,11 @@ transforms::load( const std::u32string toStr = kmx::u16string_to_u32string(kplus.strs->get(element->to)); KMX_DWORD mapFrom = element->mapFrom; // copy, because of alignment KMX_DWORD mapTo = element->mapTo; // copy, because of alignment - newGroup.emplace_back(fromStr, toStr, mapFrom, mapTo, kplus); // creating a transform_entry + newGroup.emplace_back(fromStr, toStr, mapFrom, mapTo, kplus, valid); // creating a transform_entry + assert(valid); + if(!valid) { + return nullptr; + } } transforms->addGroup(newGroup); } else if (group->type == LDML_TRAN_GROUP_TYPE_REORDER) { @@ -841,14 +847,18 @@ transforms::load( return nullptr; } } - return transforms; + assert(valid); + if (!valid) { + return nullptr; + } else { + return transforms.release(); + } } // string manipulation bool normalize_nfd(std::u32string &str) { std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); - if(!normalize_nfd(rstr)) { return false; } else { @@ -864,8 +874,7 @@ static bool normalize(const icu::Normalizer2 *n, std::u16string &str, UErrorCode icu::UnicodeString dest; icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length()); n->normalize(src, dest, status); - UASSERT_SUCCESS(status); - if (U_SUCCESS(status)) { + if (UASSERT_SUCCESS(status)) { str.assign(dest.getBuffer(), dest.length()); } return U_SUCCESS(status); diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index f48ab8a47d8..56cac24181c 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -42,8 +42,7 @@ inline bool uassert_success(const char *file, int line, const char *function, UE } } -#define UASSERT_SUCCESS(status) assert(U_SUCCESS(status)); \ - uassert_success(__FILE__, __LINE__, __FUNCTION__, status) +#define UASSERT_SUCCESS(status) assert(U_SUCCESS(status)), uassert_success(__FILE__, __LINE__, __FUNCTION__, status) using km::kbp::kmx::SimpleUSet; @@ -109,7 +108,8 @@ class transform_entry { const std::u32string &to, KMX_DWORD mapFrom, KMX_DWORD mapTo, - const kmx::kmx_plus &kplus); + const kmx::kmx_plus &kplus, + bool &valid); /** * If matching, apply the match to the output string @@ -128,8 +128,8 @@ class transform_entry { const KMX_DWORD fMapToStrId; std::deque fMapFromList; std::deque fMapToList; - /** Internal function to setup pattern string */ - void init(); + /** Internal function to setup pattern string @returns true on success */ + bool init(); /** @returns the index of the item in the fMapFromList list, or -1 */ int32_t findIndexFrom(const std::u32string &match) const; public: From 4cdc2c3b5476dcefb43ba43a00864a8ef0bc4aa0 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 19 Oct 2023 17:09:37 -0500 Subject: [PATCH 18/18] =?UTF-8?q?fix(core):=20ldml=20more=20updates=20to?= =?UTF-8?q?=20normalization=20per=20review=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - reduce function depth - catch another gotcha (valid was being overwritten) For: #9468 --- core/src/ldml/ldml_transforms.cpp | 42 ++++++++++++++++++------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index b2bfdaaa040..56a63278764 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -408,7 +408,6 @@ transform_entry::transform_entry(const std::u32string &from, const std::u32strin init(); } -// TODO-LDML: How do we return errors from here? transform_entry::transform_entry( const std::u32string &from, const std::u32string &to, @@ -417,13 +416,16 @@ transform_entry::transform_entry( const kmx::kmx_plus &kplus, bool &valid) : fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(mapFrom), fMapToStrId(mapTo) { - valid = false; + if (!valid) + return; // exit early assert(!fFrom.empty()); // TODO-LDML: should not happen? assert((fMapFromStrId == 0) == (fMapToStrId == 0)); // we have both or we have neither. assert(kplus.strs != nullptr); assert(kplus.vars != nullptr); assert(kplus.elem != nullptr); - valid = init(); + if(!init()) { + valid = false; + } // setup mapFrom if (fMapFromStrId != 0) { @@ -460,21 +462,21 @@ transform_entry::transform_entry( bool transform_entry::init() { - if (!fFrom.empty()) { - // TODO-LDML: if we have mapFrom, may need to do other processing. - const std::u16string patstr = km::kbp::kmx::u32string_to_u16string(fFrom); - UErrorCode status = U_ZERO_ERROR; - /* const */ icu::UnicodeString patustr_raw = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); - // add '$' to match to end - patustr_raw.append(u'$'); - icu::UnicodeString patustr; - const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); - // NFD normalize on pattern creation - nfd->normalize(patustr_raw, patustr, status); - fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); - return (UASSERT_SUCCESS(status)); - } - return false; // fFrom should not be empty. + if (fFrom.empty()) { + return false; + } + // TODO-LDML: if we have mapFrom, may need to do other processing. + const std::u16string patstr = km::kbp::kmx::u32string_to_u16string(fFrom); + UErrorCode status = U_ZERO_ERROR; + /* const */ icu::UnicodeString patustr_raw = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); + // add '$' to match to end + patustr_raw.append(u'$'); + icu::UnicodeString patustr; + const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); + // NFD normalize on pattern creation + nfd->normalize(patustr_raw, patustr, status); + fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); + return (UASSERT_SUCCESS(status)); } size_t @@ -813,6 +815,10 @@ transforms::load( const std::u32string toStr = kmx::u16string_to_u32string(kplus.strs->get(element->to)); KMX_DWORD mapFrom = element->mapFrom; // copy, because of alignment KMX_DWORD mapTo = element->mapTo; // copy, because of alignment + assert(!fromStr.empty()); + if (fromStr.empty()) { + valid = false; + } newGroup.emplace_back(fromStr, toStr, mapFrom, mapTo, kplus, valid); // creating a transform_entry assert(valid); if(!valid) {