From 9dbf70c94ea71c3313b80d31f903778c20182bbd Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 18 Oct 2023 17:47:56 -0500 Subject: [PATCH] =?UTF-8?q?fix(core):=20ldml=20more=20updates=20to=20norma?= =?UTF-8?q?lization=20per=20review=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - verify marker string as it's popped off - add a missing pragma to debuglog.h - move UErrorCode out of ldml_processor, change normalization functions to return true on success For: #9468 --- core/src/debuglog.h | 2 + core/src/ldml/ldml_processor.cpp | 21 +++---- core/src/ldml/ldml_transforms.cpp | 77 ++++++++++------------- core/src/ldml/ldml_transforms.hpp | 24 ++++--- core/tests/unit/ldml/ldml_test_source.cpp | 10 +-- 5 files changed, 63 insertions(+), 71 deletions(-) diff --git a/core/src/debuglog.h b/core/src/debuglog.h index 1ecbaca6124..5b547132e7f 100644 --- a/core/src/debuglog.h +++ b/core/src/debuglog.h @@ -1,5 +1,7 @@ /* Debugging */ +#pragma once + #include namespace km { diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index cbb9d9f57be..5d995be4184 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -261,30 +261,25 @@ ldml_processor::process_event( void ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const { - UErrorCode status = U_ZERO_ERROR; // We know that key_str is not empty per the caller. assert(!key_str.empty()); // we convert the keys str to UTF-32 here instead of using the emit_text() overload // so that we don't have to reconvert it inside the transform code. std::u32string key_str32 = kmx::u16string_to_u32string(key_str); - ldml::normalize_nfd(key_str32, status); - assert(U_SUCCESS(status)); - ldml::normalize_nfd(key_str32, status); + assert(ldml::normalize_nfd(key_str32)); // TODO-LDML: else fail? // extract context string, in NFC std::u32string old_ctxtstr_nfc; (void)context_to_string(state, old_ctxtstr_nfc, false); - ldml::normalize_nfc(old_ctxtstr_nfc, status); - assert(U_SUCCESS(status)); + assert(ldml::normalize_nfc(old_ctxtstr_nfc)); // TODO-LDML: else fail? // context string in NFD std::u32string ctxtstr; (void)context_to_string(state, ctxtstr, true); // with markers // add the newly added key output to ctxtstr ctxtstr.append(key_str32); - ldml::normalize_nfd(ctxtstr, status); - assert(U_SUCCESS(status)); + assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail? /** transform output string */ std::u32string outputString; @@ -302,15 +297,14 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k // drop last 'matchedContext': ctxtstr.resize(ctxtstr.length() - matchedContext); ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here. - ldml::normalize_nfd(ctxtstr, status); - assert(U_SUCCESS(status)); + assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail? // Ok. We've done all the happy manipulations. /** NFC and no markers */ std::u32string ctxtstr_cleanedup = ctxtstr; // TODO-LDML: remove markers! - ldml::normalize_nfc(ctxtstr_cleanedup, status); + assert(ldml::normalize_nfc(ctxtstr_cleanedup)); // TODO-LDML: else fail? // find common prefix auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end()); @@ -344,9 +338,12 @@ ldml_processor::remove_text(km_core_state *state, std::u32string &str, size_t le assert(length >= 3); assert(lastCtx == c->marker); // end of list length -= 3; - // pop off the three-part sentinel string + // pop off the three-part sentinel string (in reverse order of course) + assert(str.back() == c->marker); // marker # str.pop_back(); + assert(str.back() == LDML_MARKER_CODE); str.pop_back(); + assert(str.back() == LDML_UC_SENTINEL); str.pop_back(); // push a special backspace to delete the marker state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker); diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index f9027755044..0ee01204658 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -470,7 +470,7 @@ transform_entry::init() { // NFD normalize on pattern creation nfd->normalize(patustr_raw, patustr, status); fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); - assert(U_SUCCESS(status)); // TODO-LDML: may be best to propagate status up ^^ + UASSERT_SUCCESS(status); } } @@ -484,7 +484,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length()); // TODO-LDML: create a new Matcher every time. These could be cached and reset. std::unique_ptr matcher(fFromPattern->matcher(matchustr, status)); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); if (!matcher->find(status)) { // i.e. matches somewhere, in this case at end of str return 0; // no match @@ -494,7 +494,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons // TODO-LDML: if we had an underlying UText this would be simpler. int32_t matchStart = matcher->start(status); int32_t matchEnd = matcher->end(status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); // extract.. const icu::UnicodeString substr = matchustr.tempSubStringBetween(matchStart, matchEnd); // preflight to UTF-32 to get length @@ -521,7 +521,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons // we actually need the group(1) string here. // this is only the content in parenthesis () icu::UnicodeString group1 = matcher->group(1, status); - assert(U_SUCCESS(status)); // TODO-LDML: could be a malformed from pattern + UASSERT_SUCCESS(status); // TODO-LDML: could be a malformed from pattern // now, how long is group1 in UTF-32, hmm? UErrorCode preflightStatus = U_ZERO_ERROR; // throwaway status auto group1Len = group1.toUTF32(nullptr, 0, preflightStatus); @@ -529,7 +529,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons assert(s != nullptr); // TODO-LDML: OOM // convert substr.toUTF32((UChar32 *)s, group1Len + 1, status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); std::u32string match32(s, group1Len); // taken from just group1 // clean up buffer delete [] s; @@ -552,10 +552,10 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); icu::UnicodeString rustr2; nfd->normalize(rustr, rustr2, status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); // here we replace the match output. icu::UnicodeString entireOutput = matcher->replaceFirst(rustr2, status); - assert(U_SUCCESS(status)); // TODO-LDML: could fail here due to bad input (syntax err) + UASSERT_SUCCESS(status); // TODO-LDML: could fail here due to bad input (syntax err) // entireOutput includes all of 'input', but modified. Need to substring it. icu::UnicodeString outu_raw = entireOutput.tempSubString(matchStart); @@ -563,7 +563,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons // normalize the replaced string icu::UnicodeString outu; nfd->normalize(outu_raw, outu, status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); // Special case if there's no output, save some allocs if (outu.length() == 0) { @@ -578,7 +578,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons assert(s != nullptr); // convert outu.toUTF32((UChar32 *)s, out32len + 1, status); - assert(U_SUCCESS(status)); + UASSERT_SUCCESS(status); output.assign(s, out32len); // now, build a u32string std::u32string out32(s, out32len); @@ -844,67 +844,56 @@ transforms::load( return transforms; } -// string +// string manipulation -// TODO-LDML: copypasta -> refactor -std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status) { +bool normalize_nfd(std::u32string &str) { std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); - normalize_nfd(rstr, status); - if (U_SUCCESS(status)) { - // if failure, leave it alone + if(!normalize_nfd(rstr)) { + return false; + } else { str = km::kbp::kmx::u16string_to_u32string(rstr); + return true; } - return str; } -// TODO-LDML: copypasta -> refactor -std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status) { +bool normalize_nfd(std::u16string &str) { + UErrorCode status = U_ZERO_ERROR; const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); - if (U_FAILURE(status)) { - return str; - } + UASSERT_SUCCESS(status); + if (U_FAILURE(status)) return false; // exit early since normalizer pointer could be nullptr icu::UnicodeString dest; icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length()); nfd->normalize(src, dest, status); - if (U_FAILURE(status)) { - return str; - } + UASSERT_SUCCESS(status); str.assign(dest.getBuffer(), dest.length()); - return str; + return U_SUCCESS(status); } -// TODO-LDML: copypasta -> refactor -std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status) { +bool normalize_nfc(std::u32string &str) { std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str); - - normalize_nfc(rstr, status); - if (U_SUCCESS(status)) { - // if failure, leave it alone + if(!normalize_nfc(rstr)) { + return false; + } else { str = km::kbp::kmx::u16string_to_u32string(rstr); + return true; } - return str; } -// TODO-LDML: copypasta -> refactor -std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status) { +bool normalize_nfc(std::u16string &str) { + UErrorCode status = U_ZERO_ERROR; const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status); - if (U_FAILURE(status)) { - return str; - } + UASSERT_SUCCESS(status); + if (U_FAILURE(status)) return false; // exit early since normalizer pointer could be nullptr + icu::UnicodeString dest; icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length()); nfc->normalize(src, dest, status); - if (U_FAILURE(status)) { - return str; - } + UASSERT_SUCCESS(status); str.assign(dest.getBuffer(), dest.length()); - return str; + return U_SUCCESS(status); } - - - } // namespace ldml } // namespace kbp } // namespace km diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index f482e56a992..d7c5f2c6bf7 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -13,6 +13,7 @@ #include #include #include +#include "debuglog.h" #if !defined(HAVE_ICU4C) #error icu4c is required for this code @@ -31,6 +32,13 @@ namespace km { namespace kbp { namespace ldml { +// macro for working with ICU calls +#define UASSERT_SUCCESS(status) \ + if (U_FAILURE(status)) { \ + DebugLog("U_FAILURE(%s)", u_errorName(status)); \ + assert(U_SUCCESS(status)); \ + } + using km::kbp::kmx::SimpleUSet; /** @@ -273,14 +281,14 @@ class transforms { // string routines -/** Normalize a u32string inplace to NFD. Returns a reference to the same string. */ -std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status); -/** Normalize a u16string inplace to NFD. Returns a reference to the same string. */ -std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status); -/** Normalize a u32string inplace to NFC. Returns a reference to the same string. */ -std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status); -/** Normalize a u16string inplace to NFC. Returns a reference to the same string. */ -std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status); +/** Normalize a u32string inplace to NFD. @return false on failure */ +bool normalize_nfd(std::u32string &str); +/** Normalize a u16string inplace to NFD. @return false on failure */ +bool normalize_nfd(std::u16string &str); +/** Normalize a u32string inplace to NFC. @return false on failure */ +bool normalize_nfc(std::u32string &str); +/** Normalize a u16string inplace to NFC. @return false on failure */ +bool normalize_nfc(std::u16string &str); } // namespace ldml } // namespace kbp diff --git a/core/tests/unit/ldml/ldml_test_source.cpp b/core/tests/unit/ldml/ldml_test_source.cpp index 4ad5d5a3a72..ab8423f7648 100644 --- a/core/tests/unit/ldml/ldml_test_source.cpp +++ b/core/tests/unit/ldml/ldml_test_source.cpp @@ -468,9 +468,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { if (as_check.is_string()) { fillin.type = LDML_ACTION_CHECK_EXPECTED; fillin.string = LdmlTestSource::parse_u8_source_string(as_check.get()); - UErrorCode status = U_ZERO_ERROR; - km::kbp::ldml::normalize_nfc(fillin.string, status); - assert(U_SUCCESS(status)); + assert(km::kbp::ldml::normalize_nfc(fillin.string)); return; } @@ -489,8 +487,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { if (as_emit.is_string()) { fillin.type = LDML_ACTION_EMIT_STRING; fillin.string = LdmlTestSource::parse_u8_source_string(as_emit.get()); - UErrorCode status = U_ZERO_ERROR; - km::kbp::ldml::normalize_nfc(fillin.string, status); + assert(km::kbp::ldml::normalize_nfc(fillin.string)); return; } @@ -508,8 +505,7 @@ int LdmlJsonTestSource::load(const nlohmann::json &data) { this->data = data; // TODO-LDML auto startContext = data["/startContext/to"_json_pointer]; context = LdmlTestSource::parse_u8_source_string(startContext); - UErrorCode status = U_ZERO_ERROR; - km::kbp::ldml::normalize_nfc(context, status); + assert(km::kbp::ldml::normalize_nfc(context)); return 0; }