Skip to content

Commit

Permalink
fix(core): ldml more updates to normalization per review 🙀
Browse files Browse the repository at this point in the history
- verify marker string as it's popped off
- add a missing pragma to debuglog.h
- move UErrorCode out of ldml_processor, change normalization functions to return true on success

For: #9468
  • Loading branch information
srl295 committed Oct 18, 2023
1 parent 93e1c18 commit 9dbf70c
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 71 deletions.
2 changes: 2 additions & 0 deletions core/src/debuglog.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
/* Debugging */

#pragma once

#include <keyman/keyman_core_api_bits.h>

namespace km {
Expand Down
21 changes: 9 additions & 12 deletions core/src/ldml/ldml_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,30 +261,25 @@ ldml_processor::process_event(

void
ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const {
UErrorCode status = U_ZERO_ERROR;
// We know that key_str is not empty per the caller.
assert(!key_str.empty());

// we convert the keys str to UTF-32 here instead of using the emit_text() overload
// so that we don't have to reconvert it inside the transform code.
std::u32string key_str32 = kmx::u16string_to_u32string(key_str);
ldml::normalize_nfd(key_str32, status);
assert(U_SUCCESS(status));
ldml::normalize_nfd(key_str32, status);
assert(ldml::normalize_nfd(key_str32)); // TODO-LDML: else fail?

// extract context string, in NFC
std::u32string old_ctxtstr_nfc;
(void)context_to_string(state, old_ctxtstr_nfc, false);
ldml::normalize_nfc(old_ctxtstr_nfc, status);
assert(U_SUCCESS(status));
assert(ldml::normalize_nfc(old_ctxtstr_nfc)); // TODO-LDML: else fail?

// context string in NFD
std::u32string ctxtstr;
(void)context_to_string(state, ctxtstr, true); // with markers
// add the newly added key output to ctxtstr
ctxtstr.append(key_str32);
ldml::normalize_nfd(ctxtstr, status);
assert(U_SUCCESS(status));
assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail?

/** transform output string */
std::u32string outputString;
Expand All @@ -302,15 +297,14 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k
// drop last 'matchedContext':
ctxtstr.resize(ctxtstr.length() - matchedContext);
ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here.
ldml::normalize_nfd(ctxtstr, status);
assert(U_SUCCESS(status));
assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail?

// Ok. We've done all the happy manipulations.

/** NFC and no markers */
std::u32string ctxtstr_cleanedup = ctxtstr;
// TODO-LDML: remove markers!
ldml::normalize_nfc(ctxtstr_cleanedup, status);
assert(ldml::normalize_nfc(ctxtstr_cleanedup)); // TODO-LDML: else fail?

// find common prefix
auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end());
Expand Down Expand Up @@ -344,9 +338,12 @@ ldml_processor::remove_text(km_core_state *state, std::u32string &str, size_t le
assert(length >= 3);
assert(lastCtx == c->marker); // end of list
length -= 3;
// pop off the three-part sentinel string
// pop off the three-part sentinel string (in reverse order of course)
assert(str.back() == c->marker); // marker #
str.pop_back();
assert(str.back() == LDML_MARKER_CODE);
str.pop_back();
assert(str.back() == LDML_UC_SENTINEL);
str.pop_back();
// push a special backspace to delete the marker
state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker);
Expand Down
77 changes: 33 additions & 44 deletions core/src/ldml/ldml_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ transform_entry::init() {
// NFD normalize on pattern creation
nfd->normalize(patustr_raw, patustr, status);
fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status));
assert(U_SUCCESS(status)); // TODO-LDML: may be best to propagate status up ^^
UASSERT_SUCCESS(status);
}
}

Expand All @@ -484,7 +484,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length());
// TODO-LDML: create a new Matcher every time. These could be cached and reset.
std::unique_ptr<icu::RegexMatcher> matcher(fFromPattern->matcher(matchustr, status));
assert(U_SUCCESS(status));
UASSERT_SUCCESS(status);

if (!matcher->find(status)) { // i.e. matches somewhere, in this case at end of str
return 0; // no match
Expand All @@ -494,7 +494,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
// TODO-LDML: if we had an underlying UText this would be simpler.
int32_t matchStart = matcher->start(status);
int32_t matchEnd = matcher->end(status);
assert(U_SUCCESS(status));
UASSERT_SUCCESS(status);
// extract..
const icu::UnicodeString substr = matchustr.tempSubStringBetween(matchStart, matchEnd);
// preflight to UTF-32 to get length
Expand All @@ -521,15 +521,15 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
// we actually need the group(1) string here.
// this is only the content in parenthesis ()
icu::UnicodeString group1 = matcher->group(1, status);
assert(U_SUCCESS(status)); // TODO-LDML: could be a malformed from pattern
UASSERT_SUCCESS(status); // TODO-LDML: could be a malformed from pattern
// now, how long is group1 in UTF-32, hmm?
UErrorCode preflightStatus = U_ZERO_ERROR; // throwaway status
auto group1Len = group1.toUTF32(nullptr, 0, preflightStatus);
char32_t *s = new char32_t[group1Len + 1];
assert(s != nullptr); // TODO-LDML: OOM
// convert
substr.toUTF32((UChar32 *)s, group1Len + 1, status);
assert(U_SUCCESS(status));
UASSERT_SUCCESS(status);
std::u32string match32(s, group1Len); // taken from just group1
// clean up buffer
delete [] s;
Expand All @@ -552,18 +552,18 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
icu::UnicodeString rustr2;
nfd->normalize(rustr, rustr2, status);
assert(U_SUCCESS(status));
UASSERT_SUCCESS(status);
// here we replace the match output.
icu::UnicodeString entireOutput = matcher->replaceFirst(rustr2, status);
assert(U_SUCCESS(status)); // TODO-LDML: could fail here due to bad input (syntax err)
UASSERT_SUCCESS(status); // TODO-LDML: could fail here due to bad input (syntax err)

// entireOutput includes all of 'input', but modified. Need to substring it.
icu::UnicodeString outu_raw = entireOutput.tempSubString(matchStart);

// normalize the replaced string
icu::UnicodeString outu;
nfd->normalize(outu_raw, outu, status);
assert(U_SUCCESS(status));
UASSERT_SUCCESS(status);

// Special case if there's no output, save some allocs
if (outu.length() == 0) {
Expand All @@ -578,7 +578,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
assert(s != nullptr);
// convert
outu.toUTF32((UChar32 *)s, out32len + 1, status);
assert(U_SUCCESS(status));
UASSERT_SUCCESS(status);
output.assign(s, out32len);
// now, build a u32string
std::u32string out32(s, out32len);
Expand Down Expand Up @@ -844,67 +844,56 @@ transforms::load(
return transforms;
}

// string
// string manipulation

// TODO-LDML: copypasta -> refactor
std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status) {
bool normalize_nfd(std::u32string &str) {
std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str);

normalize_nfd(rstr, status);
if (U_SUCCESS(status)) {
// if failure, leave it alone
if(!normalize_nfd(rstr)) {
return false;
} else {
str = km::kbp::kmx::u16string_to_u32string(rstr);
return true;
}
return str;
}

// TODO-LDML: copypasta -> refactor
std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status) {
bool normalize_nfd(std::u16string &str) {
UErrorCode status = U_ZERO_ERROR;
const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
if (U_FAILURE(status)) {
return str;
}
UASSERT_SUCCESS(status);
if (U_FAILURE(status)) return false; // exit early since normalizer pointer could be nullptr
icu::UnicodeString dest;
icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length());
nfd->normalize(src, dest, status);
if (U_FAILURE(status)) {
return str;
}
UASSERT_SUCCESS(status);
str.assign(dest.getBuffer(), dest.length());
return str;
return U_SUCCESS(status);
}

// TODO-LDML: copypasta -> refactor
std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status) {
bool normalize_nfc(std::u32string &str) {
std::u16string rstr = km::kbp::kmx::u32string_to_u16string(str);

normalize_nfc(rstr, status);
if (U_SUCCESS(status)) {
// if failure, leave it alone
if(!normalize_nfc(rstr)) {
return false;
} else {
str = km::kbp::kmx::u16string_to_u32string(rstr);
return true;
}
return str;
}

// TODO-LDML: copypasta -> refactor
std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status) {
bool normalize_nfc(std::u16string &str) {
UErrorCode status = U_ZERO_ERROR;
const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status);
if (U_FAILURE(status)) {
return str;
}
UASSERT_SUCCESS(status);
if (U_FAILURE(status)) return false; // exit early since normalizer pointer could be nullptr

icu::UnicodeString dest;
icu::UnicodeString src = icu::UnicodeString(str.data(), (int32_t)str.length());
nfc->normalize(src, dest, status);
if (U_FAILURE(status)) {
return str;
}
UASSERT_SUCCESS(status);
str.assign(dest.getBuffer(), dest.length());
return str;
return U_SUCCESS(status);
}




} // namespace ldml
} // namespace kbp
} // namespace km
24 changes: 16 additions & 8 deletions core/src/ldml/ldml_transforms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <string>
#include <unordered_map>
#include <utility>
#include "debuglog.h"

#if !defined(HAVE_ICU4C)
#error icu4c is required for this code
Expand All @@ -31,6 +32,13 @@ namespace km {
namespace kbp {
namespace ldml {

// macro for working with ICU calls
#define UASSERT_SUCCESS(status) \
if (U_FAILURE(status)) { \
DebugLog("U_FAILURE(%s)", u_errorName(status)); \
assert(U_SUCCESS(status)); \
}

using km::kbp::kmx::SimpleUSet;

/**
Expand Down Expand Up @@ -273,14 +281,14 @@ class transforms {

// string routines

/** Normalize a u32string inplace to NFD. Returns a reference to the same string. */
std::u32string &normalize_nfd(std::u32string &str, UErrorCode &status);
/** Normalize a u16string inplace to NFD. Returns a reference to the same string. */
std::u16string &normalize_nfd(std::u16string &str, UErrorCode &status);
/** Normalize a u32string inplace to NFC. Returns a reference to the same string. */
std::u32string &normalize_nfc(std::u32string &str, UErrorCode &status);
/** Normalize a u16string inplace to NFC. Returns a reference to the same string. */
std::u16string &normalize_nfc(std::u16string &str, UErrorCode &status);
/** Normalize a u32string inplace to NFD. @return false on failure */
bool normalize_nfd(std::u32string &str);
/** Normalize a u16string inplace to NFD. @return false on failure */
bool normalize_nfd(std::u16string &str);
/** Normalize a u32string inplace to NFC. @return false on failure */
bool normalize_nfc(std::u32string &str);
/** Normalize a u16string inplace to NFC. @return false on failure */
bool normalize_nfc(std::u16string &str);

} // namespace ldml
} // namespace kbp
Expand Down
10 changes: 3 additions & 7 deletions core/tests/unit/ldml/ldml_test_source.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -468,9 +468,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) {
if (as_check.is_string()) {
fillin.type = LDML_ACTION_CHECK_EXPECTED;
fillin.string = LdmlTestSource::parse_u8_source_string(as_check.get<std::string>());
UErrorCode status = U_ZERO_ERROR;
km::kbp::ldml::normalize_nfc(fillin.string, status);
assert(U_SUCCESS(status));
assert(km::kbp::ldml::normalize_nfc(fillin.string));
return;
}

Expand All @@ -489,8 +487,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) {
if (as_emit.is_string()) {
fillin.type = LDML_ACTION_EMIT_STRING;
fillin.string = LdmlTestSource::parse_u8_source_string(as_emit.get<std::string>());
UErrorCode status = U_ZERO_ERROR;
km::kbp::ldml::normalize_nfc(fillin.string, status);
assert(km::kbp::ldml::normalize_nfc(fillin.string));
return;
}

Expand All @@ -508,8 +505,7 @@ int LdmlJsonTestSource::load(const nlohmann::json &data) {
this->data = data; // TODO-LDML
auto startContext = data["/startContext/to"_json_pointer];
context = LdmlTestSource::parse_u8_source_string(startContext);
UErrorCode status = U_ZERO_ERROR;
km::kbp::ldml::normalize_nfc(context, status);
assert(km::kbp::ldml::normalize_nfc(context));
return 0;
}

Expand Down

0 comments on commit 9dbf70c

Please sign in to comment.