Skip to content

Commit

Permalink
Merge pull request #9728 from keymanapp/feat/core/9468-normalization-…
Browse files Browse the repository at this point in the history
…epic-ldml
  • Loading branch information
srl295 authored Oct 20, 2023
2 parents e96b99f + 4cdc2c3 commit 16d24bc
Show file tree
Hide file tree
Showing 11 changed files with 491 additions and 139 deletions.
4 changes: 4 additions & 0 deletions core/src/debuglog.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
/* Debugging */

#pragma once

#include <keyman/keyman_core_api_bits.h>

namespace km {
Expand All @@ -18,10 +20,12 @@ extern const char *s_key_names[];

#ifdef _MSC_VER
#define DebugLog(msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(__FILE__, __LINE__, __FUNCTION__, (msg),__VA_ARGS__) : 0)
#define DebugLog2(file,line,function,msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(file, line, function, (msg),__VA_ARGS__) : 0)
#define console_error(msg,...) write_console(TRUE, (msg), __VA_ARGS__)
#define console_log(msg,...) write_console(FALSE, (msg), __VA_ARGS__)
#else
#define DebugLog(msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(__FILE__, __LINE__, __FUNCTION__, (msg), ##__VA_ARGS__) : 0)
#define DebugLog2(file,line,function,msg,...) (km::kbp::kmx::ShouldDebug() ? km::kbp::kmx::DebugLog_1(file, line, function, (msg), ##__VA_ARGS__) : 0)
#define console_error(msg,...) write_console(TRUE, (msg), ##__VA_ARGS__)
#define console_log(msg,...) write_console(FALSE, (msg), ##__VA_ARGS__)
#endif
Expand Down
185 changes: 106 additions & 79 deletions core/src/ldml/ldml_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/

#include <fstream>
#include <algorithm>
#include "ldml/ldml_processor.hpp"
#include "state.hpp"
#include "kmx_file.h"
Expand Down Expand Up @@ -236,88 +237,16 @@ ldml_processor::process_event(
// all other VKs
{
// Look up the key
const std::u16string str = keys.lookup(vk, modifier_state);
const std::u16string key_str = keys.lookup(vk, modifier_state);

if (str.empty()) {
if (key_str.empty()) {
// no key was found, so pass the keystroke on to the Engine
state->actions().push_invalidate_context();
state->actions().push_emit_keystroke();
break; // ----- commit and exit
}

// found a string - push it into the context and actions
// we convert it here instead of using the emit_text() overload
// so that we don't have to reconvert it inside the transform code.
const std::u32string str32 = kmx::u16string_to_u32string(str);

if (!transforms) {
// No transforms: just emit the string.
emit_text(state, str32);
} else {
// Process transforms here
/**
* a copy of the current/changed context, for transform use.
*
*/
std::u32string ctxtstr;
(void)context_to_string(state, ctxtstr);
// add the newly added key output to ctxtstr
ctxtstr.append(str32);

/** the output buffer for transforms */
std::u32string outputString;

// apply the transform, get how much matched (at the end)
const size_t matchedContext = transforms->apply(ctxtstr, outputString);

if (matchedContext == 0) {
// No match, just emit the original string
emit_text(state, str32);
} else {
// We have a match.

ctxtstr.resize(ctxtstr.length() - str32.length());
/** how many chars of the context we need to clear */
auto charsToDelete = matchedContext - str32.length(); /* we don't need to clear the output of the current key */

/** how many context items need to be removed */
size_t contextRemoved = 0;
for (auto c = state->context().rbegin(); charsToDelete > 0 && c != state->context().rend(); c++, contextRemoved++) {
/** last char of context */
km_core_usv lastCtx = ctxtstr.back();
uint8_t type = c->type;
assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER);
if (type == KM_CORE_BT_CHAR) {
// single char, drop it
charsToDelete--;
assert(c->character == lastCtx);
ctxtstr.pop_back();
state->actions().push_backspace(KM_CORE_BT_CHAR, lastCtx); // Cause prior char to be removed
} else if (type == KM_CORE_BT_MARKER) {
// it's a marker, 'worth' 3 uchars
assert(charsToDelete >= 3);
assert(lastCtx == c->marker); // end of list
charsToDelete -= 3;
// pop off the three-part sentinel string
ctxtstr.pop_back();
ctxtstr.pop_back();
ctxtstr.pop_back();
// push a special backspace to delete the marker
state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker);
}
}
// now, pop the right number of context items
for (size_t i = 0; i < contextRemoved; i++) {
// we don't pop during the above loop because the iterator gets confused
state->context().pop_back();
}
// Now, add in the updated text. This will convert UC_SENTINEL, etc back to marker actions.
emit_text(state, outputString);
// If we needed it further. we could update ctxtstr here:
// ctxtstr.append(outputString);
// ... but it is no longer needed at this point.
} // end of transform match
} // end of processing transforms
process_key_string(state, key_str);
} // end of processing a 'normal' vk
} // end of switch
// end of normal processing: commit and exit
Expand All @@ -330,6 +259,103 @@ ldml_processor::process_event(
return KM_CORE_STATUS_OK;
}

void
ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const {
// We know that key_str is not empty per the caller.
assert(!key_str.empty());

// we convert the keys str to UTF-32 here instead of using the emit_text() overload
// so that we don't have to reconvert it inside the transform code.
std::u32string key_str32 = kmx::u16string_to_u32string(key_str);
assert(ldml::normalize_nfd(key_str32)); // TODO-LDML: else fail?

// extract context string, in NFC
std::u32string old_ctxtstr_nfc;
(void)context_to_string(state, old_ctxtstr_nfc, false);
assert(ldml::normalize_nfc(old_ctxtstr_nfc)); // TODO-LDML: else fail?

// context string in NFD
std::u32string ctxtstr;
(void)context_to_string(state, ctxtstr, true); // with markers
// add the newly added key output to ctxtstr
ctxtstr.append(key_str32);
assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail?

/** transform output string */
std::u32string outputString;
/** how many chars of the ctxtstr to replace */
size_t matchedContext = 0; // zero if no transforms

// begin modifications to the string

if(transforms) {
matchedContext = transforms->apply(ctxtstr, outputString);
} else {
// no transforms, no output
}

// drop last 'matchedContext':
ctxtstr.resize(ctxtstr.length() - matchedContext);
ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here.
assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail?

// Ok. We've done all the happy manipulations.

/** NFC and no markers */
std::u32string ctxtstr_cleanedup = ctxtstr;
// TODO-LDML: remove markers!
assert(ldml::normalize_nfc(ctxtstr_cleanedup)); // TODO-LDML: else fail?

// find common prefix
auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end());
/** the part of the old str that changed */
std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfc.end());
std::u32string new_ctxtstr_changed(ctxt_prefix.second,ctxtstr_cleanedup.end());

// drop the old suffix. Note: this mutates old_ctxtstr_changed.
remove_text(state, old_ctxtstr_changed, old_ctxtstr_changed.length());
assert(old_ctxtstr_changed.length() == 0);
emit_text(state, new_ctxtstr_changed);
}

void
ldml_processor::remove_text(km_core_state *state, std::u32string &str, size_t length) {
/** how many context items need to be removed */
size_t contextRemoved = 0;
for (auto c = state->context().rbegin(); length > 0 && c != state->context().rend(); c++, contextRemoved++) {
/** last char of context */
km_core_usv lastCtx = str.back();
uint8_t type = c->type;
assert(type == KM_CORE_BT_CHAR || type == KM_CORE_BT_MARKER);
if (type == KM_CORE_BT_CHAR) {
// single char, drop it
length--;
assert(c->character == lastCtx);
str.pop_back();
state->actions().push_backspace(KM_CORE_BT_CHAR, c->character); // Cause prior char to be removed
} else if (type == KM_CORE_BT_MARKER) {
// it's a marker, 'worth' 3 uchars
assert(length >= 3);
assert(lastCtx == c->marker); // end of list
length -= 3;
// pop off the three-part sentinel string (in reverse order of course)
assert(str.back() == c->marker); // marker #
str.pop_back();
assert(str.back() == LDML_MARKER_CODE);
str.pop_back();
assert(str.back() == LDML_UC_SENTINEL);
str.pop_back();
// push a special backspace to delete the marker
state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker);
}
}
// now, pop the right number of context items
for (size_t i = 0; i < contextRemoved; i++) {
// we don't pop during the above loop because the iterator gets confused
state->context().pop_back();
}
}

km_core_attr const & ldml_processor::attributes() const {
return engine_attrs;
}
Expand Down Expand Up @@ -395,25 +421,26 @@ ldml_processor::emit_marker(km_core_state *state, KMX_DWORD marker_no) {
}

size_t
ldml_processor::context_to_string(km_core_state *state, std::u32string &str) {
ldml_processor::context_to_string(km_core_state *state, std::u32string &str, bool include_markers) {
str.clear();
auto &cp = state->context();
size_t ctxlen = 0; // TODO-LDML: is this needed?
size_t ctxlen = 0; // TODO-LDML: not used by callers?
uint8_t last_type = KM_CORE_BT_UNKNOWN;
for (auto c = cp.rbegin(); c != cp.rend(); c++, ctxlen++) {
last_type = c->type;
if (last_type == KM_CORE_BT_CHAR) {
str.insert(0, 1, c->character);
} else if (last_type == KM_CORE_BT_MARKER) {
assert(km::kbp::kmx::is_valid_marker(c->marker));
prepend_marker(str, c->marker);
if (include_markers) {
prepend_marker(str, c->marker);
}
} else {
break;
}
}
return ctxlen; // consumed the entire context buffer.
}


} // namespace kbp
} // namespace km
11 changes: 10 additions & 1 deletion core/src/ldml/ldml_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,23 @@ namespace kbp {
static void emit_text(km_core_state *state, km_core_usv ch);
/** emit a marker */
static void emit_marker(km_core_state *state, KMX_DWORD marker);
/**
* Delete text from the state.
* @param str string with text to remove, from the end
* @param length number of chars from the end of str to drop
*/
static void remove_text(km_core_state *state, std::u32string &str, size_t length);

/** process a typed key */
void process_key_string(km_core_state *state, const std::u16string &key_str) const;

/**
* add the string+marker portion of the context to the beginning of str.
* Stop when a non-string and non-marker is hit.
* Convert markers into the UC_SENTINEL format.
* @return the number of context items consumed
*/
static size_t context_to_string(km_core_state *state, std::u32string &str);
static size_t context_to_string(km_core_state *state, std::u32string &str, bool include_markers = true);

/** prepend the marker string in UC_SENTINEL format to the str */
inline static void prepend_marker(std::u32string &str, KMX_DWORD marker);
Expand Down
Loading

0 comments on commit 16d24bc

Please sign in to comment.