diff --git a/src/somajo/alignment.py b/src/somajo/alignment.py
index 968cfbe..c7a91a0 100644
--- a/src/somajo/alignment.py
+++ b/src/somajo/alignment.py
@@ -7,7 +7,7 @@
def align_nfc(nfc, orig):
"""Character alignment from NFC version to original string."""
- assert len(nfc) <= len(orig)
+ assert len(nfc) <= len(orig), f"{len(nfc)} > {len(orig)}; nfc: '{nfc}', orig: '{orig}'"
alignment = {}
if nfc == "":
assert orig == ""
@@ -23,11 +23,11 @@ def align_nfc(nfc, orig):
orig_j = orig_i + 1
while (orig_j < len(orig)) and (unicodedata.combining(orig[orig_j]) > 0):
orig_j += 1
- assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j])
+ # assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j])
alignment[(nfc_i, nfc_j)] = (orig_i, orig_j)
nfc_i = nfc_j
orig_i = orig_j
- assert orig_j == len(orig)
+ assert orig_j == len(orig), f"{orig_j} != {len(orig)}; nfc: '{nfc}', orig: '{orig}'"
return alignment
@@ -57,31 +57,81 @@ def resolve_entities(xml):
return outstring, alignment
-def pretoken_offsets_xml(tokens, raw):
+def pretoken_offset_xml(token, raw):
# resolve entities
- raw_entityless, align_to_raw = resolve_entities(raw)
- offsets = token_offsets(tokens, raw_entityless, xml_input=True)
- offsets = [(align_to_raw[s][0], align_to_raw[e][1]) for s, e in offsets]
- return offsets
+ raw, align_to_raw = resolve_entities(raw)
+ # print("align_to_raw", align_to_raw)
+ # offsets = token_offsets([token], raw_entityless)
+ ###
+ raw = re.sub(r"\s", " ", raw)
+ text = token.text
+ if token.original_spelling is not None:
+ text = token.original_spelling
+ text = re.sub(r"\s", " ", text)
+ if token.markup:
+ text, align_to_text = resolve_entities(text)
+ text = text.replace("'", '"')
+ pattern = "(" + re.escape(text) + ")"
+ if not text.startswith("</"):
+ pattern = pattern[:-2] + r"/?\s*" + pattern[-2:]
+ local_raw = raw.replace("'", '"')
+ m = re.search(pattern, local_raw)
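+ # an end tag that cannot be found in the raw input (e.g. one inserted
+ # by the tokenizer) is mapped to an empty span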
+ if text.startswith("</") and not m:
+ start, end = 0, 0
+ else:
+ assert m, f"'{text}' not found in '{local_raw}'"
+ start, end = m.span(1)
+ else:
+ pattern = "(" + re.escape(text) + ")"
+ m = re.search(pattern, raw)
+ assert m, f"'{text}' not found in '{raw}'"
+ start, end = m.span(1)
+ if start == end:
+ return (align_to_raw[start][0], align_to_raw[start][0])
+ else:
+ return (align_to_raw[start][0], align_to_raw[end - 1][1])
-def token_offsets(tokens, raw):
+def token_offsets(tokens, raw, position):
"""Determine start and end positions of tokens in the original raw (NFC) input."""
+ skippable_characters = r"[\s\u0000-\u001F\u007F-\u009F\u00AD\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF\uFE0F]*?"
offsets = []
raw_i = 0
+ raw = re.sub(r"\s", " ", raw)
for token in tokens:
text = token.text
- local_raw = raw
if token.original_spelling is not None:
text = token.original_spelling
+ text = re.sub(r"\s", " ", text)
if token.markup:
- text, align_to_text = resolve_entities(text)
- text = text.replace("'", '"')
- local_raw = raw.replace("'", '"')
- pattern = ".*?(" + ".*?".join([re.escape(c) for c in text]) + ")"
- m = re.search(pattern, local_raw, pos=raw_i)
- assert m
- start, end = m.span(1)
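+ # markup tokens already carry character offsets (assigned in
+ # _xml_chunk_generator and re-mapped in SoMaJo._tokenize); convert
+ # them from absolute positions to chunk-relative ones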
+ start, end = token.character_offset
+ start -= position
+ end -= position
+ else:
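+ # allow skippable characters before the token and between its characters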
+ pattern = skippable_characters + "(" + skippable_characters.join([re.escape(c) for c in text]) + ")"
+ m = re.search(pattern, raw, pos=raw_i)
+ assert m, f"'{text}' not found in '{raw[raw_i:]}'\n{[ord(c) for c in text]} not found in {[ord(c) for c in raw[raw_i:]]}"
+ start, end = m.span(1)
offsets.append((start, end))
raw_i = end
return offsets
diff --git a/src/somajo/somajo.py b/src/somajo/somajo.py
index 0265696..5db26be 100644
--- a/src/somajo/somajo.py
+++ b/src/somajo/somajo.py
@@ -44,7 +44,7 @@ class SoMaJo:
paragraph_separators = {"empty_lines", "single_newlines"}
_default_parsep = "empty_lines"
- def __init__(self, language, *, split_camel_case=False, split_sentences=True, xml_sentences=None, character_offsets=False):
+ def __init__(self, language, *, split_camel_case=False, split_sentences=True, xml_sentences=None, character_offsets=True):
assert language in self.supported_languages
self.language = language
self.split_camel_case = split_camel_case
@@ -66,18 +66,59 @@ def _tokenize(self, token_info, xml_input):
token_list, raw, position = token_info
token_dll = doubly_linked_list.DLL(token_list)
tokens = self._tokenizer._tokenize(token_dll)
if self.character_offsets:
if xml_input:
raw, align_to_entities = alignment.resolve_entities(raw)
- raw_nfc = unicodedata.normalize("NFC", raw)
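+ # align_from_entities: position in the original input (with character
+ # references) -> position in the entity-resolved string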
+ align_from_entities = {i: v for v, k in enumerate(align_to_entities) for i in range(k[0], k[1])}
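+ # normalize to NFC chunk by chunk, splitting at the end offsets of the
+ # markup tokens, so that normalization cannot merge characters across a
+ # markup boundary and those offsets remain valid in raw_nfc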
+ last = 0
+ chunks = []
+ for end in [t.character_offset[1] for t in tokens if t.markup]:
+ end -= position
+ end = align_from_entities[end - 1] + 1
+ chunks.append(raw[last:end])
+ last = end
+ if last != len(raw):
+ chunks.append(raw[last:len(raw)])
+ chunks = [unicodedata.normalize("NFC", c) for c in chunks]
+ raw_nfc = "".join(chunks)
+ else:
+ raw_nfc = unicodedata.normalize("NFC", raw)
align_to_raw = alignment.align_nfc(raw_nfc, raw)
- align_starts = {k[0]: v[0] for k, v in align_to_raw.items()}
- align_ends = {k[1]: v[1] for k, v in align_to_raw.items()}
- offsets = alignment.token_offsets(tokens, raw_nfc)
- offsets = [(align_to_raw[align_starts[s]][0], align_to_raw[align_ends[e] - 1][1]) for s, e in offsets]
- if xml_input:
- offsets = [(align_to_entities[s][0], align_to_entities[e][1]) for s, e in offsets]
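+ # align_from_raw: original position -> covering (nfc_start, nfc_end) span;
+ # align_to_starts / align_to_ends: NFC position -> start/end of the
+ # corresponding span in the original string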
+ align_from_raw = {i: k for k, v in align_to_raw.items() for i in range(v[0], v[1])}
+ align_to_starts = {i: v[0] for k, v in align_to_raw.items() for i in range(k[0], k[1])}
+ align_to_ends = {i: v[1] for k, v in align_to_raw.items() for i in range(k[0], k[1])}
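+ # translate the markup tokens' offsets (relative to the original,
+ # entity-containing input) into NFC coordinates so that token_offsets can
+ # use them; they are mapped back together with all other offsets below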
+ for i in range(len(tokens)):
+ if tokens[i].markup:
+ s, e = tokens[i].character_offset
+ s -= position
+ e -= position
+ tokens[i].character_offset = (align_from_raw[align_from_entities[s]][0] + position, align_from_raw[align_from_entities[e - 1]][1] + position)
+ offsets = alignment.token_offsets(tokens, raw_nfc, position)
assert len(tokens) == len(offsets)
+ offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in offsets]
+ if xml_input:
+ offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in offsets]
for i in range(len(tokens)):
tokens[i].character_offset = offsets[i]
if self.split_sentences:
diff --git a/src/somajo/utils.py b/src/somajo/utils.py
index d07df44..2d94651 100644
--- a/src/somajo/utils.py
+++ b/src/somajo/utils.py
@@ -260,12 +260,6 @@ def _xml_chunk_generator(f, eos_tags=None, prune_tags=None, character_offsets=Fa
if character_offsets:
input_buffer += "".join(line_list)
for token in token_list:
- if character_offsets:
- token_end = alignment.pretoken_offsets_xml([token], input_buffer)[0][1]
- else:
- token_end = 0
- output_buffer.append(input_buffer[:token_end])
- input_buffer = input_buffer[token_end:]
if token.markup:
# markup
if token.markup_eos:
@@ -319,6 +313,15 @@ def _xml_chunk_generator(f, eos_tags=None, prune_tags=None, character_offsets=Fa
bos = False
token.first_in_sentence = True
lexical_tokens += 1
+ if character_offsets:
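+ # pretoken_offset_xml returns offsets relative to input_buffer; for markup
+ # tokens, store them as absolute offsets into the overall input (chunk
+ # position plus the text already moved to output_buffer)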
+ token_start, token_end = alignment.pretoken_offset_xml(token, input_buffer)
+ if token.markup:
+ len_output_buffer = sum(len(o) for o in output_buffer)
+ token.character_offset = (token_start + position + len_output_buffer, token_end + position + len_output_buffer)
+ else:
+ token_end = 0
+ output_buffer.append(input_buffer[:token_end])
+ input_buffer = input_buffer[token_end:]
current.append(token)
if len(current) > 0:
raw_xml = "".join(output_buffer)
diff --git a/tests/test_alignment.py b/tests/test_alignment.py
index 28019cd..14844f3 100644
--- a/tests/test_alignment.py
+++ b/tests/test_alignment.py
@@ -82,7 +82,9 @@ def test_entitites_01(self):
def test_entities_02(self):
xml = "Test"
resolved = "Test"
- alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22)]
+ alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6),
+ (6, 14), (14, 15), (15, 16), (16, 17), (17, 18),
+ (18, 19), (19, 20), (20, 21), (21, 22)]
res, al = somajo.alignment.resolve_entities(xml)
self.assertEqual(res, resolved)
self.assertEqual(al, alignment)
@@ -180,3 +182,8 @@ def test_token_alignment_19(self):
def test_token_alignment_20(self):
self._equal_xml("Foo \"Bar\" 'Baz'", ["", "Foo", '"', "Bar", '"', "'", "Baz", "'", ""])