
Commit

Start debugging…
tsproisl committed Oct 24, 2023
1 parent 07288ce commit cb52239
Showing 4 changed files with 133 additions and 32 deletions.
84 changes: 67 additions & 17 deletions src/somajo/alignment.py
@@ -7,7 +7,7 @@

def align_nfc(nfc, orig):
"""Character alignment from NFC version to original string."""
assert len(nfc) <= len(orig)
assert len(nfc) <= len(orig), f"len({nfc}) > len({orig})"
alignment = {}
if nfc == "":
assert orig == ""
@@ -23,11 +23,11 @@ def align_nfc(nfc, orig):
orig_j = orig_i + 1
while (orig_j < len(orig)) and (unicodedata.combining(orig[orig_j]) > 0):
orig_j += 1
assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j])
# assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j])
alignment[(nfc_i, nfc_j)] = (orig_i, orig_j)
nfc_i = nfc_j
orig_i = orig_j
assert orig_j == len(orig)
assert orig_j == len(orig), f"{orig_j} != {len(orig)}; nfc: '{nfc}', orig: '{orig}'"
return alignment
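
For context, a minimal sketch of the mapping align_nfc is expected to produce for a string with combining characters (the example and the expected dictionary are illustrative assumptions, not taken from the test suite):

import unicodedata
from somajo import alignment

orig = "Cafe\u0301"                        # "e" followed by a combining acute accent, 5 code points
nfc = unicodedata.normalize("NFC", orig)   # "Café" with a precomposed "é", 4 code points
print(alignment.align_nfc(nfc, orig))
# expected (illustrative): each NFC character maps to its span in the original string,
# the precomposed "é" covering "e" + U+0301:
# {(0, 1): (0, 1), (1, 2): (1, 2), (2, 3): (2, 3), (3, 4): (3, 5)}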


@@ -57,31 +57,81 @@ def resolve_entities(xml):
return outstring, alignment


def pretoken_offsets_xml(tokens, raw):
def pretoken_offset_xml(token, raw):
# resolve entities
raw_entityless, align_to_raw = resolve_entities(raw)
offsets = token_offsets(tokens, raw_entityless, xml_input=True)
offsets = [(align_to_raw[s][0], align_to_raw[e][1]) for s, e in offsets]
return offsets
raw, align_to_raw = resolve_entities(raw)
# print("align_to_raw", align_to_raw)
# offsets = token_offsets([token], raw_entityless)
###
raw = re.sub(r"\s", " ", raw)
text = token.text
if token.original_spelling is not None:
text = token.original_spelling
text = re.sub(r"\s", " ", text)
if token.markup:
text, align_to_text = resolve_entities(text)
text = text.replace("'", '"')
pattern = "(" + re.escape(text) + ")"
if not text.startswith("</"):
pattern = pattern[:-2] + r"/?\s*" + pattern[-2:]
local_raw = raw.replace("'", '"')
m = re.search(pattern, local_raw)
if text.startswith("</") and not m:
start, end = 0, 0
else:
assert m, f"'{text}' not found in '{local_raw}'"
start, end = m.span(1)
else:
pattern = "(" + re.escape(text) + ")"
m = re.search(pattern, raw)
assert m, f"'{text}' not found in '{raw}'"
start, end = m.span(1)
if start == end:
return (align_to_raw[start][0], align_to_raw[start][0])
else:
return (align_to_raw[start][0], align_to_raw[end - 1][1])
###
# print("tokens", [t.text for t in tokens])
# print("raw", f"'{raw}'")
# print("offsets", offsets)
# offsets = [(align_to_raw[s][0], align_to_raw[s][0]) if s == e else (align_to_raw[s][0], align_to_raw[e - 1][1]) for s, e in offsets]
# return offsets


def token_offsets(tokens, raw):
def token_offsets(tokens, raw, position):
"""Determine start and end positions of tokens in the original raw (NFC) input."""
skipable_characters = r"[\s\u0000-\u001F\u007F-\u009F\u00AD\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF\uFE0F]*?"
# skipable_characters = r".*?"
offsets = []
raw_i = 0
raw = re.sub(r"\s", " ", raw)
for token in tokens:
text = token.text
local_raw = raw
if token.original_spelling is not None:
text = token.original_spelling
text = re.sub(r"\s", " ", text)
if token.markup:
text, align_to_text = resolve_entities(text)
text = text.replace("'", '"')
local_raw = raw.replace("'", '"')
pattern = ".*?(" + ".*?".join([re.escape(c) for c in text]) + ")"
m = re.search(pattern, local_raw, pos=raw_i)
assert m
start, end = m.span(1)
start, end = token.character_offset
start -= position
end -= position
# text, align_to_text = resolve_entities(text)
# text = text.replace("'", '"')
# pattern = skipable_characters + "(" + skipable_characters.join([re.escape(c) for c in text])
# if not text.startswith("</"):
# pattern = pattern[:-1] + "/??" + skipable_characters + pattern[-1]
# pattern += ")"
# local_raw = raw.replace("'", '"')
# m = re.search(pattern, local_raw, pos=raw_i)
# if text.startswith("</") and not m:
# start, end = raw_i, raw_i
# else:
# assert m, f"'{text}' not found in '{local_raw[raw_i:]}'"
# start, end = m.span(1)
else:
pattern = skipable_characters + "(" + skipable_characters.join([re.escape(c) for c in text]) + ")"
m = re.search(pattern, raw, pos=raw_i)
assert m, f"'{text}' not found in '{raw[raw_i:]}'\n{[ord(c) for c in text]} not found in {[ord(c) for c in raw[raw_i:]]}"
start, end = m.span(1)
offsets.append((start, end))
raw_i = end
return offsets
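
The matching idea in token_offsets is that consecutive characters of a token may be separated in the raw input by whitespace or invisible control/format characters, which the skipable_characters class absorbs. A standalone sketch of that technique under simplified assumptions (the reduced character class and the example string are not from the module; a compiled pattern is used so the search can start at a given position):

import re

skipable = r"[\s\u00AD\u200B-\u200F\uFEFF]*?"   # reduced version of the character class above
text = "Test"
raw = "xx Te\u00ADst"                           # token preceded by other text, soft hyphen inside
pattern = re.compile(skipable + "(" + skipable.join(re.escape(c) for c in text) + ")")
m = pattern.search(raw, 3)                      # start matching at position 3
start, end = m.span(1)                          # (3, 8): span of "Te\u00ADst" in raw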
57 changes: 49 additions & 8 deletions src/somajo/somajo.py
@@ -44,7 +44,7 @@ class SoMaJo:
paragraph_separators = {"empty_lines", "single_newlines"}
_default_parsep = "empty_lines"

def __init__(self, language, *, split_camel_case=False, split_sentences=True, xml_sentences=None, character_offsets=False):
def __init__(self, language, *, split_camel_case=False, split_sentences=True, xml_sentences=None, character_offsets=True):
assert language in self.supported_languages
self.language = language
self.split_camel_case = split_camel_case
@@ -66,18 +66,59 @@ def _tokenize(self, token_info, xml_input):
token_list, raw, position = token_info
token_dll = doubly_linked_list.DLL(token_list)
tokens = self._tokenizer._tokenize(token_dll)
print([t.text for t in tokens])
print(raw)
if self.character_offsets:
if xml_input:
# print(len(raw), raw)
raw, align_to_entities = alignment.resolve_entities(raw)
raw_nfc = unicodedata.normalize("NFC", raw)
align_from_entities = {i: v for v, k in enumerate(align_to_entities) for i in range(k[0], k[1])}
# print(raw)
# print(align_to_entities)
last = 0
chunks = []
for end in [t.character_offset[1] for t in tokens if t.markup]:
end -= position
# print(end, len(align_to_entities))
end = align_from_entities[end - 1] + 1
# print(end)
chunks.append(raw[last:end])
# print((last, end))
last = end
if last != len(raw):
chunks.append(raw[last:len(raw)])
chunks = [unicodedata.normalize("NFC", c) for c in chunks]
raw_nfc = "".join(chunks)
else:
raw_nfc = unicodedata.normalize("NFC", raw)
align_to_raw = alignment.align_nfc(raw_nfc, raw)
align_starts = {k[0]: v[0] for k, v in align_to_raw.items()}
align_ends = {k[1]: v[1] for k, v in align_to_raw.items()}
offsets = alignment.token_offsets(tokens, raw_nfc)
offsets = [(align_to_raw[align_starts[s]][0], align_to_raw[align_ends[e] - 1][1]) for s, e in offsets]
if xml_input:
offsets = [(align_to_entities[s][0], align_to_entities[e][1]) for s, e in offsets]
align_from_raw = {i: k for k, v in align_to_raw.items() for i in range(v[0], v[1])}
# align_starts = {k[0]: v[0] for k, v in align_to_raw.items()}
# align_ends = {k[1]: v[1] for k, v in align_to_raw.items()}
align_to_starts = {i: v[0] for k, v in align_to_raw.items() for i in range(k[0], k[1])}
align_to_ends = {i: v[1] for k, v in align_to_raw.items() for i in range(k[0], k[1])}
# print(align_to_starts)
# print(align_to_ends)
print(align_from_entities)
print(align_from_raw)
for i in range(len(tokens)):
if tokens[i].markup:
s, e = tokens[i].character_offset
print(s, e)
s -= position
e -= position
print(s, e)
tokens[i].character_offset = (align_from_raw[align_from_entities[s]][0] + position, align_from_raw[align_from_entities[e - 1]][1] + position)
offsets = alignment.token_offsets(tokens, raw_nfc, position)
assert len(tokens) == len(offsets)
# print(offsets)
# print(align_to_raw)
# print(align_to_starts)
# print(align_to_ends)
offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in offsets]
# print(offsets)
if xml_input:
offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in offsets]
for i in range(len(tokens)):
tokens[i].character_offset = offsets[i]
if self.split_sentences:
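
The offset bookkeeping in _tokenize chains two mappings: token offsets computed on the NFC string are first projected onto the pre-NFC raw string via align_to_starts/align_to_ends and then, for XML input, onto the original input via align_to_entities. A reduced sketch of that composition with made-up dictionary contents:

# NFC position -> start / end of the covering span in the pre-NFC raw string
# (illustrative values; the last NFC character is precomposed and covers two raw characters)
align_to_starts = {0: 0, 1: 1, 2: 2, 3: 3}
align_to_ends = {0: 1, 1: 2, 2: 3, 3: 5}
# raw position -> span in the original XML (here the last raw character stems from an 8-character entity)
align_to_entities = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 12)]

nfc_offsets = [(0, 2), (2, 4)]
raw_offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in nfc_offsets]              # [(0, 2), (2, 5)]
xml_offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in raw_offsets]  # [(0, 2), (2, 12)]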
15 changes: 9 additions & 6 deletions src/somajo/utils.py
@@ -260,12 +260,6 @@ def _xml_chunk_generator(f, eos_tags=None, prune_tags=None, character_offsets=Fa
if character_offsets:
input_buffer += "".join(line_list)
for token in token_list:
if character_offsets:
token_end = alignment.pretoken_offsets_xml([token], input_buffer)[0][1]
else:
token_end = 0
output_buffer.append(input_buffer[:token_end])
input_buffer = input_buffer[token_end:]
if token.markup:
# markup
if token.markup_eos:
@@ -319,6 +313,15 @@ def _xml_chunk_generator(f, eos_tags=None, prune_tags=None, character_offsets=Fa
bos = False
token.first_in_sentence = True
lexical_tokens += 1
if character_offsets:
token_start, token_end = alignment.pretoken_offset_xml(token, input_buffer)
if token.markup:
len_output_buffer = sum(len(o) for o in output_buffer)
token.character_offset = (token_start + position + len_output_buffer, token_end + position + len_output_buffer)
else:
token_end = 0
output_buffer.append(input_buffer[:token_end])
input_buffer = input_buffer[token_end:]
current.append(token)
if len(current) > 0:
raw_xml = "".join(output_buffer)
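
The absolute character offset of a markup token above is assembled from three parts: the offset inside the remaining input_buffer, the position value passed along with the chunk, and the length of everything already moved to output_buffer. A small worked example with made-up numbers:

position = 100                      # position value for the current chunk
output_buffer = ["<doc>", "Foo "]   # already consumed, 9 characters in total
token_start, token_end = 0, 5       # offsets of the pretoken inside the remaining input_buffer
len_output_buffer = sum(len(o) for o in output_buffer)
character_offset = (token_start + position + len_output_buffer,
                    token_end + position + len_output_buffer)   # (109, 114)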
9 changes: 8 additions & 1 deletion tests/test_alignment.py
@@ -82,7 +82,9 @@ def test_entitites_01(self):
def test_entities_02(self):
xml = "<foo>T&#x0065;st</foo>"
resolved = "<foo>Test</foo>"
alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22)]
alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6),
(6, 14), (14, 15), (15, 16), (16, 17), (17, 18),
(18, 19), (19, 20), (20, 21), (21, 22)]
res, al = somajo.alignment.resolve_entities(xml)
self.assertEqual(res, resolved)
self.assertEqual(al, alignment)
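
Each entry of the expected alignment maps one character of the resolved string to its span in the original XML: the first six characters ("<foo>T") map one-to-one, the entity &#x0065; occupies positions 6–13 and resolves to the single character "e" (hence the entry (6, 14)), and the remaining characters again map one-to-one.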
@@ -180,3 +182,8 @@ def test_token_alignment_19(self):

def test_token_alignment_20(self):
self._equal_xml("<foo bar='ba\"z'>Foo \"Bar\" 'Baz'</foo>", ["<foo bar='ba\"z'>", "Foo", '"', "Bar", '"', "'", "Baz", "'", "</foo>"])


# :\n)
# <foo bar="baz"\nspam="eggs">
# <br/>
