Skip to content

Commit

Permalink
Towards token offsets in XML input
Browse files Browse the repository at this point in the history
  • Loading branch information
tsproisl committed Oct 14, 2023
1 parent fb2b1fc commit 22d5d90
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 3 deletions.
49 changes: 49 additions & 0 deletions src/somajo/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,52 @@ def token_offsets(tokens, raw):
offsets.append((start, end))
raw_i = end
return offsets


def token_offsets_xml(tokens, raw, tokenizer):
    """Determine start and end positions of tokens in the original raw
    (NFC) input, accounting for XML character/entity references.

    Arguments:
        tokens: iterable of token objects with ``text`` and
            ``original_spelling`` attributes.
        raw: the original input string the tokens were produced from.
        tokenizer: tokenizer whose ``controls`` and ``other_nasties``
            compiled patterns describe characters that may be skipped.

    Returns:
        List of ``(start, end)`` pairs such that ``raw[start:end]``
        covers the corresponding token.

    Raises:
        ValueError: if a token character cannot be located in ``raw``.
    """
    offsets = []
    raw_i = 0
    # Characters that may occur in the raw input without being part of
    # any token: whitespace, variation selector-16, control characters
    # and other junk the tokenizer strips.
    skip_pattern = "|".join([r"\s", "\uFE0F", tokenizer.controls.pattern, tokenizer.other_nasties.pattern])
    skip = re.compile(skip_pattern)
    for token in tokens:
        text = token.text
        if token.original_spelling is not None:
            text = token.original_spelling
        start, end = None, None
        for i, char in enumerate(text):
            while True:
                if char == raw[raw_i]:
                    s = raw_i
                    raw_i += 1
                    e = raw_i
                    break
                elif (char in "'\"") and (raw[raw_i] in "'\""):
                    # Tokenization may have normalized quote characters;
                    # accept any single/double quote in the raw input.
                    s = raw_i
                    raw_i += 1
                    e = raw_i
                    break
                elif raw[raw_i] == "&":
                    # An XML character/entity reference in the raw input;
                    # consume it as a whole (up to and including ";").
                    # TODO: process_entities(text, i, raw, raw_i)
                    s = raw_i
                    while raw[raw_i] != ";":
                        raw_i += 1
                    raw_i += 1
                    e = raw_i
                    break
                elif skip.match(raw[raw_i]):
                    raw_i += 1
                    continue
                else:
                    raise ValueError(f"Cannot find char {char} from {text} in {raw[raw_i:raw_i + 20]}...")
            if i == 0:
                start = s
            # Not elif: for single-character tokens i == 0 is also the
            # last index, and end must still be set.
            if i == len(text) - 1:
                end = e
        offsets.append((start, end))
    return offsets
44 changes: 41 additions & 3 deletions tests/test_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _equal_xml(self, raw, tokenized):
chunks = map(self.tokenizer._tokenize, token_dlls)
complete = list(itertools.chain.from_iterable(chunks))
complete = utils.escape_xml_tokens(complete)
offsets = somajo.alignment.token_offsets(complete, raw)
offsets = somajo.alignment.token_offsets_xml(complete, raw, self.tokenizer)
self.assertEqual([raw[s:e] for s, e in offsets], tokenized)

def test_token_alignment_01(self):
Expand All @@ -102,27 +102,64 @@ def test_token_alignment_03(self):
def test_token_alignment_04(self):
    """Alignment of tokens containing invisible characters (zero-width
    spaces/joiners, soft hyphens, BOM/ZWNBSP, directional marks and
    directional embedding/override controls)."""
    self._equal("foo​bar foo­bar foo\ufeffbar foobarbazquxalphabetagamma foo‌bar‍baz foo‏bar‎baz foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta", ["foo​bar", "foo­bar", "foo\ufeffbar", "foobarbazquxalphabetagamma", "foo‌bar‍baz", "foo‏bar‎baz", "foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta"])

@unittest.expectedFailure
def test_token_alignment_05(self):
    """Offsets when tokens span the &gt; character reference (known failure)."""
    raw = "<foo>der beste Betreuer? - &gt;ProfSmith! : )</foo>"
    expected = ["<foo>", "der", "beste", "Betreuer", "?", "- &gt;", "Prof", "Smith", "!", ": )", "</foo>"]
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_06(self):
    """Soft hyphen given as a numeric character reference (known failure)."""
    raw = "<foo>das steht auf S.&#x00ad;5</foo>"
    expected = "<foo> das steht auf S. 5 </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_07(self):
    """Zero-width space as a character reference across elements (known failure)."""
    raw = "<foo><bar>na so was -&#x200B;</bar><bar>&gt; bla</bar></foo>"
    expected = "<foo> <bar> na so was - </bar> <bar> &gt; bla </bar> </foo>"
    self._equal_xml(raw, expected)

def test_token_alignment_08(self):
    """Numeric character reference that expands to a plain letter."""
    raw = "<foo>T&#x0065;st</foo>"
    expected = "<foo> T&#x0065;st </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_09(self):
    """Character reference for '<' as a standalone token (known failure)."""
    raw = "<foo>3 &#x003c; 5</foo>"
    expected = "<foo> 3 &#x003c; 5 </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_10(self):
    """Soft-hyphen character reference inside a token (known failure)."""
    raw = "<foo>Test&#x00ad;fall</foo>"
    expected = "<foo> Test&#x00ad;fall </foo>"
    self._equal_xml(raw, expected)

def test_token_alignment_11(self):
    """Literal (unescaped) soft hyphen inside a token."""
    raw = "<foo>Test­fall</foo>"
    expected = "<foo> Test­fall </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_12(self):
    """Single combining mark as a character reference (known failure)."""
    raw = "<foo>foo xA&#x0308;x foo</foo>"
    expected = "<foo> foo xA&#x0308;x foo </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_13(self):
    """Multiple combining marks, dot-below before dot-above (known failure)."""
    raw = "<foo>foo xs&#x0323;&#x0307;x foo</foo>"
    expected = "<foo> foo xs&#x0323;&#x0307;x foo </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_14(self):
    """Multiple combining marks, dot-above before dot-below (known failure)."""
    raw = "<foo>foo xs&#x0307;&#x0323;x foo</foo>"
    expected = "<foo> foo xs&#x0307;&#x0323;x foo </foo>"
    self._equal_xml(raw, expected)

def test_token_alignment_15(self):
    """Multiple combining marks on a precomposed base character."""
    raw = "<foo>foo xs&#x1e0b;&#x0323;x foo</foo>"
    expected = "<foo> foo xs&#x1e0b;&#x0323;x foo </foo>"
    self._equal_xml(raw, expected)

def test_token_alignment_16(self):
    """Multiple combining marks on a base with no precomposed form."""
    raw = "<foo>foo xq&#x0307;&#x0323;x foo</foo>"
    expected = "<foo> foo xq&#x0307;&#x0323;x foo </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_xml_07(self):
    """Token ending in an emoji with a trailing joiner, across sibling elements (known failure)."""
    raw = "<foo><text><p>blendend. 👱‍</p></text><text ><blockquote><p>Foo bar baz</p></blockquote></text></foo>"
    expected = ["<foo>", "<text>", "<p>", "blendend", ".", "👱‍", "</p>", "</text>", "<text >", "<blockquote>", "<p>", "Foo", "bar", "baz", "</p>", "</blockquote>", "</text>", "</foo>"]
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_xml_08(self):
    """Emoji and emoji-modifier sequences inside paragraph text (known failure)."""
    raw = "<text><p>Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt.</p><p>So 🙇🙇 manchen Unionspolitikern gestehe ich schon …</p></text>"
    expected = "<text> <p> Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt . </p> <p> So 🙇 🙇 manchen Unionspolitikern gestehe ich schon … </p> </text>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_xml_09(self):
self._equal_xml("""<text>
<p>Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt.</p>
Expand All @@ -136,7 +173,8 @@ def test_xml_10(self):
self._equal_xml("<foo><p>foo bar</p>\n\n<p>foo bar</p></foo>", "<foo> <p> foo bar </p> <p> foo bar </p> </foo>")

def test_xml_11(self):
self._equal_xml("<foo bar='baz'>Foo</foo>", ['<foo bar="baz">', 'Foo', '</foo>'])
self._equal_xml("<foo bar='baz'>Foo</foo>", ["<foo bar='baz'>", 'Foo', '</foo>'])

@unittest.expectedFailure
def test_xml_12(self):
self._equal_xml("<foo bar='ba\"z'>Foo</foo>", ['<foo bar="ba&quot;z">', 'Foo', '</foo>'])
self._equal_xml("<foo bar='ba\"z'>Foo</foo>", ["<foo bar='ba\"z'>", 'Foo', '</foo>'])

0 comments on commit 22d5d90

Please sign in to comment.