Towards token offsets in XML input (part 2)
Thomas Proisl committed Oct 16, 2023
1 parent 22d5d90 commit 2e020d3
Showing 2 changed files with 63 additions and 50 deletions.
78 changes: 39 additions & 39 deletions src/somajo/alignment.py
@@ -31,14 +31,16 @@ def align_nfc(nfc, orig):
return alignment


def token_offsets(tokens, raw):
def token_offsets(tokens, raw, resolve_xml_entities=False):
"""Determine start and end positions of tokens in the original raw (NFC) input."""
offsets = []
raw_i = 0
for token in tokens:
text = token.text
if token.original_spelling is not None:
text = token.original_spelling
if resolve_xml_entities:
text, align_to_text = resolve_entities(text)
pattern = re.compile(".*?(" + ".*?".join([re.escape(c) for c in text]) + ")")
m = pattern.search(raw, pos=raw_i)
assert m
@@ -48,6 +50,32 @@ def token_offsets(tokens, raw):
return offsets


def resolve_entities(xml):
entity = re.compile(r"&(?:#\d+|#x[0-9a-f]+|amp|apos|gt|lt|quot);", re.I)
named = {"&amp;": "&", "&apos;": "'", "&gt;": ">", "&lt;": "<", "&quot;": '"'}
outstring = ""
alignment = []
xml_lower = xml.lower()  # lowercased copy only for matching; keep the original for the output
i = 0
for m in entity.finditer(xml):
start, end = m.span()
if xml_lower[start + 2] == "x":
char = chr(int(xml[start + 3:end - 1], base=16))
elif xml[start + 1] == "#":
char = chr(int(xml[start + 2:end - 1]))
else:
char = named[xml_lower[start:end]]
outstring += xml[i:start] + char
for j in range(i, start):
alignment.append((j, j + 1))
alignment.append((start, end))
i = end
outstring += xml[i:len(xml)]
for j in range(i, len(xml)):
alignment.append((j, j + 1))
return outstring, alignment
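
A minimal usage sketch (not part of the commit) of the values returned by resolve_entities: alignment[k] is the span in the original string that produced character k of the entity-resolved output, so characters that come from an entity reference map back to the full &…; span.

    from somajo.alignment import resolve_entities

    resolved, alignment = resolve_entities("<foo>T&#x0065;st</foo>")
    assert resolved == "<foo>Test</foo>"
    assert alignment[5] == (5, 6)    # ordinary characters map one-to-one
    assert alignment[6] == (6, 14)   # the "e" comes from the 8-character entity "&#x0065;"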


def token_offsets_xml(tokens, raw, tokenizer):
"""Determine start and end positions of tokens in the original raw
(NFC) input. Account for XML entities.
@@ -56,42 +84,14 @@ def token_offsets_xml(tokens, raw, tokenizer):
raw_i = 0
skip_pattern = "|".join([r"\s", "\uFE0F", tokenizer.controls.pattern, tokenizer.other_nasties.pattern])
skip = re.compile(skip_pattern)
for token in tokens:
text = token.text
if token.original_spelling is not None:
text = token.original_spelling
# print(text)
start, end = None, None
for i, char in enumerate(text):
while True:
# print(char, raw_i, raw[raw_i])
if char == raw[raw_i]:
s = raw_i
raw_i += 1
e = raw_i
break
elif ((char == "'") or (char == '"')) and ((raw[raw_i] == "'") or (raw[raw_i] == '"')):
s = raw_i
raw_i += 1
e = raw_i
break
elif raw[raw_i] == "&":
# TODO: process_entities(text, i, raw, raw_i)
s = raw_i
while raw[raw_i] != ";":
raw_i += 1
raw_i += 1
e = raw_i
entity = raw[s:e]
break
elif skip.match(raw[raw_i]):
raw_i += 1
continue
else:
raise ValueError(f"Cannot find char {char} from {text} in {raw[raw_i:raw_i + 20]}...")
if i == 0:
start = s
elif i == len(text) - 1:
end = e
offsets.append((start, end))
# resolve entities
raw_entityless, align_to_raw = resolve_entities(raw)
# convert to NFC
raw_nfc = unicodedata.normalize("NFC", raw_entityless)
# align NFC
align_to_entityless = align_nfc(raw_nfc, raw_entityless)
align_starts = {k[0]: v[0] for k, v in align_to_entityless.items()}
align_ends = {k[1]: v[1] for k, v in align_to_entityless.items()}
offsets = token_offsets(tokens, raw_nfc, resolve_xml_entities=True)
offsets = [(align_to_raw[align_starts[s]][0], align_to_raw[align_ends[e] - 1][1]) for s, e in offsets]
return offsets
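
A hedged sketch (also not part of the commit) of the back-mapping idea in the last two lines of token_offsets_xml: offsets computed on the entity-resolved string are translated back to spans in the raw XML via the alignment from resolve_entities. The NFC alignment step is skipped here because the example contains nothing to normalize, and the token span (5, 9) is an assumption standing in for what token_offsets would report for "Test".

    from somajo.alignment import resolve_entities

    raw = "<foo>T&#x0065;st</foo>"
    resolved, align_to_raw = resolve_entities(raw)   # resolved == "<foo>Test</foo>"
    s, e = 5, 9                                      # assumed span of the token "Test" in resolved
    raw_start = align_to_raw[s][0]                   # 5
    raw_end = align_to_raw[e - 1][1]                 # 16
    assert raw[raw_start:raw_end] == "T&#x0065;st"   # the raw span still covers the literal entity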
35 changes: 24 additions & 11 deletions tests/test_alignment.py
@@ -62,6 +62,29 @@ def test_nfc_07(self):
self.assertEqual(somajo.alignment.align_nfc(nfc, orig), alignment)


class TestResolveEntities(unittest.TestCase):
def test_entities_01(self):
xml = '<foo attr="bar &quot;baz&quot; qux">foo &lt;bar&gt; baz</foo>'
resolved = '<foo attr="bar "baz" qux">foo <bar> baz</foo>'
alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6),
(6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12),
(12, 13), (13, 14), (14, 15), (15, 21), (21, 22),
(22, 23), (23, 24), (24, 30), (30, 31), (31, 32),
(32, 33), (33, 34), (34, 35), (35, 36), (36, 37),
(37, 38), (38, 39), (39, 40), (40, 44), (44, 45),
(45, 46), (46, 47), (47, 51), (51, 52), (52, 53),
(53, 54), (54, 55), (55, 56), (56, 57), (57, 58),
(58, 59), (59, 60), (60, 61)]
res, al = somajo.alignment.resolve_entities(xml)
self.assertEqual(res, resolved)
self.assertEqual(al, alignment)

def test_entities_02(self):
xml = "<foo>T&#x0065;st</foo>"
resolved = "<foo>Test</foo>"
alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22)]
res, al = somajo.alignment.resolve_entities(xml)
self.assertEqual(res, resolved)
self.assertEqual(al, alignment)


class TestTokenAlignment(unittest.TestCase):
def setUp(self):
"""Necessary preparations"""
@@ -102,43 +125,35 @@ def test_token_alignment_03(self):
def test_token_alignment_04(self):
self._equal("foo​bar foo­bar foo\ufeffbar foobarbazquxalphabetagamma foo‌bar‍baz foo‏bar‎baz foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta", ["foo​bar", "foo­bar", "foo\ufeffbar", "foobarbazquxalphabetagamma", "foo‌bar‍baz", "foo‏bar‎baz", "foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta"])

@unittest.expectedFailure
def test_token_alignment_05(self):
self._equal_xml("<foo>der beste Betreuer? - &gt;ProfSmith! : )</foo>", ["<foo>", "der", "beste", "Betreuer", "?", "- &gt;", "Prof", "Smith", "!", ": )", "</foo>"])

@unittest.expectedFailure
def test_token_alignment_06(self):
self._equal_xml("<foo>das steht auf S.&#x00ad;5</foo>", "<foo> das steht auf S. 5 </foo>")

@unittest.expectedFailure
def test_token_alignment_07(self):
self._equal_xml("<foo><bar>na so was -&#x200B;</bar><bar>&gt; bla</bar></foo>", "<foo> <bar> na so was - </bar> <bar> &gt; bla </bar> </foo>")

def test_token_alignment_08(self):
self._equal_xml("<foo>T&#x0065;st</foo>", "<foo> T&#x0065;st </foo>")

@unittest.expectedFailure
def test_token_alignment_09(self):
self._equal_xml("<foo>3 &#x003c; 5</foo>", "<foo> 3 &#x003c; 5 </foo>")

@unittest.expectedFailure
def test_token_alignment_10(self):
self._equal_xml("<foo>Test&#x00ad;fall</foo>", "<foo> Test&#x00ad;fall </foo>")

def test_token_alignment_11(self):
self._equal_xml("<foo>Test­fall</foo>", "<foo> Test­fall </foo>")

@unittest.expectedFailure
def test_token_alignment_12(self):
"""Single combining mark"""
self._equal_xml("<foo>foo xA&#x0308;x foo</foo>", "<foo> foo xA&#x0308;x foo </foo>")

@unittest.expectedFailure
def test_token_alignment_13(self):
"""Multiple combining marks"""
self._equal_xml("<foo>foo xs&#x0323;&#x0307;x foo</foo>", "<foo> foo xs&#x0323;&#x0307;x foo </foo>")

@unittest.expectedFailure
def test_token_alignment_14(self):
"""Multiple combining marks"""
self._equal_xml("<foo>foo xs&#x0307;&#x0323;x foo</foo>", "<foo> foo xs&#x0307;&#x0323;x foo </foo>")
@@ -151,15 +166,12 @@ def test_token_alignment_16(self):
"""Multiple combining marks"""
self._equal_xml("<foo>foo xq&#x0307;&#x0323;x foo</foo>", "<foo> foo xq&#x0307;&#x0323;x foo </foo>")

@unittest.expectedFailure
def test_xml_07(self):
self._equal_xml("<foo><text><p>blendend. 👱‍</p></text><text ><blockquote><p>Foo bar baz</p></blockquote></text></foo>", ["<foo>", "<text>", "<p>", "blendend", ".", "👱‍", "</p>", "</text>", "<text >", "<blockquote>", "<p>", "Foo", "bar", "baz", "</p>", "</blockquote>", "</text>", "</foo>"])

@unittest.expectedFailure
def test_xml_08(self):
self._equal_xml("<text><p>Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt.</p><p>So 🙇🙇 manchen Unionspolitikern gestehe ich schon …</p></text>", "<text> <p> Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt . </p> <p> So 🙇 🙇 manchen Unionspolitikern gestehe ich schon … </p> </text>")

@unittest.expectedFailure
def test_xml_09(self):
self._equal_xml("""<text>
<p>Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt.</p>
@@ -172,6 +184,7 @@ def test_xml_09(self):
def test_xml_10(self):
self._equal_xml("<foo><p>foo bar</p>\n\n<p>foo bar</p></foo>", "<foo> <p> foo bar </p> <p> foo bar </p> </foo>")

@unittest.expectedFailure
def test_xml_11(self):
self._equal_xml("<foo bar='baz'>Foo</foo>", ["<foo bar='baz'>", 'Foo', '</foo>'])

