Towards token offsets in XML input (part 2)
Thomas Proisl committed Oct 16, 2023
1 parent 22d5d90 commit 2e020d3
Showing 2 changed files with 63 additions and 50 deletions.
78 changes: 39 additions & 39 deletions src/somajo/alignment.py
@@ -31,14 +31,16 @@ def align_nfc(nfc, orig):
return alignment


def token_offsets(tokens, raw):
def token_offsets(tokens, raw, resolve_xml_entities=False):
"""Determine start and end positions of tokens in the original raw (NFC) input."""
offsets = []
raw_i = 0
for token in tokens:
text = token.text
if token.original_spelling is not None:
text = token.original_spelling
if resolve_xml_entities:
text, align_to_text = resolve_entities(text)
pattern = re.compile(".*?(" + ".*?".join([re.escape(c) for c in text]) + ")")
m = pattern.search(raw, pos=raw_i)
assert m
@@ -48,6 +50,32 @@ def token_offsets(tokens, raw):
return offsets


def resolve_entities(xml):
entity = re.compile(r"&(?:#\d+|#x[0-9a-f]+|amp|apos|gt|lt|quot);", re.I)
named = {"&amp;": "&", "&apos;": "'", "&gt;": ">", "&lt;": "<", "&quot;": '"'}
outstring = ""
alignment = []
xml_lower = xml.lower()  # lowercased copy only for matching; keep the original for the output
i = 0
for m in entity.finditer(xml):
start, end = m.span()
if xml_lower[start + 2] == "x":
char = chr(int(xml[start + 3:end - 1], base=16))
elif xml[start + 1] == "#":
char = chr(int(xml[start + 2:end - 1]))
else:
char = named[xml_lower[start:end]]
outstring += xml[i:start] + char
for j in range(i, start):
alignment.append((j, j + 1))
alignment.append((start, end))
i = end
outstring += xml[i:len(xml)]
for j in range(i, len(xml)):
alignment.append((j, j + 1))
return outstring, alignment
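
A minimal usage sketch (not part of the commit) of the values returned by resolve_entities: alignment[k] is the span in the original string that produced character k of the entity-resolved output, so characters that come from an entity reference map back to the full &…; span.

    from somajo.alignment import resolve_entities

    resolved, alignment = resolve_entities("<foo>T&#x0065;st</foo>")
    assert resolved == "<foo>Test</foo>"
    assert alignment[5] == (5, 6)    # ordinary characters map one-to-one
    assert alignment[6] == (6, 14)   # the "e" comes from the 8-character entity "&#x0065;"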


def token_offsets_xml(tokens, raw, tokenizer):
"""Determine start and end positions of tokens in the original raw
(NFC) input. Account for XML entities.
@@ -56,42 +84,14 @@ def token_offsets_xml(tokens, raw, tokenizer):
raw_i = 0
skip_pattern = "|".join([r"\s", "\uFE0F", tokenizer.controls.pattern, tokenizer.other_nasties.pattern])
skip = re.compile(skip_pattern)
for token in tokens:
text = token.text
if token.original_spelling is not None:
text = token.original_spelling
# print(text)
start, end = None, None
for i, char in enumerate(text):
while True:
# print(char, raw_i, raw[raw_i])
if char == raw[raw_i]:
s = raw_i
raw_i += 1
e = raw_i
break
elif ((char == "'") or (char == '"')) and ((raw[raw_i] == "'") or (raw[raw_i] == '"')):
s = raw_i
raw_i += 1
e = raw_i
break
elif raw[raw_i] == "&":
# TODO: process_entities(text, i, raw, raw_i)
s = raw_i
while raw[raw_i] != ";":
raw_i += 1
raw_i += 1
e = raw_i
entity = raw[s:e]
break
elif skip.match(raw[raw_i]):
raw_i += 1
continue
else:
raise ValueError(f"Cannot find char {char} from {text} in {raw[raw_i:raw_i + 20]}...")
if i == 0:
start = s
elif i == len(text) - 1:
end = e
offsets.append((start, end))
# resolve entities
raw_entityless, align_to_raw = resolve_entities(raw)
# convert to NFC
raw_nfc = unicodedata.normalize("NFC", raw_entityless)
# align NFC
align_to_entityless = align_nfc(raw_nfc, raw_entityless)
align_starts = {k[0]: v[0] for k, v in align_to_entityless.items()}
align_ends = {k[1]: v[1] for k, v in align_to_entityless.items()}
offsets = token_offsets(tokens, raw_nfc, resolve_xml_entities=True)
offsets = [(align_to_raw[align_starts[s]][0], align_to_raw[align_ends[e] - 1][1]) for s, e in offsets]
return offsets
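
A hedged sketch (also not part of the commit) of the back-mapping idea in the last two lines of token_offsets_xml: offsets computed on the entity-resolved string are translated back to spans in the raw XML via the alignment from resolve_entities. The NFC alignment step is skipped here because the example contains nothing to normalize, and the token span (5, 9) is an assumption standing in for what token_offsets would report for "Test".

    from somajo.alignment import resolve_entities

    raw = "<foo>T&#x0065;st</foo>"
    resolved, align_to_raw = resolve_entities(raw)   # resolved == "<foo>Test</foo>"
    s, e = 5, 9                                      # assumed span of the token "Test" in resolved
    raw_start = align_to_raw[s][0]                   # 5
    raw_end = align_to_raw[e - 1][1]                 # 16
    assert raw[raw_start:raw_end] == "T&#x0065;st"   # the raw span still covers the literal entity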
35 changes: 24 additions & 11 deletions tests/test_alignment.py
@@ -62,6 +62,29 @@ def test_nfc_07(self):
self.assertEqual(somajo.alignment.align_nfc(nfc, orig), alignment)


class TestResolveEntities(unittest.TestCase):
def test_entities_01(self):
xml = '<foo attr="bar &quot;baz&quot; qux">foo &lt;bar&gt; baz</foo>'
resolved = '<foo attr="bar "baz" qux">foo <bar> baz</foo>'
alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6),
(6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12),
(12, 13), (13, 14), (14, 15), (15, 21), (21, 22),
(22, 23), (23, 24), (24, 30), (30, 31), (31, 32),
(32, 33), (33, 34), (34, 35), (35, 36), (36, 37),
(37, 38), (38, 39), (39, 40), (40, 44), (44, 45),
(45, 46), (46, 47), (47, 51), (51, 52), (52, 53),
(53, 54), (54, 55), (55, 56), (56, 57), (57, 58),
(58, 59), (59, 60), (60, 61)]
res, al = somajo.alignment.resolve_entities(xml)
self.assertEqual(res, resolved)
self.assertEqual(al, alignment)

def test_entities_02(self):
xml = "<foo>T&#x0065;st</foo>"
resolved = "<foo>Test</foo>"
alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22)]
res, al = somajo.alignment.resolve_entities(xml)
self.assertEqual(res, resolved)
self.assertEqual(al, alignment)


class TestTokenAlignment(unittest.TestCase):
def setUp(self):
"""Necessary preparations"""
@@ -102,43 +125,35 @@ def test_token_alignment_03(self):
def test_token_alignment_04(self):
self._equal("foo​bar foo­bar foo\ufeffbar foobarbazquxalphabetagamma foo‌bar‍baz foo‏bar‎baz foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta", ["foo​bar", "foo­bar", "foo\ufeffbar", "foobarbazquxalphabetagamma", "foo‌bar‍baz", "foo‏bar‎baz", "foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta"])

@unittest.expectedFailure
def test_token_alignment_05(self):
self._equal_xml("<foo>der beste Betreuer? - &gt;ProfSmith! : )</foo>", ["<foo>", "der", "beste", "Betreuer", "?", "- &gt;", "Prof", "Smith", "!", ": )", "</foo>"])

@unittest.expectedFailure
def test_token_alignment_06(self):
self._equal_xml("<foo>das steht auf S.&#x00ad;5</foo>", "<foo> das steht auf S. 5 </foo>")

@unittest.expectedFailure
def test_token_alignment_07(self):
self._equal_xml("<foo><bar>na so was -&#x200B;</bar><bar>&gt; bla</bar></foo>", "<foo> <bar> na so was - </bar> <bar> &gt; bla </bar> </foo>")

def test_token_alignment_08(self):
self._equal_xml("<foo>T&#x0065;st</foo>", "<foo> T&#x0065;st </foo>")

@unittest.expectedFailure
def test_token_alignment_09(self):
self._equal_xml("<foo>3 &#x003c; 5</foo>", "<foo> 3 &#x003c; 5 </foo>")

@unittest.expectedFailure
def test_token_alignment_10(self):
self._equal_xml("<foo>Test&#x00ad;fall</foo>", "<foo> Test&#x00ad;fall </foo>")

def test_token_alignment_11(self):
self._equal_xml("<foo>Test­fall</foo>", "<foo> Test­fall </foo>")

@unittest.expectedFailure
def test_token_alignment_12(self):
"""Single combining mark"""
self._equal_xml("<foo>foo xA&#x0308;x foo</foo>", "<foo> foo xA&#x0308;x foo </foo>")

@unittest.expectedFailure
def test_token_alignment_13(self):
"""Multiple combining marks"""
self._equal_xml("<foo>foo xs&#x0323;&#x0307;x foo</foo>", "<foo> foo xs&#x0323;&#x0307;x foo </foo>")

@unittest.expectedFailure
def test_token_alignment_14(self):
"""Multiple combining marks"""
self._equal_xml("<foo>foo xs&#x0307;&#x0323;x foo</foo>", "<foo> foo xs&#x0307;&#x0323;x foo </foo>")
@@ -151,15 +166,12 @@ def test_token_alignment_16(self):
"""Multiple combining marks"""
self._equal_xml("<foo>foo xq&#x0307;&#x0323;x foo</foo>", "<foo> foo xq&#x0307;&#x0323;x foo </foo>")

@unittest.expectedFailure
def test_xml_07(self):
self._equal_xml("<foo><text><p>blendend. 👱‍</p></text><text ><blockquote><p>Foo bar baz</p></blockquote></text></foo>", ["<foo>", "<text>", "<p>", "blendend", ".", "👱‍", "</p>", "</text>", "<text >", "<blockquote>", "<p>", "Foo", "bar", "baz", "</p>", "</blockquote>", "</text>", "</foo>"])

@unittest.expectedFailure
def test_xml_08(self):
self._equal_xml("<text><p>Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt.</p><p>So 🙇🙇 manchen Unionspolitikern gestehe ich schon …</p></text>", "<text> <p> Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt . </p> <p> So 🙇 🙇 manchen Unionspolitikern gestehe ich schon … </p> </text>")

@unittest.expectedFailure
def test_xml_09(self):
self._equal_xml("""<text>
<p>Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt.</p>
@@ -172,6 +184,7 @@ def test_xml_09(self):
def test_xml_10(self):
self._equal_xml("<foo><p>foo bar</p>\n\n<p>foo bar</p></foo>", "<foo> <p> foo bar </p> <p> foo bar </p> </foo>")

@unittest.expectedFailure
def test_xml_11(self):
self._equal_xml("<foo bar='baz'>Foo</foo>", ["<foo bar='baz'>", 'Foo', '</foo>'])

