Skip to content

Commit

Permalink
Towards token offsets in XML input
Browse files Browse the repository at this point in the history
  • Loading branch information
tsproisl committed Oct 14, 2023
1 parent fb2b1fc commit 22d5d90
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 3 deletions.
49 changes: 49 additions & 0 deletions src/somajo/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,52 @@ def token_offsets(tokens, raw):
offsets.append((start, end))
raw_i = end
return offsets


def token_offsets_xml(tokens, raw, tokenizer):
    """Determine start and end positions of tokens in the original raw
    (NFC) input, accounting for XML character/entity references.

    Arguments:
        tokens: iterable of token objects with ``text`` and
            ``original_spelling`` attributes.
        raw: the original input string the tokens were produced from.
        tokenizer: tokenizer whose ``controls`` and ``other_nasties``
            compiled patterns describe characters that may be skipped.

    Returns:
        List of ``(start, end)`` pairs such that ``raw[start:end]``
        covers the corresponding token.

    Raises:
        ValueError: if a token character cannot be located in ``raw``.
    """
    offsets = []
    raw_i = 0
    # Characters that may occur in the raw input without being part of
    # any token: whitespace, variation selector-16, control characters
    # and other junk the tokenizer strips.
    skip_pattern = "|".join([r"\s", "\uFE0F", tokenizer.controls.pattern, tokenizer.other_nasties.pattern])
    skip = re.compile(skip_pattern)
    for token in tokens:
        text = token.text
        if token.original_spelling is not None:
            text = token.original_spelling
        start, end = None, None
        for i, char in enumerate(text):
            while True:
                if char == raw[raw_i]:
                    s = raw_i
                    raw_i += 1
                    e = raw_i
                    break
                elif (char in "'\"") and (raw[raw_i] in "'\""):
                    # Tokenization may have normalized quote characters;
                    # accept any single/double quote in the raw input.
                    s = raw_i
                    raw_i += 1
                    e = raw_i
                    break
                elif raw[raw_i] == "&":
                    # An XML character/entity reference in the raw input;
                    # consume it as a whole (up to and including ";").
                    # TODO: process_entities(text, i, raw, raw_i)
                    s = raw_i
                    while raw[raw_i] != ";":
                        raw_i += 1
                    raw_i += 1
                    e = raw_i
                    break
                elif skip.match(raw[raw_i]):
                    raw_i += 1
                    continue
                else:
                    raise ValueError(f"Cannot find char {char} from {text} in {raw[raw_i:raw_i + 20]}...")
            if i == 0:
                start = s
            # Not elif: for single-character tokens i == 0 is also the
            # last index, and end must still be set.
            if i == len(text) - 1:
                end = e
        offsets.append((start, end))
    return offsets
44 changes: 41 additions & 3 deletions tests/test_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _equal_xml(self, raw, tokenized):
chunks = map(self.tokenizer._tokenize, token_dlls)
complete = list(itertools.chain.from_iterable(chunks))
complete = utils.escape_xml_tokens(complete)
offsets = somajo.alignment.token_offsets(complete, raw)
offsets = somajo.alignment.token_offsets_xml(complete, raw, self.tokenizer)
self.assertEqual([raw[s:e] for s, e in offsets], tokenized)

def test_token_alignment_01(self):
Expand All @@ -102,27 +102,64 @@ def test_token_alignment_03(self):
def test_token_alignment_04(self):
    """Alignment of tokens containing invisible characters (zero-width
    spaces/joiners, soft hyphens, BOM/ZWNBSP, directional marks and
    directional embedding/override controls)."""
    self._equal("foo​bar foo­bar foo\ufeffbar foobarbazquxalphabetagamma foo‌bar‍baz foo‏bar‎baz foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta", ["foo​bar", "foo­bar", "foo\ufeffbar", "foobarbazquxalphabetagamma", "foo‌bar‍baz", "foo‏bar‎baz", "foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta"])

@unittest.expectedFailure
def test_token_alignment_05(self):
    """Offsets when tokens span the &gt; character reference (known failure)."""
    raw = "<foo>der beste Betreuer? - &gt;ProfSmith! : )</foo>"
    expected = ["<foo>", "der", "beste", "Betreuer", "?", "- &gt;", "Prof", "Smith", "!", ": )", "</foo>"]
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_06(self):
    """Soft hyphen given as a numeric character reference (known failure)."""
    raw = "<foo>das steht auf S.&#x00ad;5</foo>"
    expected = "<foo> das steht auf S. 5 </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_07(self):
    """Zero-width space as a character reference across elements (known failure)."""
    raw = "<foo><bar>na so was -&#x200B;</bar><bar>&gt; bla</bar></foo>"
    expected = "<foo> <bar> na so was - </bar> <bar> &gt; bla </bar> </foo>"
    self._equal_xml(raw, expected)

def test_token_alignment_08(self):
    """Numeric character reference that expands to a plain letter."""
    raw = "<foo>T&#x0065;st</foo>"
    expected = "<foo> T&#x0065;st </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_09(self):
    """Character reference for '<' as a standalone token (known failure)."""
    raw = "<foo>3 &#x003c; 5</foo>"
    expected = "<foo> 3 &#x003c; 5 </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_10(self):
    """Soft-hyphen character reference inside a token (known failure)."""
    raw = "<foo>Test&#x00ad;fall</foo>"
    expected = "<foo> Test&#x00ad;fall </foo>"
    self._equal_xml(raw, expected)

def test_token_alignment_11(self):
    """Literal (unescaped) soft hyphen inside a token."""
    raw = "<foo>Test­fall</foo>"
    expected = "<foo> Test­fall </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_12(self):
    """Single combining mark as a character reference (known failure)."""
    raw = "<foo>foo xA&#x0308;x foo</foo>"
    expected = "<foo> foo xA&#x0308;x foo </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_13(self):
    """Multiple combining marks, dot-below before dot-above (known failure)."""
    raw = "<foo>foo xs&#x0323;&#x0307;x foo</foo>"
    expected = "<foo> foo xs&#x0323;&#x0307;x foo </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_token_alignment_14(self):
    """Multiple combining marks, dot-above before dot-below (known failure)."""
    raw = "<foo>foo xs&#x0307;&#x0323;x foo</foo>"
    expected = "<foo> foo xs&#x0307;&#x0323;x foo </foo>"
    self._equal_xml(raw, expected)

def test_token_alignment_15(self):
    """Multiple combining marks on a precomposed base character."""
    raw = "<foo>foo xs&#x1e0b;&#x0323;x foo</foo>"
    expected = "<foo> foo xs&#x1e0b;&#x0323;x foo </foo>"
    self._equal_xml(raw, expected)

def test_token_alignment_16(self):
    """Multiple combining marks on a base with no precomposed form."""
    raw = "<foo>foo xq&#x0307;&#x0323;x foo</foo>"
    expected = "<foo> foo xq&#x0307;&#x0323;x foo </foo>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_xml_07(self):
    """Token ending in an emoji with a trailing joiner, across sibling elements (known failure)."""
    raw = "<foo><text><p>blendend. 👱‍</p></text><text ><blockquote><p>Foo bar baz</p></blockquote></text></foo>"
    expected = ["<foo>", "<text>", "<p>", "blendend", ".", "👱‍", "</p>", "</text>", "<text >", "<blockquote>", "<p>", "Foo", "bar", "baz", "</p>", "</blockquote>", "</text>", "</foo>"]
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_xml_08(self):
    """Emoji and emoji-modifier sequences inside paragraph text (known failure)."""
    raw = "<text><p>Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt.</p><p>So 🙇🙇 manchen Unionspolitikern gestehe ich schon …</p></text>"
    expected = "<text> <p> Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt . </p> <p> So 🙇 🙇 manchen Unionspolitikern gestehe ich schon … </p> </text>"
    self._equal_xml(raw, expected)

@unittest.expectedFailure
def test_xml_09(self):
self._equal_xml("""<text>
<p>Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt.</p>
Expand All @@ -136,7 +173,8 @@ def test_xml_10(self):
self._equal_xml("<foo><p>foo bar</p>\n\n<p>foo bar</p></foo>", "<foo> <p> foo bar </p> <p> foo bar </p> </foo>")

def test_xml_11(self):
self._equal_xml("<foo bar='baz'>Foo</foo>", ['<foo bar="baz">', 'Foo', '</foo>'])
self._equal_xml("<foo bar='baz'>Foo</foo>", ["<foo bar='baz'>", 'Foo', '</foo>'])

@unittest.expectedFailure
def test_xml_12(self):
self._equal_xml("<foo bar='ba\"z'>Foo</foo>", ['<foo bar="ba&quot;z">', 'Foo', '</foo>'])
self._equal_xml("<foo bar='ba\"z'>Foo</foo>", ["<foo bar='ba\"z'>", 'Foo', '</foo>'])

0 comments on commit 22d5d90

Please sign in to comment.