From 22d5d902018e3f8d509a4af20bb25a5e989ef5a5 Mon Sep 17 00:00:00 2001
From: Thomas Proisl
Date: Sat, 14 Oct 2023 23:12:59 +0200
Subject: [PATCH] Towards token offsets in XML input

---
 src/somajo/alignment.py | 49 +++++++++++++++++++++++++++++++++++++++++
 tests/test_alignment.py | 44 +++++++++++++++++++++++++++++++++---
 2 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/src/somajo/alignment.py b/src/somajo/alignment.py
index 22f9374..fca0e50 100644
--- a/src/somajo/alignment.py
+++ b/src/somajo/alignment.py
@@ -46,3 +46,52 @@ def token_offsets(tokens, raw):
         offsets.append((start, end))
         raw_i = end
     return offsets
+
+
+def token_offsets_xml(tokens, raw, tokenizer):
+    """Determine start and end positions of tokens in the original raw
+    (NFC) input. Account for XML entities.
+    """
+    offsets = []
+    raw_i = 0
+    skip_pattern = "|".join([r"\s", "\uFE0F", tokenizer.controls.pattern, tokenizer.other_nasties.pattern])
+    skip = re.compile(skip_pattern)
+    for token in tokens:
+        text = token.text
+        if token.original_spelling is not None:
+            text = token.original_spelling
+        # print(text)
+        start, end = None, None
+        for i, char in enumerate(text):
+            while True:
+                # print(char, raw_i, raw[raw_i])
+                if char == raw[raw_i]:
+                    s = raw_i
+                    raw_i += 1
+                    e = raw_i
+                    break
+                elif ((char == "'") or (char == '"')) and ((raw[raw_i] == "'") or (raw[raw_i] == '"')):
+                    s = raw_i
+                    raw_i += 1
+                    e = raw_i
+                    break
+                elif raw[raw_i] == "&":
+                    # TODO: process_entities(text, i, raw, raw_i)
+                    s = raw_i
+                    while raw[raw_i] != ";":
+                        raw_i += 1
+                    raw_i += 1
+                    e = raw_i
+                    entity = raw[s:e]
+                    break
+                elif skip.match(raw[raw_i]):
+                    raw_i += 1
+                    continue
+                else:
+                    raise ValueError(f"Cannot find char {char} from {text} in {raw[raw_i:raw_i + 20]}...")
+            if i == 0:
+                start = s
+            elif i == len(text) - 1:
+                end = e
+        offsets.append((start, end))
+    return offsets
diff --git a/tests/test_alignment.py b/tests/test_alignment.py
index 6f43902..9f31320 100644
--- a/tests/test_alignment.py
+++ b/tests/test_alignment.py
@@ -87,7 +87,7 @@ def _equal_xml(self, raw, tokenized):
         chunks = map(self.tokenizer._tokenize, token_dlls)
         complete = list(itertools.chain.from_iterable(chunks))
         complete = utils.escape_xml_tokens(complete)
-        offsets = somajo.alignment.token_offsets(complete, raw)
+        offsets = somajo.alignment.token_offsets_xml(complete, raw, self.tokenizer)
         self.assertEqual([raw[s:e] for s, e in offsets], tokenized)
 
     def test_token_alignment_01(self):
@@ -102,27 +102,64 @@ def test_token_alignment_03(self):
     def test_token_alignment_04(self):
         self._equal("foo​bar foo­bar foo\ufeffbar foobarbazquxalphabetagamma foo‌bar‍baz foo‏bar‎baz foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta", ["foo​bar", "foo­bar", "foo\ufeffbar", "foobarbazquxalphabetagamma", "foo‌bar‍baz", "foo‏bar‎baz", "foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta"])
 
+    @unittest.expectedFailure
     def test_token_alignment_05(self):
         self._equal_xml("der beste Betreuer? - >ProfSmith! : )", ["", "der", "beste", "Betreuer", "?", "- >", "Prof", "Smith", "!", ": )", ""])
 
+    @unittest.expectedFailure
     def test_token_alignment_06(self):
         self._equal_xml("das steht auf S.­5", " das steht auf S. &#13;\n5 ")
 
+    @unittest.expectedFailure
     def test_token_alignment_07(self):
         self._equal_xml("na so was -​> bla", " na so was - > bla ")
 
     def test_token_alignment_08(self):
         self._equal_xml("Test", " Test ")
 
+    @unittest.expectedFailure
     def test_token_alignment_09(self):
         self._equal_xml("3 < 5", " 3 < 5 ")
 
+    @unittest.expectedFailure
+    def test_token_alignment_10(self):
+        self._equal_xml("Test­fall", " Test­fall ")
+
+    def test_token_alignment_11(self):
+        self._equal_xml("Test­fall", " Test­fall ")
+
+    @unittest.expectedFailure
+    def test_token_alignment_12(self):
+        """Single combining mark"""
+        self._equal_xml("foo xÄx foo", " foo xÄx foo ")
+
+    @unittest.expectedFailure
+    def test_token_alignment_13(self):
+        """Multiple combining marks"""
+        self._equal_xml("foo xṩx foo", " foo xṩx foo ")
+
+    @unittest.expectedFailure
+    def test_token_alignment_14(self):
+        """Multiple combining marks"""
+        self._equal_xml("foo xṩx foo", " foo xṩx foo ")
+
+    def test_token_alignment_15(self):
+        """Multiple combining marks"""
+        self._equal_xml("foo xsḍ̇x foo", " foo xsḍ̇x foo ")
+
+    def test_token_alignment_16(self):
+        """Multiple combining marks"""
+        self._equal_xml("foo xq̣̇x foo", " foo xq̣̇x foo ")
+
+    @unittest.expectedFailure
     def test_xml_07(self):
         self._equal_xml("\n\nblendend. 👱‍\n\nFoo bar baz\n\n", ["", "", "\n\n", "blendend", ".", "👱‍", "\n\n", "\n", "", "\n", "\n\n", "Foo", "bar", "baz", "\n\n", "\n", "\n", "\n"])
 
+    @unittest.expectedFailure
     def test_xml_08(self):
         self._equal_xml("\n\nJens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt.\n\nSo 🙇🙇 manchen Unionspolitikern gestehe ich schon …\n\n", "\n\nJens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt .\n\nSo 🙇 🙇 manchen Unionspolitikern gestehe ich schon …\n\n")
 
+    @unittest.expectedFailure
     def test_xml_09(self):
         self._equal_xml("""
 
 Jens Spahn ist 🏽🏽 ein durch und durch ekelerregendes Subjekt.
 
@@ -136,7 +173,8 @@ def test_xml_10(self):
         self._equal_xml("\n\nfoo bar\n\n\\n\\n\n\nfoo bar\n\n", "\n\nfoo bar\n\nfoo bar\n\n")
 
     def test_xml_11(self):
-        self._equal_xml("Foo", ['', 'Foo', ''])
+        self._equal_xml("Foo", ["", 'Foo', ''])
 
+    @unittest.expectedFailure
     def test_xml_12(self):
-        self._equal_xml("Foo", ['', 'Foo', ''])
+        self._equal_xml("Foo", ["", 'Foo', ''])