diff --git a/src/somajo/alignment.py b/src/somajo/alignment.py index 968cfbe..c7a91a0 100644 --- a/src/somajo/alignment.py +++ b/src/somajo/alignment.py @@ -7,7 +7,7 @@ def align_nfc(nfc, orig): """Character alignment from NFC version to original string.""" - assert len(nfc) <= len(orig) + assert len(nfc) <= len(orig), f"len({nfc}) > len({orig})" alignment = {} if nfc == "": assert orig == "" @@ -23,11 +23,11 @@ def align_nfc(nfc, orig): orig_j = orig_i + 1 while (orig_j < len(orig)) and (unicodedata.combining(orig[orig_j]) > 0): orig_j += 1 - assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j]) + # assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j]) alignment[(nfc_i, nfc_j)] = (orig_i, orig_j) nfc_i = nfc_j orig_i = orig_j - assert orig_j == len(orig) + assert orig_j == len(orig), f"{orig_j} != {len(orig)}; nfc: '{nfc}', orig: '{orig}'" return alignment @@ -57,31 +57,81 @@ def resolve_entities(xml): return outstring, alignment -def pretoken_offsets_xml(tokens, raw): +def pretoken_offset_xml(token, raw): # resolve entities - raw_entityless, align_to_raw = resolve_entities(raw) - offsets = token_offsets(tokens, raw_entityless, xml_input=True) - offsets = [(align_to_raw[s][0], align_to_raw[e][1]) for s, e in offsets] - return offsets + raw, align_to_raw = resolve_entities(raw) + # print("align_to_raw", align_to_raw) + # offsets = token_offsets([token], raw_entityless) + ### + raw = re.sub(r"\s", " ", raw) + text = token.text + if token.original_spelling is not None: + text = token.original_spelling + text = re.sub(r"\s", " ", text) + if token.markup: + text, align_to_text = resolve_entities(text) + text = text.replace("'", '"') + pattern = "(" + re.escape(text) + ")" + if not text.startswith(" 0: raw_xml = "".join(output_buffer) diff --git a/tests/test_alignment.py b/tests/test_alignment.py index 28019cd..14844f3 100644 --- a/tests/test_alignment.py +++ b/tests/test_alignment.py @@ -82,7 +82,9 @@ def test_entitites_01(self): def test_entities_02(self): xml = "Test" resolved = "Test" - alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22)] + alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), + (6, 14), (14, 15), (15, 16), (16, 17), (17, 18), + (18, 19), (19, 20), (20, 21), (21, 22)] res, al = somajo.alignment.resolve_entities(xml) self.assertEqual(res, resolved) self.assertEqual(al, alignment) @@ -180,3 +182,8 @@ def test_token_alignment_19(self): def test_token_alignment_20(self): self._equal_xml("Foo \"Bar\" 'Baz'", ["", "Foo", '"', "Bar", '"', "'", "Baz", "'", ""]) + + +# :\n) +# +#