
Commit

Start debugging…
tsproisl committed Oct 24, 2023
1 parent 07288ce commit cb52239
Showing 4 changed files with 133 additions and 32 deletions.
84 changes: 67 additions & 17 deletions src/somajo/alignment.py
@@ -7,7 +7,7 @@

def align_nfc(nfc, orig):
"""Character alignment from NFC version to original string."""
assert len(nfc) <= len(orig)
assert len(nfc) <= len(orig), f"len({nfc}) > len({orig})"
alignment = {}
if nfc == "":
assert orig == ""
@@ -23,11 +23,11 @@ def align_nfc(nfc, orig):
orig_j = orig_i + 1
while (orig_j < len(orig)) and (unicodedata.combining(orig[orig_j]) > 0):
orig_j += 1
assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j])
# assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j])
alignment[(nfc_i, nfc_j)] = (orig_i, orig_j)
nfc_i = nfc_j
orig_i = orig_j
assert orig_j == len(orig)
assert orig_j == len(orig), f"{orig_j} != {len(orig)}; nfc: '{nfc}', orig: '{orig}'"
return alignment
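
For context, a minimal sketch of the mapping align_nfc is expected to produce for a string with combining characters (the example and the expected dictionary are illustrative assumptions, not taken from the test suite):

import unicodedata
from somajo import alignment

orig = "Cafe\u0301"                        # "e" followed by a combining acute accent, 5 code points
nfc = unicodedata.normalize("NFC", orig)   # "Café" with a precomposed "é", 4 code points
print(alignment.align_nfc(nfc, orig))
# expected (illustrative): each NFC character maps to its span in the original string,
# the precomposed "é" covering "e" + U+0301:
# {(0, 1): (0, 1), (1, 2): (1, 2), (2, 3): (2, 3), (3, 4): (3, 5)}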


@@ -57,31 +57,81 @@ def resolve_entities(xml):
return outstring, alignment


def pretoken_offsets_xml(tokens, raw):
def pretoken_offset_xml(token, raw):
# resolve entities
raw_entityless, align_to_raw = resolve_entities(raw)
offsets = token_offsets(tokens, raw_entityless, xml_input=True)
offsets = [(align_to_raw[s][0], align_to_raw[e][1]) for s, e in offsets]
return offsets
raw, align_to_raw = resolve_entities(raw)
# print("align_to_raw", align_to_raw)
# offsets = token_offsets([token], raw_entityless)
###
raw = re.sub(r"\s", " ", raw)
text = token.text
if token.original_spelling is not None:
text = token.original_spelling
text = re.sub(r"\s", " ", text)
if token.markup:
text, align_to_text = resolve_entities(text)
text = text.replace("'", '"')
pattern = "(" + re.escape(text) + ")"
if not text.startswith("</"):
pattern = pattern[:-2] + r"/?\s*" + pattern[-2:]
local_raw = raw.replace("'", '"')
m = re.search(pattern, local_raw)
if text.startswith("</") and not m:
start, end = 0, 0
else:
assert m, f"'{text}' not found in '{local_raw}'"
start, end = m.span(1)
else:
pattern = "(" + re.escape(text) + ")"
m = re.search(pattern, raw)
assert m, f"'{text}' not found in '{raw}'"
start, end = m.span(1)
if start == end:
return (align_to_raw[start][0], align_to_raw[start][0])
else:
return (align_to_raw[start][0], align_to_raw[end - 1][1])
###
# print("tokens", [t.text for t in tokens])
# print("raw", f"'{raw}'")
# print("offsets", offsets)
# offsets = [(align_to_raw[s][0], align_to_raw[s][0]) if s == e else (align_to_raw[s][0], align_to_raw[e - 1][1]) for s, e in offsets]
# return offsets


def token_offsets(tokens, raw):
def token_offsets(tokens, raw, position):
"""Determine start and end positions of tokens in the original raw (NFC) input."""
skipable_characters = r"[\s\u0000-\u001F\u007F-\u009F\u00AD\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF\uFE0F]*?"
# skipable_characters = r".*?"
offsets = []
raw_i = 0
raw = re.sub(r"\s", " ", raw)
for token in tokens:
text = token.text
local_raw = raw
if token.original_spelling is not None:
text = token.original_spelling
text = re.sub(r"\s", " ", text)
if token.markup:
text, align_to_text = resolve_entities(text)
text = text.replace("'", '"')
local_raw = raw.replace("'", '"')
pattern = ".*?(" + ".*?".join([re.escape(c) for c in text]) + ")"
m = re.search(pattern, local_raw, pos=raw_i)
assert m
start, end = m.span(1)
start, end = token.character_offset
start -= position
end -= position
# text, align_to_text = resolve_entities(text)
# text = text.replace("'", '"')
# pattern = skipable_characters + "(" + skipable_characters.join([re.escape(c) for c in text])
# if not text.startswith("</"):
# pattern = pattern[:-1] + "/??" + skipable_characters + pattern[-1]
# pattern += ")"
# local_raw = raw.replace("'", '"')
# m = re.search(pattern, local_raw, pos=raw_i)
# if text.startswith("</") and not m:
# start, end = raw_i, raw_i
# else:
# assert m, f"'{text}' not found in '{local_raw[raw_i:]}'"
# start, end = m.span(1)
else:
pattern = skipable_characters + "(" + skipable_characters.join([re.escape(c) for c in text]) + ")"
m = re.search(pattern, raw, pos=raw_i)
assert m, f"'{text}' not found in '{raw[raw_i:]}'\n{[ord(c) for c in text]} not found in {[ord(c) for c in raw[raw_i:]]}"
start, end = m.span(1)
offsets.append((start, end))
raw_i = end
return offsets
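
The matching idea in token_offsets is that consecutive characters of a token may be separated in the raw input by whitespace or invisible control/format characters, which the skipable_characters class absorbs. A standalone sketch of that technique under simplified assumptions (the reduced character class and the example string are not from the module; a compiled pattern is used so the search can start at a given position):

import re

skipable = r"[\s\u00AD\u200B-\u200F\uFEFF]*?"   # reduced version of the character class above
text = "Test"
raw = "xx Te\u00ADst"                           # token preceded by other text, soft hyphen inside
pattern = re.compile(skipable + "(" + skipable.join(re.escape(c) for c in text) + ")")
m = pattern.search(raw, 3)                      # start matching at position 3
start, end = m.span(1)                          # (3, 8): span of "Te\u00ADst" in raw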
57 changes: 49 additions & 8 deletions src/somajo/somajo.py
@@ -44,7 +44,7 @@ class SoMaJo:
paragraph_separators = {"empty_lines", "single_newlines"}
_default_parsep = "empty_lines"

def __init__(self, language, *, split_camel_case=False, split_sentences=True, xml_sentences=None, character_offsets=False):
def __init__(self, language, *, split_camel_case=False, split_sentences=True, xml_sentences=None, character_offsets=True):
assert language in self.supported_languages
self.language = language
self.split_camel_case = split_camel_case
@@ -66,18 +66,59 @@ def _tokenize(self, token_info, xml_input):
token_list, raw, position = token_info
token_dll = doubly_linked_list.DLL(token_list)
tokens = self._tokenizer._tokenize(token_dll)
print([t.text for t in tokens])
print(raw)
if self.character_offsets:
if xml_input:
# print(len(raw), raw)
raw, align_to_entities = alignment.resolve_entities(raw)
raw_nfc = unicodedata.normalize("NFC", raw)
align_from_entities = {i: v for v, k in enumerate(align_to_entities) for i in range(k[0], k[1])}
# print(raw)
# print(align_to_entities)
last = 0
chunks = []
for end in [t.character_offset[1] for t in tokens if t.markup]:
end -= position
# print(end, len(align_to_entities))
end = align_from_entities[end - 1] + 1
# print(end)
chunks.append(raw[last:end])
# print((last, end))
last = end
if last != len(raw):
chunks.append(raw[last:len(raw)])
chunks = [unicodedata.normalize("NFC", c) for c in chunks]
raw_nfc = "".join(chunks)
else:
raw_nfc = unicodedata.normalize("NFC", raw)
align_to_raw = alignment.align_nfc(raw_nfc, raw)
align_starts = {k[0]: v[0] for k, v in align_to_raw.items()}
align_ends = {k[1]: v[1] for k, v in align_to_raw.items()}
offsets = alignment.token_offsets(tokens, raw_nfc)
offsets = [(align_to_raw[align_starts[s]][0], align_to_raw[align_ends[e] - 1][1]) for s, e in offsets]
if xml_input:
offsets = [(align_to_entities[s][0], align_to_entities[e][1]) for s, e in offsets]
align_from_raw = {i: k for k, v in align_to_raw.items() for i in range(v[0], v[1])}
# align_starts = {k[0]: v[0] for k, v in align_to_raw.items()}
# align_ends = {k[1]: v[1] for k, v in align_to_raw.items()}
align_to_starts = {i: v[0] for k, v in align_to_raw.items() for i in range(k[0], k[1])}
align_to_ends = {i: v[1] for k, v in align_to_raw.items() for i in range(k[0], k[1])}
# print(align_to_starts)
# print(align_to_ends)
print(align_from_entities)
print(align_from_raw)
for i in range(len(tokens)):
if tokens[i].markup:
s, e = tokens[i].character_offset
print(s, e)
s -= position
e -= position
print(s, e)
tokens[i].character_offset = (align_from_raw[align_from_entities[s]][0] + position, align_from_raw[align_from_entities[e - 1]][1] + position)
offsets = alignment.token_offsets(tokens, raw_nfc, position)
assert len(tokens) == len(offsets)
# print(offsets)
# print(align_to_raw)
# print(align_to_starts)
# print(align_to_ends)
offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in offsets]
# print(offsets)
if xml_input:
offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in offsets]
for i in range(len(tokens)):
tokens[i].character_offset = offsets[i]
if self.split_sentences:
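
The offset bookkeeping in _tokenize chains two mappings: token offsets computed on the NFC string are first projected onto the pre-NFC raw string via align_to_starts/align_to_ends and then, for XML input, onto the original input via align_to_entities. A reduced sketch of that composition with made-up dictionary contents:

# NFC position -> start / end of the covering span in the pre-NFC raw string
# (illustrative values; the last NFC character is precomposed and covers two raw characters)
align_to_starts = {0: 0, 1: 1, 2: 2, 3: 3}
align_to_ends = {0: 1, 1: 2, 2: 3, 3: 5}
# raw position -> span in the original XML (here the last raw character stems from an 8-character entity)
align_to_entities = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 12)]

nfc_offsets = [(0, 2), (2, 4)]
raw_offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in nfc_offsets]              # [(0, 2), (2, 5)]
xml_offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in raw_offsets]  # [(0, 2), (2, 12)]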
15 changes: 9 additions & 6 deletions src/somajo/utils.py
@@ -260,12 +260,6 @@ def _xml_chunk_generator(f, eos_tags=None, prune_tags=None, character_offsets=Fa
if character_offsets:
input_buffer += "".join(line_list)
for token in token_list:
if character_offsets:
token_end = alignment.pretoken_offsets_xml([token], input_buffer)[0][1]
else:
token_end = 0
output_buffer.append(input_buffer[:token_end])
input_buffer = input_buffer[token_end:]
if token.markup:
# markup
if token.markup_eos:
@@ -319,6 +313,15 @@ def _xml_chunk_generator(f, eos_tags=None, prune_tags=None, character_offsets=Fa
bos = False
token.first_in_sentence = True
lexical_tokens += 1
if character_offsets:
token_start, token_end = alignment.pretoken_offset_xml(token, input_buffer)
if token.markup:
len_output_buffer = sum(len(o) for o in output_buffer)
token.character_offset = (token_start + position + len_output_buffer, token_end + position + len_output_buffer)
else:
token_end = 0
output_buffer.append(input_buffer[:token_end])
input_buffer = input_buffer[token_end:]
current.append(token)
if len(current) > 0:
raw_xml = "".join(output_buffer)
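
The absolute character offset of a markup token above is assembled from three parts: the offset inside the remaining input_buffer, the position value passed along with the chunk, and the length of everything already moved to output_buffer. A small worked example with made-up numbers:

position = 100                      # position value for the current chunk
output_buffer = ["<doc>", "Foo "]   # already consumed, 9 characters in total
token_start, token_end = 0, 5       # offsets of the pretoken inside the remaining input_buffer
len_output_buffer = sum(len(o) for o in output_buffer)
character_offset = (token_start + position + len_output_buffer,
                    token_end + position + len_output_buffer)   # (109, 114)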
9 changes: 8 additions & 1 deletion tests/test_alignment.py
@@ -82,7 +82,9 @@ def test_entitites_01(self):
def test_entities_02(self):
xml = "<foo>T&#x0065;st</foo>"
resolved = "<foo>Test</foo>"
alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22)]
alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6),
(6, 14), (14, 15), (15, 16), (16, 17), (17, 18),
(18, 19), (19, 20), (20, 21), (21, 22)]
res, al = somajo.alignment.resolve_entities(xml)
self.assertEqual(res, resolved)
self.assertEqual(al, alignment)
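
Each entry of the expected alignment maps one character of the resolved string to its span in the original XML: the first six characters ("<foo>T") map one-to-one, the entity &#x0065; occupies positions 6–13 and resolves to the single character "e" (hence the entry (6, 14)), and the remaining characters again map one-to-one.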
@@ -180,3 +182,8 @@ def test_token_alignment_19(self):

def test_token_alignment_20(self):
self._equal_xml("<foo bar='ba\"z'>Foo \"Bar\" 'Baz'</foo>", ["<foo bar='ba\"z'>", "Foo", '"', "Bar", '"', "'", "Baz", "'", "</foo>"])


# :\n)
# <foo bar="baz"\nspam="eggs">
# <br/>
