Commit

Clean-up and documentation
Thomas Proisl committed Oct 25, 2023
1 parent 61006ab commit 8fa6e27
Showing 5 changed files with 87 additions and 83 deletions.
101 changes: 43 additions & 58 deletions src/somajo/alignment.py
@@ -5,24 +5,24 @@
import regex as re


ranges = [
_ranges = [
(0x0000, 0x001F),
(0x007F, 0x009F),
(0x2000, 0x200A), # whitespace
(0x200B, 0x200F),
(0x202A, 0x202E),
(0x2066, 0x2069)
]
single_characters = ["\u00AD", "\u061C", "\u2060", "\uFEFF", "\uFE0F"]
whitespace = [" ", "\u00A0", "\u1680", "\u2028", "\u2029", "\u202F", "\u205F", "\u3000"]
skipable_characters = set(single_characters + whitespace + [chr(i) for start, end in ranges for i in range(start, end + 1)])
_single_characters = ["\u00AD", "\u061C", "\u2060", "\uFEFF", "\uFE0F"]
_whitespace = [" ", "\u00A0", "\u1680", "\u2028", "\u2029", "\u202F", "\u205F", "\u3000"]
_skipable_characters = set(_single_characters + _whitespace + [chr(i) for start, end in _ranges for i in range(start, end + 1)])
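
For illustration only (not part of this commit): membership in the renamed `_skipable_characters` set is what later lets `_determine_offsets` step silently over invisible code points between a token and its match in the raw text. A minimal sketch, assuming direct access to the private module-level name:

from somajo.alignment import _skipable_characters

# Soft hyphen, zero-width space and narrow no-break space are all skipable;
# ordinary letters are not.
assert "\u00AD" in _skipable_characters   # from _single_characters
assert "\u200B" in _skipable_characters   # from the (0x200B, 0x200F) range
assert "\u202F" in _skipable_characters   # from _whitespace
assert "a" not in _skipable_characters
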


def _align_nfc(nfc, orig):
"""Character alignment from NFC version to original string."""
alignment = {}
if nfc == "":
assert orig == ""
assert orig == "", "NFC string is empty - expected original string to be also empty; it is '{orig}' instead"
return alignment
nfc_i, nfc_j = 0, 0
orig_i, orig_j = 0, 0
@@ -33,7 +33,7 @@ def _align_nfc(nfc, orig):
orig_j = orig_i + 1
while (orig_j < len(orig)) and (unicodedata.combining(orig[orig_j]) > 0):
orig_j += 1
assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j])
assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j]), f"'{nfc[nfc_i:nfc_j]}' != unicodedata.normalize('NFC', '{orig[orig_i:orig_j]}')"
alignment[(nfc_i, nfc_j)] = (orig_i, orig_j)
nfc_i = nfc_j
orig_i = orig_j
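
The new assertion message reports the exact spans that failed to match. As background, a minimal standard-library illustration (not part of the diff) of why `_align_nfc` has to map NFC spans to original spans rather than single characters:

import unicodedata

orig = "Cafe\u0301"                       # "Cafe" plus a combining acute accent
nfc = unicodedata.normalize("NFC", orig)  # the accent is composed into "é"
assert len(orig) == 5 and len(nfc) == 4
# _align_nfc(nfc, orig) therefore records span-to-span entries such as
# "the final NFC character corresponds to the final two original characters".
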
@@ -66,14 +66,15 @@ def _determine_offsets(tokens, raw, position):
end = j + 1
break
else:
assert raw[j] in skipable_characters, f"'{raw[j]}' ({hex(ord(raw[j]))}) not a skipable character; token: '{text}', raw: '{raw[raw_i:]}'"
assert raw[j] in _skipable_characters, f"'{raw[j]}' ({hex(ord(raw[j]))}) is not a skipable character; token: '{text}', raw: '{raw[raw_i:]}'"
raw_start = j + 1
offsets.append((start, end))
raw_i = end
return offsets


def _resolve_entities(xml):
"""Resolve XML entities and provide an alignment from output string to input string."""
entity = re.compile(r"&(?:#\d+|#x[0-9a-f]+|amp|apos|gt|lt|quot);", re.I)
named = {"&amp;": "&", "&apos;": "'", "&gt;": ">", "&lt;": "<", "&quot;": '"'}
outstring = ""
@@ -99,42 +100,8 @@ def _resolve_entities(xml):
return outstring, alignment
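
The new docstring makes the contract explicit: the second return value aligns every character of the entity-resolved output back to a span of the input. A small usage sketch (illustrative, not part of this commit; it assumes direct access to the private helper):

from somajo.alignment import _resolve_entities

text, align = _resolve_entities("Tom &amp; Jerry &lt;3")
assert text == "Tom & Jerry <3"
# `align` can be indexed by output position and yields (start, end) spans of
# the input, e.g. the "&" at output position 4 is expected to map back to
# the input span covering "&amp;".
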


def pretoken_offset_xml(token, raw):
# resolve entities
raw, align_to_raw = _resolve_entities(raw)
raw = re.sub(r"\s", " ", raw)
text = token.text
text = re.sub(r"\s", " ", text)
if token.markup:
text, align_to_text = _resolve_entities(text)
text = text.replace("'", '"')
if raw.startswith(text):
start = 0
end = len(text)
else:
pattern = "(" + re.escape(text) + ")"
pattern = pattern.replace(r"\ ", r"\s+")
pattern = pattern.replace("=", r"\s*=\s*")
if not text.startswith("</"):
pattern = pattern[:-2] + r"\s*/?\s*" + pattern[-2:]
local_raw = raw.replace("'", '"')
m = re.match(pattern, local_raw)
if text.startswith("</") and not m:
start, end = 0, 0
else:
assert m, f"'{text}' not found in '{local_raw}'"
start, end = m.span(1)
else:
assert raw.startswith(text), f"'{raw}' does not start with '{text}'"
start = 0
end = len(text)
if start == end:
return (align_to_raw[start][0], align_to_raw[start][0])
else:
return (align_to_raw[start][0], align_to_raw[end - 1][1])


def token_offsets(token_list, raw, position, xml_input, tokens):
"""Determine character offsets for tokens."""
if xml_input:
chunk_offsets = [(t.character_offset[0] - position, t.character_offset[1] - position) for t in token_list]
raw, align_to_entities = _resolve_entities(raw)
@@ -164,25 +131,43 @@ def token_offsets(token_list, raw, position, xml_input, tokens):
align_from_raw[align_from_entities[e - position - 1]][1] + position
)
offsets = _determine_offsets(tokens, raw_nfc, position)
assert len(tokens) == len(offsets)
assert len(tokens) == len(offsets), f"Not as many tokens as offsets: {len(tokens)} != {len(offsets)}"
offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in offsets]
if xml_input:
offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in offsets]
return offsets


def token_offsets_xml(tokens, raw):
"""Determine start and end positions of tokens in the original raw
(NFC) input. Account for XML entities.
"""
# resolve entities
raw_entityless, align_to_raw = _resolve_entities(raw)
# convert to NFC
raw_nfc = unicodedata.normalize("NFC", raw_entityless)
# align NFC
align_to_entityless = _align_nfc(raw_nfc, raw_entityless)
align_starts = {k[0]: v[0] for k, v in align_to_entityless.items()}
align_ends = {k[1]: v[1] for k, v in align_to_entityless.items()}
offsets = _determine_offsets(tokens, raw_nfc)
offsets = [(align_to_raw[align_starts[s]][0], align_to_raw[align_ends[e] - 1][1]) for s, e in offsets]
return offsets
def xml_chunk_offset(token, raw):
"""Determine character offset for an XML chunk created by `utils._xml_chunk_generator`."""
raw, align_to_raw = _resolve_entities(raw)
raw = re.sub(r"\s", " ", raw)
text = token.text
text = re.sub(r"\s", " ", text)
if token.markup:
text, align_to_text = _resolve_entities(text)
text = text.replace("'", '"')
if raw.startswith(text):
start = 0
end = len(text)
else:
pattern = "(" + re.escape(text) + ")"
pattern = pattern.replace(r"\ ", r"\s+")
pattern = pattern.replace("=", r"\s*=\s*")
if not text.startswith("</"):
pattern = pattern[:-2] + r"\s*/?\s*" + pattern[-2:]
local_raw = raw.replace("'", '"')
m = re.match(pattern, local_raw)
if text.startswith("</") and not m:
start, end = 0, 0
else:
assert m, f"'{text}' not found in '{local_raw}'"
start, end = m.span(1)
else:
assert raw.startswith(text), f"'{raw}' does not start with '{text}'"
start = 0
end = len(text)
if start == end:
return (align_to_raw[start][0], align_to_raw[start][0])
else:
return (align_to_raw[start][0], align_to_raw[end - 1][1])
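
Taken together, `_resolve_entities`, `_align_nfc` and `_determine_offsets` let `token_offsets` and `xml_chunk_offset` map every token back to a slice of the untouched input. A hypothetical end-to-end sketch of how this surfaces in the public API (the `character_offsets` keyword is an assumption, not shown in this diff):

from somajo import SoMaJo

tokenizer = SoMaJo("de_CMC", character_offsets=True)  # assumed keyword argument
raw = "Ein simpler Test."
for sentence in tokenizer.tokenize_text([raw]):
    for token in sentence:
        start, end = token.character_offset
        # Slicing the raw input with the reported offsets recovers each
        # token's surface form.
        print(repr(raw[start:end]), "->", repr(token.text))
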
1 change: 0 additions & 1 deletion src/somajo/somajo.py
@@ -3,7 +3,6 @@
import functools
import itertools
import multiprocessing
import unicodedata

from . import (
alignment,
34 changes: 26 additions & 8 deletions src/somajo/token.py
@@ -26,6 +26,10 @@ class Token:
Is it the first token of a sentence?
last_in_sentence : bool, (default=False)
Is it the last token of a sentence?
character_offset : tuple, (default=None)
Character offset of the token in the input as tuple `(start, end)`
such that `input[start:end] == text` (if there are no changes to
the token text during tokenization)
"""

@@ -50,19 +54,33 @@ class Token:
"time",
}

def __init__(self, text, *, markup=False, markup_class=None, markup_eos=None, locked=False, token_class=None, space_after=True, original_spelling=None, first_in_sentence=False, last_in_sentence=False, character_offset=None):
def __init__(
self,
text,
*,
markup=False,
markup_class=None,
markup_eos=None,
locked=False,
token_class=None,
space_after=True,
original_spelling=None,
first_in_sentence=False,
last_in_sentence=False,
character_offset=None
):
self.text = text
if markup:
assert markup_class is not None
assert markup_eos is not None
assert markup_class is not None, "You need to specify a `markup_class` for markup tokens."
assert markup_eos is not None, "You need to provide a value for `markup_eos` for markup tokens."
if markup_class is not None:
assert markup
assert markup_class == "start" or markup_class == "end"
assert markup, "You can only specify a `markup_class` for markup tokens."
assert markup_class == "start" or markup_class == "end", f"'{markup_class}' is not a recognized markup class."
if markup_eos is not None:
assert markup
assert isinstance(markup_eos, bool)
assert markup, "You can only use `markup_eos` for markup tokens."
assert isinstance(markup_eos, bool), f"'{markup_eos}' is not a Boolean value."
if token_class is not None:
assert token_class in self.token_classes, f"'{token_class}' is not a recognized token class"
assert token_class in self.token_classes, f"'{token_class}' is not a recognized token class."
self.markup = markup
self.markup_class = markup_class
self.markup_eos = markup_eos
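
The reformatted signature and the new assertion messages spell out the constructor contract. A brief sketch of that contract (illustrative, not part of the commit):

from somajo.token import Token

word = Token("Test", character_offset=(12, 16))  # i.e. input[12:16] == "Test"
tag = Token("<p>", markup=True, markup_class="start", markup_eos=True)

try:
    Token("<p>", markup=True)  # markup tokens must carry a markup_class
except AssertionError as error:
    print(error)  # "You need to specify a `markup_class` for markup tokens."
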
2 changes: 1 addition & 1 deletion src/somajo/utils.py
@@ -314,7 +314,7 @@ def _xml_chunk_generator(f, eos_tags=None, prune_tags=None, character_offsets=Fa
token.first_in_sentence = True
lexical_tokens += 1
if character_offsets:
token_start, token_end = alignment.pretoken_offset_xml(token, input_buffer)
token_start, token_end = alignment.xml_chunk_offset(token, input_buffer)
len_output_buffer = sum(len(o) for o in output_buffer)
token.character_offset = (token_start + position + len_output_buffer, token_end + position + len_output_buffer)
else:
32 changes: 17 additions & 15 deletions tests/test_alignment.py
@@ -90,7 +90,7 @@ def test_entities_02(self):
self.assertEqual(al, alignment)


class TestTokenAlignment(unittest.TestCase):
class TestDetermineOffsets(unittest.TestCase):
def setUp(self):
"""Necessary preparations"""
self.tokenizer = Tokenizer(split_camel_case=True, language="de_CMC")
@@ -104,20 +104,6 @@ def _equal(self, raw, tokenized):
offsets = somajo.alignment._determine_offsets(tokens, raw, position=0)
self.assertEqual([raw[s:e] for s, e in offsets], tokenized)

def _equal_xml(self, raw, tokenized):
raw = unicodedata.normalize("NFC", raw)
if isinstance(tokenized, str):
tokenized = tokenized.split()
eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
eos_tags = set(eos_tags)
chunk_info = utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, character_offsets=True)
chunk_lists = [ci[0] for ci in chunk_info]
token_dlls = map(DLL, chunk_lists)
chunks = map(self.tokenizer._tokenize, token_dlls)
complete = list(itertools.chain.from_iterable(chunks))
offsets = somajo.alignment.token_offsets(list(itertools.chain.from_iterable(chunk_lists)), raw, 0, True, complete)
self.assertEqual([raw[s:e] for s, e in offsets], tokenized)

def test_token_alignment_01(self):
self._equal("Ein simpler Test.", "Ein simpler Test .")

@@ -136,6 +122,22 @@ def test_token_alignment_04(self):
["foo​bar", "foo­bar", "foo\ufeffbar", "foobarbazquxalphabetagamma", "foo‌bar‍baz", "foo‏bar‎baz", "foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta"]
)


class TestTokenOffsets(unittest.TestCase):
def _equal_xml(self, raw, tokenized):
raw = unicodedata.normalize("NFC", raw)
if isinstance(tokenized, str):
tokenized = tokenized.split()
eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
eos_tags = set(eos_tags)
chunk_info = utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, character_offsets=True)
chunk_lists = [ci[0] for ci in chunk_info]
token_dlls = map(DLL, chunk_lists)
chunks = map(self.tokenizer._tokenize, token_dlls)
complete = list(itertools.chain.from_iterable(chunks))
offsets = somajo.alignment.token_offsets(list(itertools.chain.from_iterable(chunk_lists)), raw, 0, True, complete)
self.assertEqual([raw[s:e] for s, e in offsets], tokenized)

def test_token_alignment_05(self):
self._equal_xml(
"<foo>der beste Betreuer? - &gt;ProfSmith! : )</foo>",
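
The split into `TestDetermineOffsets` and `TestTokenOffsets` separates the plain-text path from the XML path. A hypothetical additional case in the style of `TestDetermineOffsets._equal` (not part of this commit), exercising the NFC alignment with non-ASCII input:

def test_token_alignment_cafe(self):
    # Offsets must survive NFC handling of accented characters.
    self._equal("Ein simpler Café.", "Ein simpler Café .")
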
