Commit

Clean-up and documentation
Thomas Proisl committed Oct 25, 2023
1 parent 61006ab commit 8fa6e27
Showing 5 changed files with 87 additions and 83 deletions.
101 changes: 43 additions & 58 deletions src/somajo/alignment.py
@@ -5,24 +5,24 @@
import regex as re


ranges = [
_ranges = [
(0x0000, 0x001F),
(0x007F, 0x009F),
(0x2000, 0x200A), # whitespace
(0x200B, 0x200F),
(0x202A, 0x202E),
(0x2066, 0x2069)
]
single_characters = ["\u00AD", "\u061C", "\u2060", "\uFEFF", "\uFE0F"]
whitespace = [" ", "\u00A0", "\u1680", "\u2028", "\u2029", "\u202F", "\u205F", "\u3000"]
skipable_characters = set(single_characters + whitespace + [chr(i) for start, end in ranges for i in range(start, end + 1)])
_single_characters = ["\u00AD", "\u061C", "\u2060", "\uFEFF", "\uFE0F"]
_whitespace = [" ", "\u00A0", "\u1680", "\u2028", "\u2029", "\u202F", "\u205F", "\u3000"]
_skipable_characters = set(_single_characters + _whitespace + [chr(i) for start, end in _ranges for i in range(start, end + 1)])
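
For illustration only (not part of this commit): membership in the renamed `_skipable_characters` set is what later lets `_determine_offsets` step silently over invisible code points between a token and its match in the raw text. A minimal sketch, assuming direct access to the private module-level name:

from somajo.alignment import _skipable_characters

# Soft hyphen, zero-width space and narrow no-break space are all skipable;
# ordinary letters are not.
assert "\u00AD" in _skipable_characters   # from _single_characters
assert "\u200B" in _skipable_characters   # from the (0x200B, 0x200F) range
assert "\u202F" in _skipable_characters   # from _whitespace
assert "a" not in _skipable_characters
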


def _align_nfc(nfc, orig):
"""Character alignment from NFC version to original string."""
alignment = {}
if nfc == "":
assert orig == ""
assert orig == "", "NFC string is empty - expected original string to be also empty; it is '{orig}' instead"
return alignment
nfc_i, nfc_j = 0, 0
orig_i, orig_j = 0, 0
@@ -33,7 +33,7 @@ def _align_nfc(nfc, orig):
orig_j = orig_i + 1
while (orig_j < len(orig)) and (unicodedata.combining(orig[orig_j]) > 0):
orig_j += 1
assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j])
assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j]), f"'{nfc[nfc_i:nfc_j]}' != unicodedata.normalize('NFC', '{orig[orig_i:orig_j]}')"
alignment[(nfc_i, nfc_j)] = (orig_i, orig_j)
nfc_i = nfc_j
orig_i = orig_j
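
The new assertion message reports the exact spans that failed to match. As background, a minimal standard-library illustration (not part of the diff) of why `_align_nfc` has to map NFC spans to original spans rather than single characters:

import unicodedata

orig = "Cafe\u0301"                       # "Cafe" plus a combining acute accent
nfc = unicodedata.normalize("NFC", orig)  # the accent is composed into "é"
assert len(orig) == 5 and len(nfc) == 4
# _align_nfc(nfc, orig) therefore records span-to-span entries such as
# "the final NFC character corresponds to the final two original characters".
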
@@ -66,14 +66,15 @@ def _determine_offsets(tokens, raw, position):
end = j + 1
break
else:
assert raw[j] in skipable_characters, f"'{raw[j]}' ({hex(ord(raw[j]))}) not a skipable character; token: '{text}', raw: '{raw[raw_i:]}'"
assert raw[j] in _skipable_characters, f"'{raw[j]}' ({hex(ord(raw[j]))}) is not a skipable character; token: '{text}', raw: '{raw[raw_i:]}'"
raw_start = j + 1
offsets.append((start, end))
raw_i = end
return offsets


def _resolve_entities(xml):
"""Resolve XML entities and provide an alignment from output string to input string."""
entity = re.compile(r"&(?:#\d+|#x[0-9a-f]+|amp|apos|gt|lt|quot);", re.I)
named = {"&amp;": "&", "&apos;": "'", "&gt;": ">", "&lt;": "<", "&quot;": '"'}
outstring = ""
@@ -99,42 +100,8 @@ def _resolve_entities(xml):
return outstring, alignment
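
The new docstring makes the contract explicit: the second return value aligns every character of the entity-resolved output back to a span of the input. A small usage sketch (illustrative, not part of this commit; it assumes direct access to the private helper):

from somajo.alignment import _resolve_entities

text, align = _resolve_entities("Tom &amp; Jerry &lt;3")
assert text == "Tom & Jerry <3"
# `align` can be indexed by output position and yields (start, end) spans of
# the input, e.g. the "&" at output position 4 is expected to map back to
# the input span covering "&amp;".
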


def pretoken_offset_xml(token, raw):
# resolve entities
raw, align_to_raw = _resolve_entities(raw)
raw = re.sub(r"\s", " ", raw)
text = token.text
text = re.sub(r"\s", " ", text)
if token.markup:
text, align_to_text = _resolve_entities(text)
text = text.replace("'", '"')
if raw.startswith(text):
start = 0
end = len(text)
else:
pattern = "(" + re.escape(text) + ")"
pattern = pattern.replace(r"\ ", r"\s+")
pattern = pattern.replace("=", r"\s*=\s*")
if not text.startswith("</"):
pattern = pattern[:-2] + r"\s*/?\s*" + pattern[-2:]
local_raw = raw.replace("'", '"')
m = re.match(pattern, local_raw)
if text.startswith("</") and not m:
start, end = 0, 0
else:
assert m, f"'{text}' not found in '{local_raw}'"
start, end = m.span(1)
else:
assert raw.startswith(text), f"'{raw}' does not start with '{text}'"
start = 0
end = len(text)
if start == end:
return (align_to_raw[start][0], align_to_raw[start][0])
else:
return (align_to_raw[start][0], align_to_raw[end - 1][1])


def token_offsets(token_list, raw, position, xml_input, tokens):
"""Determine character offsets for tokens."""
if xml_input:
chunk_offsets = [(t.character_offset[0] - position, t.character_offset[1] - position) for t in token_list]
raw, align_to_entities = _resolve_entities(raw)
@@ -164,25 +131,43 @@ def token_offsets(token_list, raw, position, xml_input, tokens):
align_from_raw[align_from_entities[e - position - 1]][1] + position
)
offsets = _determine_offsets(tokens, raw_nfc, position)
assert len(tokens) == len(offsets)
assert len(tokens) == len(offsets), f"Not as many tokens as offsets: {len(tokens)} != {len(offsets)}"
offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in offsets]
if xml_input:
offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in offsets]
return offsets


def token_offsets_xml(tokens, raw):
"""Determine start and end positions of tokens in the original raw
(NFC) input. Account for XML entities.
"""
# resolve entities
raw_entityless, align_to_raw = _resolve_entities(raw)
# convert to NFC
raw_nfc = unicodedata.normalize("NFC", raw_entityless)
# align NFC
align_to_entityless = _align_nfc(raw_nfc, raw_entityless)
align_starts = {k[0]: v[0] for k, v in align_to_entityless.items()}
align_ends = {k[1]: v[1] for k, v in align_to_entityless.items()}
offsets = _determine_offsets(tokens, raw_nfc)
offsets = [(align_to_raw[align_starts[s]][0], align_to_raw[align_ends[e] - 1][1]) for s, e in offsets]
return offsets
def xml_chunk_offset(token, raw):
"""Determine character offset for an XML chunk created by `utils._xml_chunk_generator`."""
raw, align_to_raw = _resolve_entities(raw)
raw = re.sub(r"\s", " ", raw)
text = token.text
text = re.sub(r"\s", " ", text)
if token.markup:
text, align_to_text = _resolve_entities(text)
text = text.replace("'", '"')
if raw.startswith(text):
start = 0
end = len(text)
else:
pattern = "(" + re.escape(text) + ")"
pattern = pattern.replace(r"\ ", r"\s+")
pattern = pattern.replace("=", r"\s*=\s*")
if not text.startswith("</"):
pattern = pattern[:-2] + r"\s*/?\s*" + pattern[-2:]
local_raw = raw.replace("'", '"')
m = re.match(pattern, local_raw)
if text.startswith("</") and not m:
start, end = 0, 0
else:
assert m, f"'{text}' not found in '{local_raw}'"
start, end = m.span(1)
else:
assert raw.startswith(text), f"'{raw}' does not start with '{text}'"
start = 0
end = len(text)
if start == end:
return (align_to_raw[start][0], align_to_raw[start][0])
else:
return (align_to_raw[start][0], align_to_raw[end - 1][1])
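
Taken together, `_resolve_entities`, `_align_nfc` and `_determine_offsets` let `token_offsets` and `xml_chunk_offset` map every token back to a slice of the untouched input. A hypothetical end-to-end sketch of how this surfaces in the public API (the `character_offsets` keyword is an assumption, not shown in this diff):

from somajo import SoMaJo

tokenizer = SoMaJo("de_CMC", character_offsets=True)  # assumed keyword argument
raw = "Ein simpler Test."
for sentence in tokenizer.tokenize_text([raw]):
    for token in sentence:
        start, end = token.character_offset
        # Slicing the raw input with the reported offsets recovers each
        # token's surface form.
        print(repr(raw[start:end]), "->", repr(token.text))
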
1 change: 0 additions & 1 deletion src/somajo/somajo.py
@@ -3,7 +3,6 @@
import functools
import itertools
import multiprocessing
import unicodedata

from . import (
alignment,
34 changes: 26 additions & 8 deletions src/somajo/token.py
@@ -26,6 +26,10 @@ class Token:
Is it the first token of a sentence?
last_in_sentence : bool, (default=False)
Is it the last token of a sentence?
character_offset : tuple, (default=None)
Character offset of the token in the input as tuple `(start, end)`
such that `input[start:end] == text` (if there are no changes to
the token text during tokenization)
"""

@@ -50,19 +54,33 @@ class Token:
"time",
}

def __init__(self, text, *, markup=False, markup_class=None, markup_eos=None, locked=False, token_class=None, space_after=True, original_spelling=None, first_in_sentence=False, last_in_sentence=False, character_offset=None):
def __init__(
self,
text,
*,
markup=False,
markup_class=None,
markup_eos=None,
locked=False,
token_class=None,
space_after=True,
original_spelling=None,
first_in_sentence=False,
last_in_sentence=False,
character_offset=None
):
self.text = text
if markup:
assert markup_class is not None
assert markup_eos is not None
assert markup_class is not None, "You need to specify a `markup_class` for markup tokens."
assert markup_eos is not None, "You need to provide a value for `markup_eos` for markup tokens."
if markup_class is not None:
assert markup
assert markup_class == "start" or markup_class == "end"
assert markup, "You can only specify a `markup_class` for markup tokens."
assert markup_class == "start" or markup_class == "end", f"'{markup_class}' is not a recognized markup class."
if markup_eos is not None:
assert markup
assert isinstance(markup_eos, bool)
assert markup, "You can only use `markup_eos` for markup tokens."
assert isinstance(markup_eos, bool), f"'{markup_eos}' is not a Boolean value."
if token_class is not None:
assert token_class in self.token_classes, f"'{token_class}' is not a recognized token class"
assert token_class in self.token_classes, f"'{token_class}' is not a recognized token class."
self.markup = markup
self.markup_class = markup_class
self.markup_eos = markup_eos
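
The reformatted signature and the new assertion messages spell out the constructor contract. A brief sketch of that contract (illustrative, not part of the commit):

from somajo.token import Token

word = Token("Test", character_offset=(12, 16))  # i.e. input[12:16] == "Test"
tag = Token("<p>", markup=True, markup_class="start", markup_eos=True)

try:
    Token("<p>", markup=True)  # markup tokens must carry a markup_class
except AssertionError as error:
    print(error)  # "You need to specify a `markup_class` for markup tokens."
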
2 changes: 1 addition & 1 deletion src/somajo/utils.py
@@ -314,7 +314,7 @@ def _xml_chunk_generator(f, eos_tags=None, prune_tags=None, character_offsets=Fa
token.first_in_sentence = True
lexical_tokens += 1
if character_offsets:
token_start, token_end = alignment.pretoken_offset_xml(token, input_buffer)
token_start, token_end = alignment.xml_chunk_offset(token, input_buffer)
len_output_buffer = sum(len(o) for o in output_buffer)
token.character_offset = (token_start + position + len_output_buffer, token_end + position + len_output_buffer)
else:
32 changes: 17 additions & 15 deletions tests/test_alignment.py
@@ -90,7 +90,7 @@ def test_entities_02(self):
self.assertEqual(al, alignment)


class TestTokenAlignment(unittest.TestCase):
class TestDetermineOffsets(unittest.TestCase):
def setUp(self):
"""Necessary preparations"""
self.tokenizer = Tokenizer(split_camel_case=True, language="de_CMC")
@@ -104,20 +104,6 @@ def _equal(self, raw, tokenized):
offsets = somajo.alignment._determine_offsets(tokens, raw, position=0)
self.assertEqual([raw[s:e] for s, e in offsets], tokenized)

def _equal_xml(self, raw, tokenized):
raw = unicodedata.normalize("NFC", raw)
if isinstance(tokenized, str):
tokenized = tokenized.split()
eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
eos_tags = set(eos_tags)
chunk_info = utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, character_offsets=True)
chunk_lists = [ci[0] for ci in chunk_info]
token_dlls = map(DLL, chunk_lists)
chunks = map(self.tokenizer._tokenize, token_dlls)
complete = list(itertools.chain.from_iterable(chunks))
offsets = somajo.alignment.token_offsets(list(itertools.chain.from_iterable(chunk_lists)), raw, 0, True, complete)
self.assertEqual([raw[s:e] for s, e in offsets], tokenized)

def test_token_alignment_01(self):
self._equal("Ein simpler Test.", "Ein simpler Test .")

@@ -136,6 +122,22 @@ def test_token_alignment_04(self):
["foo​bar", "foo­bar", "foo\ufeffbar", "foobarbazquxalphabetagamma", "foo‌bar‍baz", "foo‏bar‎baz", "foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta"]
)


class TestTokenOffsets(unittest.TestCase):
def _equal_xml(self, raw, tokenized):
raw = unicodedata.normalize("NFC", raw)
if isinstance(tokenized, str):
tokenized = tokenized.split()
eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
eos_tags = set(eos_tags)
chunk_info = utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, character_offsets=True)
chunk_lists = [ci[0] for ci in chunk_info]
token_dlls = map(DLL, chunk_lists)
chunks = map(self.tokenizer._tokenize, token_dlls)
complete = list(itertools.chain.from_iterable(chunks))
offsets = somajo.alignment.token_offsets(list(itertools.chain.from_iterable(chunk_lists)), raw, 0, True, complete)
self.assertEqual([raw[s:e] for s, e in offsets], tokenized)

def test_token_alignment_05(self):
self._equal_xml(
"<foo>der beste Betreuer? - &gt;ProfSmith! : )</foo>",
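
The split into `TestDetermineOffsets` and `TestTokenOffsets` separates the plain-text path from the XML path. A hypothetical additional case in the style of `TestDetermineOffsets._equal` (not part of this commit), exercising the NFC alignment with non-ASCII input:

def test_token_alignment_cafe(self):
    # Offsets must survive NFC handling of accented characters.
    self._equal("Ein simpler Café.", "Ein simpler Café .")
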
