Commit 225ad8c: XML chunk offsets
Thomas Proisl committed Oct 18, 2023 (1 parent: 3c665c5)
Showing 3 changed files with 49 additions and 15 deletions.
src/somajo/alignment.py: 9 additions & 1 deletion
```diff
@@ -57,6 +57,14 @@ def resolve_entities(xml):
     return outstring, alignment
 
 
+def pretoken_offsets_xml(tokens, raw):
+    # resolve entities
+    raw_entityless, align_to_raw = resolve_entities(raw)
+    offsets = token_offsets(tokens, raw_entityless, xml_input=True)
+    offsets = [(align_to_raw[s][0], align_to_raw[e][1]) for s, e in offsets]
+    return offsets
+
+
 def token_offsets(tokens, raw, xml_input=False):
     """Determine start and end positions of tokens in the original raw (NFC) input."""
     offsets = []
```
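The new `pretoken_offsets_xml` computes offsets on the entity-resolved text and then widens them back to spans in the raw XML via the alignment returned by `resolve_entities`. A minimal sketch of that widening step; the alignment representation below is invented for illustration and need not match what `resolve_entities` actually returns:

```python
# Raw XML next to its entity-resolved form:
raw = "A &amp; B"       # the "&amp;" entity spans raw[2:7]
resolved = "A & B"      # what token_offsets() operates on

# Invented alignment: one (start, end) raw span per resolved character.
spans = [(0, 1), (1, 2), (2, 7), (7, 8), (8, 9)]

def widen(s, e):
    """Map a (start, end) offset in `resolved` to a span in `raw`."""
    return spans[s][0], spans[e - 1][1]

s, e = widen(2, 3)      # the token "&" sits at resolved[2:3]
assert resolved[2:3] == "&"
assert raw[s:e] == "&amp;"
```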
```diff
@@ -78,7 +86,7 @@ def token_offsets(tokens, raw, xml_input=False):
     return offsets
 
 
-def token_offsets_xml(tokens, raw, tokenizer):
+def token_offsets_xml(tokens, raw):
     """Determine start and end positions of tokens in the original raw
     (NFC) input. Account for XML entities.
     """
```
src/somajo/utils.py: 39 additions & 13 deletions
```diff
@@ -6,6 +6,7 @@
 import xml.sax
 import xml.sax.saxutils
 
+import somajo.alignment
 from somajo.token import Token
 
 
```
```diff
@@ -110,21 +111,24 @@ def incremental_xml_parser(f, eos_tags=None, prune_tags=None):
     parser = xml.sax.make_parser(["xml.sax.xmlreader.IncrementalParser"])
     handler = SaxTokenHandler(eos_tags, prune_tags)
     parser.setContentHandler(handler)
+    line_buffer = []
     for line in f:
         parser.feed(line)
+        line_buffer.append(line)
         if len(handler.token_list) > 0:
-            yield handler.token_list
+            yield handler.token_list, line_buffer
             handler.token_list = []
+            line_buffer = []
     parser.close()
 
 
-def _xml_chunk_generator(f, eos_tags=None, prune_tags=None):
+def _xml_chunk_generator(f, eos_tags=None, prune_tags=None, character_offsets=False):
     """Parse the XML data and yield doubly linked lists of Token objects
     that are delimited by eos_tags.
     """
     non_whitespace = re.compile(r"\S")
-    token_lists = incremental_xml_parser(f, eos_tags, prune_tags)
+    token_and_line_lists = incremental_xml_parser(f, eos_tags, prune_tags)
     current = []
     bos, eos = True, False
     lexical_tokens = 0
```
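With this change, `incremental_xml_parser` yields `(token_list, line_buffer)` pairs, so the exact raw input lines behind each token batch stay available for later offset alignment. A consumption sketch; the XML snippet and the `eos_tags` value are illustrative assumptions:

```python
import io

from somajo.utils import incremental_xml_parser

xml_data = "<text>\n<s>Hello world</s>\n</text>\n"
for token_list, line_buffer in incremental_xml_parser(
        io.StringIO(xml_data), eos_tags={"s"}):
    # line_buffer holds the raw lines consumed while these tokens
    # were produced.
    print(len(token_list), repr("".join(line_buffer)))
```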
```diff
@@ -249,8 +253,19 @@ def _xml_chunk_generator(f, eos_tags=None, prune_tags=None):
     +----------------------+
     """
     del algo_dot, algo_sketch
-    for token_list in token_lists:
+    input_buffer = ""
+    output_buffer = []
+    position = 0
+    for token_list, line_list in token_and_line_lists:
+        if character_offsets:
+            input_buffer += "".join(line_list)
         for token in token_list:
+            if character_offsets:
+                token_end = somajo.alignment.pretoken_offsets_xml([token], input_buffer)[0][1]
+            else:
+                token_end = 0
+            output_buffer.append(input_buffer[:token_end])
+            input_buffer = input_buffer[token_end:]
             if token.markup:
                 # markup
                 if token.markup_eos:
```
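The bookkeeping introduced here: raw input accumulates in `input_buffer`, and for every token the prefix up to that token's end is moved into `output_buffer`, so the raw XML consumed by each token is preserved in order. A simplified standalone sketch of that splitting, assuming plain-text tokens with no entities or markup:

```python
raw = "Hello  world !"
tokens = ["Hello", "world", "!"]

input_buffer = raw
output_buffer = []
for tok in tokens:
    # stand-in for pretoken_offsets_xml([token], input_buffer)[0][1]
    token_end = input_buffer.index(tok) + len(tok)
    output_buffer.append(input_buffer[:token_end])
    input_buffer = input_buffer[token_end:]

# The pieces cover the input exactly, whitespace included:
assert "".join(output_buffer) == raw
```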
```diff
@@ -263,20 +278,27 @@ def _xml_chunk_generator(f, eos_tags=None, prune_tags=None):
                         eos = False
                         if lexical_tokens > 0:
                             # remove trailing opening tags from current
-                            temp_list = []
+                            temp_list, temp_output_buffer = [], []
                             while current[-1].markup_class == "start" or (not non_whitespace.search(current[-1].text)):
                                 temp_list.append(current.pop())
-                            yield current
+                                temp_output_buffer.append(output_buffer.pop())
+                            raw_xml = "".join(output_buffer)
+                            yield current, raw_xml, position
                             current = temp_list[::-1]
+                            output_buffer = temp_output_buffer[::-1]
+                            position += len(raw_xml)
                             lexical_tokens = 0
                     elif token.markup_class == "end":
                         eos = True
                 else:
                     if eos and token.markup_class == "start":
                         eos = False
                         if lexical_tokens > 0:
-                            yield current
+                            raw_xml = "".join(output_buffer)
+                            yield current, raw_xml, position
                             current = []
+                            output_buffer = []
+                            position += len(raw_xml)
                             lexical_tokens = 0
             else:
                 # non-markup
```
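Chunks are now yielded as `(current, raw_xml, position)` triples, where `position` advances by `len(raw_xml)` after each yield. A driver sketch that checks the resulting invariant; the input document and tag set are assumptions:

```python
import io

from somajo.utils import _xml_chunk_generator

xml_data = "<text>\n<s>One.</s>\n<s>Two.</s>\n</text>\n"
consumed = 0
for chunk, raw_xml, position in _xml_chunk_generator(
        io.StringIO(xml_data), eos_tags={"s"}, character_offsets=True):
    # position is the running character offset: the total length of
    # all raw_xml strings yielded before this chunk.
    assert position == consumed
    consumed += len(raw_xml)
```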
```diff
@@ -287,33 +309,37 @@ def _xml_chunk_generator(f, eos_tags=None, prune_tags=None):
                 if eos:
                     eos = False
                     if lexical_tokens > 0:
-                        yield current
+                        raw_xml = "".join(output_buffer)
+                        yield current, raw_xml, position
                         current = []
+                        output_buffer = []
+                        position += len(raw_xml)
                         lexical_tokens = 0
                 if bos:
                     bos = False
                     token.first_in_sentence = True
                 lexical_tokens += 1
             current.append(token)
     if len(current) > 0:
-        yield current
+        raw_xml = "".join(output_buffer)
+        yield current, raw_xml, position
 
 
-def xml_chunk_generator(data, is_file=True, eos_tags=None, prune_tags=None):
+def xml_chunk_generator(data, is_file=True, eos_tags=None, prune_tags=None, character_offsets=False):
     """Parse the XML data and yield doubly linked lists of Token objects
     that are delimited by eos_tags.
     """
     if is_file:
         if isinstance(data, str):
             with open(data, encoding="utf-8") as f:
-                for chunk in _xml_chunk_generator(f, eos_tags, prune_tags):
+                for chunk, raw_xml, position in _xml_chunk_generator(f, eos_tags, prune_tags, character_offsets):
                     yield chunk
         else:
-            for chunk in _xml_chunk_generator(data, eos_tags, prune_tags):
+            for chunk, raw_xml, position in _xml_chunk_generator(data, eos_tags, prune_tags, character_offsets):
                 yield chunk
     else:
-        for chunk in _xml_chunk_generator(io.StringIO(data), eos_tags, prune_tags):
+        for chunk, raw_xml, position in _xml_chunk_generator(io.StringIO(data), eos_tags, prune_tags, character_offsets):
             yield chunk
```
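Note that at this commit the public wrapper unpacks the triples but still yields only the chunk, so callers keep the old interface. A usage sketch; the file name and tag set are assumptions:

```python
from somajo import utils

for chunk in utils.xml_chunk_generator("corpus.xml", is_file=True,
                                       eos_tags={"s"},
                                       character_offsets=True):
    print(len(chunk))  # number of Token objects in this chunk
```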
tests/test_alignment.py: 1 addition & 1 deletion
```diff
@@ -113,7 +113,7 @@ def _equal_xml(self, raw, tokenized):
         chunks = map(self.tokenizer._tokenize, token_dlls)
         complete = list(itertools.chain.from_iterable(chunks))
         complete = utils.escape_xml_tokens(complete)
-        offsets = somajo.alignment.token_offsets_xml(complete, raw, self.tokenizer)
+        offsets = somajo.alignment.token_offsets_xml(complete, raw)
         self.assertEqual([raw[s:e] for s, e in offsets], tokenized)
 
     def test_token_alignment_01(self):
```
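The updated test helper drives the whole pipeline with the new two-argument `token_offsets_xml`. Reassembled as a standalone sketch; the `Tokenizer` import path, its constructor arguments, and the sample input are assumptions, while the rest mirrors the test code above:

```python
import itertools

import somajo.alignment
from somajo import utils
from somajo.tokenizer import Tokenizer  # assumed import path

raw = "<x>Hello &amp; goodbye</x>"       # hypothetical input
tokenizer = Tokenizer(language="de_CMC")  # assumed constructor
token_dlls = utils.xml_chunk_generator(raw, is_file=False, eos_tags={"x"})
chunks = map(tokenizer._tokenize, token_dlls)
complete = list(itertools.chain.from_iterable(chunks))
complete = utils.escape_xml_tokens(complete)
offsets = somajo.alignment.token_offsets_xml(complete, raw)
# Each (s, e) pair addresses the raw XML, entities included:
print([raw[s:e] for s, e in offsets])
```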
