Skip to content

Commit

Permalink
✨ handle concatenated fields with inner quotes in splitter (#398)
Browse files Browse the repository at this point in the history
This allows the splitter to correctly handle #-based string concatenation.

Note: This will still lead to downstream problems, as some of these concatenated fields will not have a recognized enclosing, and as string interpolation does not yet work with concatenated references. However, these cases did not work before either, and this PR does not (knowingly) introduce any regressions. The problems mentioned here will be addressed in a subsequent PR.

This is the first PR to address (but not yet close) #396.
  • Loading branch information
MiWeiss authored Sep 18, 2023
1 parent c9190cf commit 084c3dd
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 35 deletions.
93 changes: 58 additions & 35 deletions bibtexparser/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,19 +128,64 @@ def _move_to_closed_bracket(self) -> int:
end_index=m.start() - 1,
)

def _move_to_end_of_double_quoted_string(self) -> int:
"""Index of the closing double quote."""
def _move_to_comma_or_closing_curly_bracket(
    self, currently_quote_escaped=False, num_open_curls=0
) -> int:
    """Return the index of the mark that ends the current field value.

    Consumes marks via ``self._next_mark`` while tracking whether the scan
    is inside a double-quoted section or inside curly brackets, so that a
    ``,`` or ``}`` occurring inside either one is not mistaken for the end
    of the field. This allows ``#``-concatenated values such as
    ``10 # "~" # jan`` to be scanned as a single value.

    Args:
        currently_quote_escaped: Whether the scan starts inside a
            double-quoted section.
        num_open_curls: Number of curly brackets considered open when the
            scan starts.

    Returns:
        ``start()`` of the unescaped ``,`` or ``}`` mark that terminates
        the field. That mark is stored in ``self._unaccepted_mark`` before
        returning.

    Raises:
        ParserStateException: If the method is entered with both an open
            quote and open curly brackets.
        BlockAbortedException: If a mark starting with ``@`` is seen
            before the end of the field was found.
    """
    if num_open_curls > 0 and currently_quote_escaped:
        raise ParserStateException(
            message="Internal error in parser. "
            "Found a field-value that is both quote-escaped and curly-escaped. "
            "Please report this bug."
        )

    # Iterate over marks until the end of the field is found.
    # NOTE(review): with accept_eof=False, `_next_mark` presumably raises
    # at end of input, which would be the only other way out of this loop
    # -- confirm against its definition.
    while True:
        next_mark = self._next_mark(accept_eof=False)
        token = next_mark.group(0)

        # --- Escape bookkeeping -------------------------------------
        # A quote only toggles the quote state outside of curly brackets.
        if token == '"' and num_open_curls <= 0:
            currently_quote_escaped = not currently_quote_escaped
            continue
        # Curly brackets only count outside of a quoted section.
        if token == "{" and not currently_quote_escaped:
            num_open_curls += 1
            continue
        if token == "}" and not currently_quote_escaped and num_open_curls > 0:
            num_open_curls -= 1
            continue

        escaped = currently_quote_escaped or num_open_curls > 0

        # --- End of field (`,`) or end of entry (`}`) ---------------
        if token in (",", "}") and not escaped:
            # Hand the terminating mark back, as the caller still
            # expects to see it.
            self._unaccepted_mark = next_mark
            return next_mark.start()

        # --- Sanity check: a new block starts, so abort this one ----
        if token.startswith("@"):
            self._unaccepted_mark = next_mark

            if currently_quote_escaped:
                looking_for = '`"`'
            elif num_open_curls > 0:
                looking_for = "`}`"
            else:
                looking_for = "`,` or `}`"

            raise BlockAbortedException(
                abort_reason=f"Unexpected block start: `{next_mark.group(0)}`. "
                f"Was still looking for field-value closing {looking_for} ",
                end_index=next_mark.start() - 1,
            )

        # Any other mark (including escaped `,` / `}` / `"` / `{`) is
        # irrelevant for finding the end of the field; keep scanning.

def _move_to_end_of_entry(
Expand Down Expand Up @@ -171,31 +216,9 @@ def _move_to_end_of_entry(
start_line = self._current_line
key_end = equals_mark.start()
value_start = equals_mark.end()
value_start_mark = self._next_mark(accept_eof=False)

if value_start_mark.group(0) == "{":
value_end = self._move_to_closed_bracket() + 1
elif value_start_mark.group(0) == '"':
value_end = self._move_to_end_of_double_quoted_string() + 1
else:
# e.g. String reference or integer. Ended by the observed mark
# (as there is no start mark).
# Should be either a comma or a "}"
value_start = equals_mark.end()
value_end = value_start_mark.start()
# We expect a comma (after a closed field-value), or at the end of entry, a closing bracket
if not value_start_mark.group(0) in [
",",
"}",
]:
self._unaccepted_mark = value_start_mark
raise BlockAbortedException(
abort_reason=f"Unexpected character `{value_start_mark.group(0)}` "
f"after field-value. Expected a comma or closing bracket.",
end_index=value_start_mark.start(),
)
# Put comma back into stream, as still expected.
self._unaccepted_mark = value_start_mark
value_end = self._move_to_comma_or_closing_curly_bracket(
currently_quote_escaped=False, num_open_curls=0
)

key = self.bibstr[key_start:key_end].strip()
value = self.bibstr[value_start:value_end].strip()
Expand Down
59 changes: 59 additions & 0 deletions tests/splitter_tests/test_splitter_entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,65 @@ def test_entry_without_fields(entry_without_fields: str):
assert len(library.entries[1].fields) == 1


@pytest.mark.parametrize(
    "entry, expected",
    [
        # See issue #396
        pytest.param(
            r'@INBOOK{inbook-full, relevant_field = 10 # "~" # jan}',
            r'10 # "~" # jan',
            id="inner quotes",
        ),
        pytest.param(
            r'@INBOOK{inbook-full, relevant_field = 10 # "~" # jan,}',
            r'10 # "~" # jan',
            id="inner quotes + comma",
        ),
        pytest.param(
            r'@INBOOK{inbook-full, relevant_field = 10 # "~" # jan, author = "Paul"}',
            r'10 # "~" # jan',
            id="inner quotes + other field",
        ),
        pytest.param(
            r'@INBOOK{inbook-full, relevant_field = "~" # jan}',
            r'"~" # jan',
            id=r"starting quotes",
        ),
        pytest.param(
            r'@INBOOK{inbook-full, relevant_field = "~" # jan, }',
            r'"~" # jan',
            id=r"starting quotes + comma",
        ),
        pytest.param(
            r'@INBOOK{inbook-full, relevant_field = "~" # jan, author = "Paul"}',
            r'"~" # jan',
            id="starting quotes + other field",
        ),
        pytest.param(
            r'@INBOOK{inbook-full, relevant_field = jan # "~"}',
            r'jan # "~"',
            id=r"ending quotes",
        ),
        pytest.param(
            r'@INBOOK{inbook-full, relevant_field = jan # "~",}',
            r'jan # "~"',
            id=r"ending quotes + comma",
        ),
        pytest.param(
            r'@INBOOK{inbook-full, relevant_field = jan # "~", author = "Paul"}',
            r'jan # "~"',
            id="ending quotes + other field",
        ),
    ],
)
def test_entry_with_concatenated_field(entry, expected):
    """Check that `#`-concatenated field values containing quotes stay whole.

    Each case places a double-quoted part at the start, middle or end of a
    concatenated value, optionally followed by a trailing comma or another
    field. The splitter must yield exactly one entry, no failed blocks, and
    the raw concatenation expression as the value of ``relevant_field``.

    For motivation why we need this, please see issue #396.
    """
    library: Library = Splitter(entry).split()
    assert len(library.entries) == 1
    assert len(library.failed_blocks) == 0
    assert library.entries[0]["relevant_field"] == expected


@pytest.mark.parametrize(
"entry",
[
Expand Down

0 comments on commit 084c3dd

Please sign in to comment.