Skip to content

Commit

Permalink
Merge pull request #77 from fgnt/test_readme
Browse files Browse the repository at this point in the history
Add testcase that runs code blocks in readme
  • Loading branch information
thequilo authored Jun 27, 2024
2 parents d96999c + 6bab678 commit 6e42e01
Show file tree
Hide file tree
Showing 4 changed files with 152 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
fail-fast: false
matrix:
# Remember to update "classifiers" in setup.py when changing Python version
python-version: [3.7, 3.8, 3.9, '3.10', '3.11']
python-version: [3.8, 3.9, '3.10', '3.11', '3.12']

steps:
- uses: actions/checkout@v3
Expand Down
11 changes: 5 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ Each segment should have a minimum set of keys `"session_id"` and `"words"`.
Depending on the metric, additional keys may be required (`"speaker"`, `"start_time"`, `"end_time"`).

An example is shown below:
```py
```python
[
{
"session_id": "recordingA", # Required
Expand All @@ -98,7 +98,7 @@ Another example can be found [here](example_files/hyp.seglst.json).
#### [Segmental Time Mark (STM)](https://github.com/usnistgov/SCTK/blob/master/doc/infmts.htm#L75)
Each line in an `STM` file represents one "utterance" and is defined as

```STM
```
STM :== <filename> <channel> <speaker_id> <begin_time> <end_time> <transcript>
```
where
Expand All @@ -110,21 +110,20 @@ where
- `transcript`: space-separated list of words

for example:
```
```STM
recording1 1 Alice 0 0 Hello Bob.
recording1 1 Bob 1 0 Hello Alice.
recording1 1 Alice 2 0 How are you?
...
recording2 1 Alice 0 0 Hello Carol.
...
;; ...
```

An example `STM` file can be found [here](example_files/ref.stm).

#### [Time Marked Conversation (CTM)](https://github.com/usnistgov/SCTK/blob/master/doc/infmts.htm#L286)
The CTM format is defined as

```CTM
```
CTM :== <filename> <channel> <begin_time> <duration> <word> [<confidence>]
```

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,11 @@

# Specify the Python versions you support here. In particular, ensure
# that you indicate whether you support Python 2, Python 3 or both.
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
],

python_requires=">=3.5",
Expand Down
145 changes: 145 additions & 0 deletions tests/test_docs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import pytest
import re
from pathlib import Path
import ast

# Directory two levels above this file (the parent of the directory that
# contains this test module).
MEETEVAL_ROOT = Path(__file__).parent.parent

# Matches a fenced code block. Group 1 captures the rest of the opening fence
# line (the language tag, possibly empty). Group 2 lazily captures everything
# up to the first subsequent line that begins with ```.
FENCED_CODE_BLOCK_REGEX = re.compile(r'```([^`\n]*)?\n((?:.|\n)*?)\n```')

# List of language blocks that are not tested
LANG_BLACKLIST = ['shell', 'bibtex', '']

# Markdown files for which the context is kept between code blocks
KEEP_CONTEXT = ['doc/algorithms.md']


@pytest.fixture(scope='session')
def global_state():
    """Provide one dictionary that is shared by all tests of the session.

    `test_readme` stores a separate globals namespace in it for every
    markdown file listed in `KEEP_CONTEXT`, so that later code blocks of such
    a file can use names defined by earlier ones.
    """
    shared_state = dict()
    return shared_state


def get_fenced_code_blocks(markdown_string: str):
    """
    Collect all fenced code blocks found in `markdown_string`.

    Returns a list with one tuple `(lang, code, lineno)` per block:
    `lang` is the text following the opening ``` (may be empty), `code` is
    the block content without the fences and `lineno` is the 1-based line
    number of the opening fence, i.e. line `k` of `code` is located on line
    `lineno + k` of the markdown string.
    """
    found = []
    for match in FENCED_CODE_BLOCK_REGEX.finditer(markdown_string):
        lang, code = match.groups()
        # Number of newlines in front of the match + 1 == line of the fence
        fence_lineno = markdown_string.count('\n', 0, match.start()) + 1
        found.append((lang, code, fence_lineno))
    return found


def split_code_block_comment_output(code):
    """Split a code block after every top-level `print` call whose following
    line is a comment.

    The comment lines directly after the print statement are expected to be
    the output of the code up to (and including) that print statement.

    Args:
        code: Python source code as a string. Must be parseable by
            `ast.parse`.

    Returns:
        A list of tuples `(code, expected_output, line_offset)`:
        `code` is the source of the sub-block, `expected_output` is the text
        of the comment lines that follow it (leading `#` removed, joined with
        newlines; `''` for a trailing sub-block without output comments) and
        `line_offset` is the 0-based index of the first line of the sub-block
        in the original code.

    Raises:
        SyntaxError: If `code` is not valid Python.

    >>> split_code_block_comment_output(r'''
    ... # this is a comment
    ... print('hello')
    ... print('world')
    ... # hello
    ... # world
    ...
    ... # Here starts the second block
    ... a = 2
    ...
    ... print(
    ...     a
    ... )
    ... # 2
    ... ''')
    [("\\n# this is a comment\\nprint('hello')\\nprint('world')", ' hello\\n world', 0), ('\\n# Here starts the second block\\na = 2\\n\\nprint(\\n    a\\n)', ' 2', 6)]

    A print call on the very last line has no output comment and simply ends
    up in the trailing block:

    >>> split_code_block_comment_output("print('x')")
    [("print('x')", '', 0)]
    """
    tree = ast.parse(code)
    lines = code.splitlines()
    last_match = 0  # 0-based index of the first line of the current sub-block
    blocks = []
    next_line = 0  # 0-based index of the line following the last inspected print
    for statement in tree.body:
        # Defensive: ignore statements that end before the position that was
        # already inspected / consumed as output.
        if next_line > statement.end_lineno:
            continue

        # Only print calls at the root level can terminate a sub-block
        is_root_level_print = (
            isinstance(statement, ast.Expr)
            and isinstance(statement.value, ast.Call)
            and isinstance(statement.value.func, ast.Name)
            and statement.value.func.id == 'print'
        )
        if not is_root_level_print:
            continue

        # `end_lineno` is 1-based, so it is the 0-based index of the line
        # that follows the print statement.
        next_line = statement.end_lineno

        # The print statement may be the last line of the code block. In that
        # case there is no following line (and hence no output comment).
        if next_line >= len(lines) or not lines[next_line].startswith('#'):
            continue

        # Collect any lines that follow directly and start with a #
        output = []
        while next_line < len(lines) and lines[next_line].startswith('#'):
            output.append(lines[next_line][1:])
            next_line += 1
        blocks.append((
            '\n'.join(lines[last_match:statement.end_lineno]),
            '\n'.join(output),
            last_match,
        ))
        last_match = next_line

    # Remaining code after the last output comment (no expected output)
    if last_match < len(lines):
        blocks.append(('\n'.join(lines[last_match:]), '', last_match))
    return blocks


def exec_with_source(code, filename, lineno, globals_=None, locals_=None):
    """
    Execute `code` and return everything it wrote to stdout as a string.

    Works like `compile` followed by `exec`, except that all line numbers of
    the parsed code are shifted by `lineno` before compiling, so that
    tracebacks point to the correct line of `filename`.
    `globals_` and `locals_` are passed on to `exec` unchanged.
    """
    from contextlib import redirect_stdout
    from io import StringIO

    source_name = str(filename)

    # Parse first so that the line numbers can be shifted before compiling
    tree = ast.parse(code, source_name, 'exec')
    ast.increment_lineno(tree, lineno)
    code_object = compile(tree, source_name, 'exec', optimize=0)

    captured = StringIO()
    with redirect_stdout(captured):
        exec(code_object, globals_, locals_)
    return captured.getvalue()


@pytest.mark.parametrize(
    ('filename', 'codeblock'),
    [
        (str(markdown_file.relative_to(MEETEVAL_ROOT)), block)
        for markdown_file in MEETEVAL_ROOT.glob('**/*.md')
        for block in get_fenced_code_blocks(markdown_file.read_text())
    ]
)
def test_readme(filename, codeblock, global_state, monkeypatch):
    """Check one fenced code block of a markdown file below `MEETEVAL_ROOT`.

    `python` blocks are executed and their stdout is compared with the
    comment lines that follow root-level print calls. `STM` blocks are parsed
    with `meeteval.io.STM.parse`. Blocks with a language from `LANG_BLACKLIST`
    are not tested, any other language raises a `ValueError`.
    """
    # The example files in `MEETEVAL_ROOT/example_files` are accessed with
    # relative paths by some code blocks, so run from the root directory.
    monkeypatch.chdir(MEETEVAL_ROOT)

    lang, code, lineno = codeblock
    if lang in LANG_BLACKLIST:
        return

    try:
        if lang == 'STM':
            # Test if the STM code block is valid.
            import meeteval
            meeteval.io.STM.parse(code)
        elif lang == 'python':
            # Files in KEEP_CONTEXT share one namespace across their blocks,
            # all other blocks start with a fresh one.
            namespace = (
                global_state.setdefault(filename, {})
                if filename in KEEP_CONTEXT
                else {}
            )
            # NOTE: `code` is rebound on purpose, the error message below
            # then shows the sub-block that failed.
            for code, expected_output, line_offset in split_code_block_comment_output(code):
                output = exec_with_source(code, str(filename), lineno + line_offset, namespace)
                if expected_output is None:
                    continue
                # Compare without any spaces and newlines: the expected output
                # in the docs may be formatted differently for clarity. This
                # is a very basic check, but sufficient for most cases.
                actual_stripped = output.replace('\n', '').replace(' ', '')
                expected_stripped = expected_output.replace('\n', '').replace(' ', '')
                assert actual_stripped == expected_stripped, f'Output mismatch: {output} != {expected_output}'
        else:
            raise ValueError(f'Unsupported language: {lang}')
    except Exception:
        print(f'Error in {lang} code block:\n', code)
        raise

0 comments on commit 6e42e01

Please sign in to comment.