From 5d5e71458e38a89137bd5dd73fc9dfea08004362 Mon Sep 17 00:00:00 2001 From: Thilo von Neumann Date: Mon, 10 Jun 2024 14:37:20 +0200 Subject: [PATCH 1/6] Add testcase that runs code blocks in readme --- README.md | 11 +++--- tests/test_docs.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 6 deletions(-) create mode 100644 tests/test_docs.py diff --git a/README.md b/README.md index 0855f74a..b7e98007 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ Each segment should have a minimum set of keys `"session_id"` and `"words"`. Depending on the metric, additional keys may be required (`"speaker"`, `"start_time"`, `"end_time"`). An example is shown below: -```py +```python [ { "session_id": "recordingA", # Required @@ -98,7 +98,7 @@ Another example can be found [here](example_files/hyp.seglst.json). #### [Segmental Time Mark (STM)](https://github.com/usnistgov/SCTK/blob/master/doc/infmts.htm#L75) Each line in an `STM` file represents one "utterance" and is defined as -```STM +``` STM :== ``` where @@ -110,13 +110,12 @@ where - `transcript`: space-separated list of words for example: -``` +```STM recording1 1 Alice 0 0 Hello Bob. recording1 1 Bob 1 0 Hello Alice. recording1 1 Alice 2 0 How are you? -... recording2 1 Alice 0 0 Hello Carol. -... +;; ... ``` An example `STM` file can be found in [here](example_files/ref.stm). @@ -124,7 +123,7 @@ An example `STM` file can be found in [here](example_files/ref.stm). 
#### [Time Marked Conversation (CTM)](https://github.com/usnistgov/SCTK/blob/master/doc/infmts.htm#L286) The CTM format is defined as -```CTM +``` CTM :== [] ``` diff --git a/tests/test_docs.py b/tests/test_docs.py new file mode 100644 index 00000000..7e7e661e --- /dev/null +++ b/tests/test_docs.py @@ -0,0 +1,99 @@ +import pytest +import re +from pathlib import Path +import ast + +FENCED_CODE_BLOCK_REGEX = re.compile(r'```([^`\n]*)?\n((?:.|\n)*?)\n```') + +# List of language blocks that are not tested +LANG_BLACKLIST = ['shell', 'bibtex', ''] + + +def get_fenced_code_blocks(markdown_string: str): + """ + Returns a list of tuples (lang, code, lineno) for each fenced code block in the markdown string. + lineno corresponds to the line where the code starts (after the opening ```). + """ + def get_lineno(offset): + return markdown_string[:offset].count('\n') + 1 + + return [ + (m.group(1), m.group(2), get_lineno(m.span()[0])) + for m in FENCED_CODE_BLOCK_REGEX.finditer(markdown_string) + ] + + +def split_code_block_comment_output(code): + """Splits a code block where a line starts with `print` and the following + line is a comment. + The comment is expected to be the output of the print statement. + + Returns a list of tuples with the code block and the expected output. + The print statement in the code block is replaced with `__output = ` so + that the result can be inspected after `exec`. 
+ """ + def get_line_offset(offset): + return code[:offset].count('\n') + + last_match = 0 + blocks = [] + for m in list(re.finditer(r'print\((.*)\)\n((?:#.*\n)*#.*)', code)): + expected_output = '\n'.join(s[2:] for s in m.group(2).split('\n')) + + blocks.append(( + code[last_match:m.span()[0]] + '__output = ' + m.group(1), + expected_output, + get_line_offset(last_match) + )) + last_match = m.span()[1] + if last_match < len(code) - 1: + blocks.append((code[last_match:], None, get_line_offset(last_match))) + return blocks + + +def exec_with_source(code, filename, lineno, globals_=None, locals_=None): + """ + Like `compile` followed by `exec`, but sets the correct line number for the code block. + This is required for correct traceback display. + """ + compiled = ast.parse(code, str(filename), 'exec') + ast.increment_lineno(compiled, lineno) + compiled = compile(compiled, str(filename), 'exec', optimize=0) + exec(compiled, globals_, locals_) + + +README = Path(__file__).parent.parent / 'README.md' + + +@pytest.mark.parametrize('codeblock', get_fenced_code_blocks(README.read_text())) +def test_readme(codeblock): + """Run fenced code blocks in readme isolated""" + import os + os.chdir(Path(__file__).parent.parent) + lang, code, lineno = codeblock + if lang in LANG_BLACKLIST: + return + + try: + if lang == 'python': + globals_ = {} + for code, expected_output, line_offset in split_code_block_comment_output(code): + exec_with_source(code, str(README), lineno + line_offset, globals_) + output = str(globals_.pop('__output', None)) + if expected_output is not None: + # Check that the output is equal to the expected output, but we want to ignore whitespace + # for formatting / clarity reasons. + # This is a very basic check that ignores all whitespace, but it should be + # sufficient for most cases. 
+ output_ = output.replace(' ', '').replace('\n', '') + expected_output_ = expected_output.replace(' ', '').replace('\n', '') + assert output_ == expected_output_, f'Output mismatch: {output} != {expected_output}' + elif lang == 'STM': + # Test if the STM code block is valid. + import meeteval + meeteval.io.STM.parse(code) + else: + raise ValueError(f'Unsupported language: {lang}') + except Exception: + print(f'Error in {lang} code block:\n', code) + raise From 16c136e5414454df4eb425d60f082dbd007ed77f Mon Sep 17 00:00:00 2001 From: Thilo von Neumann Date: Wed, 26 Jun 2024 13:43:13 +0200 Subject: [PATCH 2/6] Search for all markdown files and track global state --- tests/test_docs.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/tests/test_docs.py b/tests/test_docs.py index 7e7e661e..41751acc 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -3,11 +3,20 @@ from pathlib import Path import ast +MEETEVAL_ROOT = Path(__file__).parent.parent + FENCED_CODE_BLOCK_REGEX = re.compile(r'```([^`\n]*)?\n((?:.|\n)*?)\n```') # List of language blocks that are not tested LANG_BLACKLIST = ['shell', 'bibtex', ''] +KEEP_CONTEXT = ['doc/algorithms.md'] + + +@pytest.fixture(scope='session') +def global_state(): + return {} + def get_fenced_code_blocks(markdown_string: str): """ @@ -62,23 +71,31 @@ def exec_with_source(code, filename, lineno, globals_=None, locals_=None): exec(compiled, globals_, locals_) -README = Path(__file__).parent.parent / 'README.md' - - -@pytest.mark.parametrize('codeblock', get_fenced_code_blocks(README.read_text())) -def test_readme(codeblock): +@pytest.mark.parametrize( + ('filename', 'codeblock'), + [ + # pytest.param(filename, codeblock, id=f'{str(filename.relative_to(MEETEVAL_ROOT))}-codeblock{codeblock_index}') + (str(filename.relative_to(MEETEVAL_ROOT)), codeblock) + for filename in MEETEVAL_ROOT.glob('**/*.md') + for codeblock_index, codeblock in 
enumerate(get_fenced_code_blocks(filename.read_text())) + ] +) +def test_readme(filename, codeblock, global_state): """Run fenced code blocks in readme isolated""" import os - os.chdir(Path(__file__).parent.parent) + os.chdir(MEETEVAL_ROOT) lang, code, lineno = codeblock if lang in LANG_BLACKLIST: return try: if lang == 'python': - globals_ = {} + if filename in KEEP_CONTEXT: + globals_ = global_state.setdefault(filename, {}) + else: + globals_ = {} for code, expected_output, line_offset in split_code_block_comment_output(code): - exec_with_source(code, str(README), lineno + line_offset, globals_) + exec_with_source(code, str(filename), lineno + line_offset, globals_) output = str(globals_.pop('__output', None)) if expected_output is not None: # Check that the output is equal to the expected output, but we want to ignore whitespace From f3ac1b09279a7293206bbaa4131404275fd3b6db Mon Sep 17 00:00:00 2001 From: Thilo von Neumann Date: Wed, 26 Jun 2024 14:59:54 +0200 Subject: [PATCH 3/6] Improve splitting at comments by using ast and capture prints by redirect_stdout --- tests/test_docs.py | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/tests/test_docs.py b/tests/test_docs.py index 41751acc..2fd86c02 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -15,6 +15,7 @@ @pytest.fixture(scope='session') def global_state(): + """Used to track global state across code blocks in the files listed in `KEEP_CONTEXT`.""" return {} @@ -41,22 +42,22 @@ def split_code_block_comment_output(code): The print statement in the code block is replaced with `__output = ` so that the result can be inspected after `exec`.
""" - def get_line_offset(offset): - return code[:offset].count('\n') - + c = ast.parse(code) + lines = code.splitlines() last_match = 0 blocks = [] - for m in list(re.finditer(r'print\((.*)\)\n((?:#.*\n)*#.*)', code)): - expected_output = '\n'.join(s[2:] for s in m.group(2).split('\n')) - - blocks.append(( - code[last_match:m.span()[0]] + '__output = ' + m.group(1), - expected_output, - get_line_offset(last_match) - )) - last_match = m.span()[1] - if last_match < len(code) - 1: - blocks.append((code[last_match:], None, get_line_offset(last_match))) + for s in c.body: + # If we parsed a print statement at the root level + if isinstance(s, ast.Expr) and isinstance(s.value, ast.Call) and isinstance(s.value.func, ast.Name) and s.value.func.id == 'print': + # Collect any lines that follow directly and start with a # + output = [] + l = s.end_lineno + while l < len(lines) and lines[l].startswith('#'): + output.append(lines[l][1:]) + l += 1 + blocks.append(('\n'.join(lines[last_match:s.end_lineno]), '\n'.join(output), last_match)) + last_match = l + blocks.append(('\n'.join(lines[last_match:]), '', last_match)) return blocks @@ -68,20 +69,25 @@ def exec_with_source(code, filename, lineno, globals_=None, locals_=None): compiled = ast.parse(code, str(filename), 'exec') ast.increment_lineno(compiled, lineno) compiled = compile(compiled, str(filename), 'exec', optimize=0) - exec(compiled, globals_, locals_) + from io import StringIO + from contextlib import redirect_stdout + + f = StringIO() + with redirect_stdout(f): + exec(compiled, globals_, locals_) + return f.getvalue() @pytest.mark.parametrize( ('filename', 'codeblock'), [ - # pytest.param(filename, codeblock, id=f'{str(filename.relative_to(MEETEVAL_ROOT))}-codeblock{codeblock_index}') (str(filename.relative_to(MEETEVAL_ROOT)), codeblock) for filename in MEETEVAL_ROOT.glob('**/*.md') - for codeblock_index, codeblock in enumerate(get_fenced_code_blocks(filename.read_text())) + for codeblock in 
get_fenced_code_blocks(filename.read_text()) ] ) def test_readme(filename, codeblock, global_state): - """Run fenced code blocks in readme isolated""" + """Run fenced code blocks in markdown files in the MeetEval repository.""" import os os.chdir(MEETEVAL_ROOT) lang, code, lineno = codeblock @@ -95,8 +101,7 @@ def test_readme(filename, codeblock, global_state): else: globals_ = {} for code, expected_output, line_offset in split_code_block_comment_output(code): - exec_with_source(code, str(filename), lineno + line_offset, globals_) - output = str(globals_.pop('__output', None)) + output = exec_with_source(code, str(filename), lineno + line_offset, globals_) if expected_output is not None: # Check that the output is equal to the expected output, but we want to ignore whitespace # for formatting / clarity reasons. From 83c630b34a36e1f194ffe269fb67c76cc16c3f11 Mon Sep 17 00:00:00 2001 From: Thilo von Neumann Date: Wed, 26 Jun 2024 15:06:13 +0200 Subject: [PATCH 4/6] Remove py3.7 tests and add py3.12 tests --- .github/workflows/pytest.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index e7a6729a..c6ad0aab 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: # Remember to update "classifiers" in setup.py when changing Python version - python-version: [3.7, 3.8, 3.9, '3.10', '3.11'] + python-version: [3.8, 3.9, '3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v3 diff --git a/setup.py b/setup.py index 044638e4..1c5fd411 100644 --- a/setup.py +++ b/setup.py @@ -100,11 +100,11 @@ # Specify the Python versions you support here. In particular, ensure # that you indicate whether you support Python 2, Python 3 or both. 
- 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', ], python_requires=">=3.5", From 3ab77ba1f269b37a06a54514b5209eb9469fad03 Mon Sep 17 00:00:00 2001 From: Thilo von Neumann Date: Thu, 27 Jun 2024 08:28:25 +0200 Subject: [PATCH 5/6] Update docstrings --- tests/test_docs.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_docs.py b/tests/test_docs.py index 2fd86c02..e82e36c6 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -10,6 +10,7 @@ # List of language blocks that are not tested LANG_BLACKLIST = ['shell', 'bibtex', ''] +# Markdown files for which the context is kept between code blocks KEEP_CONTEXT = ['doc/algorithms.md'] @@ -37,10 +38,6 @@ def split_code_block_comment_output(code): """Splits a code block where a line starts with `print` and the following line is a comment. The comment is expected to be the output of the print statement. - - Returns a list of tuples with the code block and the expected output. - The print statement in the code block is replaced with `__output = ` so - that the result can be inspected after `exec`. """ c = ast.parse(code) lines = code.splitlines() @@ -65,6 +62,7 @@ def exec_with_source(code, filename, lineno, globals_=None, locals_=None): """ Like `compile` followed by `exec`, but sets the correct line number for the code block. This is required for correct traceback display. + Captures stdout and returns it as a string. 
""" compiled = ast.parse(code, str(filename), 'exec') ast.increment_lineno(compiled, lineno) From 6bab678080c36534e98de42c4a18d80aa8afb234 Mon Sep 17 00:00:00 2001 From: Thilo von Neumann Date: Thu, 27 Jun 2024 12:33:35 +0200 Subject: [PATCH 6/6] Review comments --- tests/test_docs.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/tests/test_docs.py b/tests/test_docs.py index e82e36c6..1c26e619 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -38,23 +38,47 @@ def split_code_block_comment_output(code): """Splits a code block where a line starts with `print` and the following line is a comment. The comment is expected to be the output of the print statement. + + >>> split_code_block_comment_output(r''' + ... # this is a comment + ... print('hello') + ... print('world') + ... # hello + ... # world + ... + ... # Here starts the second block + ... a = 2 + ... + ... print( + ... a + ... ) + ... # 2 + ... ''') + [("\\n# this is a comment\\nprint('hello')\\nprint('world')", ' hello\\n world', 0), ('\\n# Here starts the second block\\na = 2\\n\\nprint(\\n a\\n)', ' 2', 6)] """ c = ast.parse(code) lines = code.splitlines() last_match = 0 blocks = [] + l = 0 for s in c.body: + if l > s.end_lineno: + continue + # If we parsed a print statement at the root level if isinstance(s, ast.Expr) and isinstance(s.value, ast.Call) and isinstance(s.value.func, ast.Name) and s.value.func.id == 'print': # Collect any lines that follow directly and start with a # output = [] l = s.end_lineno + if not lines[l].startswith('#'): + continue while l < len(lines) and lines[l].startswith('#'): output.append(lines[l][1:]) l += 1 blocks.append(('\n'.join(lines[last_match:s.end_lineno]), '\n'.join(output), last_match)) last_match = l - blocks.append(('\n'.join(lines[last_match:]), '', last_match)) + if last_match < len(lines): + blocks.append(('\n'.join(lines[last_match:]), '', last_match)) return blocks @@ -84,10 +108,12 @@ def 
exec_with_source(code, filename, lineno, globals_=None, locals_=None): for codeblock in get_fenced_code_blocks(filename.read_text()) ] ) -def test_readme(filename, codeblock, global_state): +def test_readme(filename, codeblock, global_state, monkeypatch): """Run fenced code blocks in markdown files in the MeetEval repository.""" - import os - os.chdir(MEETEVAL_ROOT) + # Some code blocks in the readme file must run in the meeteval root directory + # because they access the example files in `MEETEVAL_ROOT/example_files` + monkeypatch.chdir(MEETEVAL_ROOT) + lang, code, lineno = codeblock if lang in LANG_BLACKLIST: return