diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index e7a6729a..c6ad0aab 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -17,7 +17,7 @@ jobs:
       fail-fast: false
       matrix:
         # Remember to update "classifiers" in setup.py when changing Python version
-        python-version: [3.7, 3.8, 3.9, '3.10', '3.11']
+        python-version: [3.8, 3.9, '3.10', '3.11', '3.12']
 
     steps:
     - uses: actions/checkout@v3
diff --git a/README.md b/README.md
index 0855f74a..b7e98007 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,7 @@ Each segment should have a minimum set of keys `"session_id"` and `"words"`.
 Depending on the metric, additional keys may be required (`"speaker"`, `"start_time"`, `"end_time"`).
 
 An example is shown below:
-```py
+```python
 [
     {
         "session_id": "recordingA", # Required
@@ -98,7 +98,7 @@ Another example can be found [here](example_files/hyp.seglst.json).
 #### [Segmental Time Mark (STM)](https://github.com/usnistgov/SCTK/blob/master/doc/infmts.htm#L75)
 Each line in an `STM` file represents one "utterance" and is defined as
 
-```STM
+```
 STM :== <filename> <channel> <speaker_id> <begin_time> <end_time> <transcript>
 ```
 where
@@ -110,13 +110,12 @@ where
 - `transcript`: space-separated list of words
 
 for example:
-```
+```STM
 recording1 1 Alice 0 0 Hello Bob.
 recording1 1 Bob 1 0 Hello Alice.
 recording1 1 Alice 2 0 How are you?
-...
 recording2 1 Alice 0 0 Hello Carol.
-...
+;; ...
 ```
 
 An example `STM` file can be found in [here](example_files/ref.stm).
@@ -124,7 +123,7 @@ An example `STM` file can be found in [here](example_files/ref.stm).
 #### [Time Marked Conversation (CTM)](https://github.com/usnistgov/SCTK/blob/master/doc/infmts.htm#L286)
 The CTM format is defined as
 
-```CTM
+```
 CTM :== <filename> <channel> <begin_time> <duration> <word> [<confidence>]
 ```
 
diff --git a/setup.py b/setup.py
index 044638e4..1c5fd411 100644
--- a/setup.py
+++ b/setup.py
@@ -100,11 +100,11 @@
 
         # Specify the Python versions you support here. In particular, ensure
         # that you indicate whether you support Python 2, Python 3 or both.
-        'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
         'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
     ],
 
     python_requires=">=3.5",
diff --git a/tests/test_docs.py b/tests/test_docs.py
new file mode 100644
index 00000000..1c26e619
--- /dev/null
+++ b/tests/test_docs.py
@@ -0,0 +1,145 @@
+import pytest
+import re
+from pathlib import Path
+import ast
+
+MEETEVAL_ROOT = Path(__file__).parent.parent
+
+FENCED_CODE_BLOCK_REGEX = re.compile(r'```([^`\n]*)?\n((?:.|\n)*?)\n```')
+
+# List of language blocks that are not tested
+LANG_BLACKLIST = ['shell', 'bibtex', '']
+
+# Markdown files for which the context is kept between code blocks
+KEEP_CONTEXT = ['doc/algorithms.md']
+
+
+@pytest.fixture(scope='session')
+def global_state():
+    """Session-wide dict that carries state between the code blocks of the files listed in `KEEP_CONTEXT`."""
+    return {}
+
+
+def get_fenced_code_blocks(markdown_string: str):
+    """
+    Collects one (lang, code, lineno) tuple per fenced code block found in the markdown string.
+    lineno is the 1-based line of the opening ``` fence, i.e. the number of lines above the code.
+    """
+    blocks = []
+    for match in FENCED_CODE_BLOCK_REGEX.finditer(markdown_string):
+        lang, code = match.group(1, 2)
+        # Newlines in front of the fence give its 0-based line index; +1 turns it into a line number
+        fence_lineno = markdown_string.count('\n', 0, match.start()) + 1
+        blocks.append((lang, code, fence_lineno))
+    return blocks
+
+
+def split_code_block_comment_output(code):
+    """Splits `code` after every top-level `print` call that is directly followed by `#` comment
+    lines; those comments are taken as the expected output of the chunk that ends with the print.
+    Returns a list of `(code, expected_output, line_offset)` tuples; trailing code gets `''`.
+
+    >>> split_code_block_comment_output(r'''
+    ... # this is a comment
+    ... print('hello')
+    ... print('world')
+    ... # hello
+    ... # world
+    ...
+    ... # Here starts the second block
+    ... a = 2
+    ...
+    ... print(
+    ...     a
+    ... )
+    ... # 2
+    ... ''')
+    [("\\n# this is a comment\\nprint('hello')\\nprint('world')", ' hello\\n world', 0), ('\\n# Here starts the second block\\na = 2\\n\\nprint(\\n    a\\n)', ' 2', 6)]
+    """
+    c = ast.parse(code)
+    lines = code.splitlines()  # 0-based list, while the ast line numbers are 1-based
+    last_match = 0
+    blocks = []
+    l = 0
+    for s in c.body:
+        if l > s.end_lineno:
+            continue
+
+        # If we parsed a print statement at the root level
+        if isinstance(s, ast.Expr) and isinstance(s.value, ast.Call) and isinstance(s.value.func, ast.Name) and s.value.func.id == 'print':
+            # Collect any lines that follow directly and start with a # (none if the code ends here)
+            output = []
+            l = s.end_lineno  # 0-based index of the line right after the print statement
+            if l >= len(lines) or not lines[l].startswith('#'):
+                continue  # print is the last line or has no expected output: keep accumulating
+            while l < len(lines) and lines[l].startswith('#'):
+                output.append(lines[l][1:])
+                l += 1
+            blocks.append(('\n'.join(lines[last_match:s.end_lineno]), '\n'.join(output), last_match))
+            last_match = l
+    if last_match < len(lines):
+        blocks.append(('\n'.join(lines[last_match:]), '', last_match))
+    return blocks
+
+
+def exec_with_source(code, filename, lineno, globals_=None, locals_=None):
+    """
+    Like `compile` followed by `exec`, but shifts the line numbers of `code` by `lineno`
+    so that tracebacks show the position inside `filename` instead of inside the snippet.
+    Captures everything the code writes to stdout and returns it as a string.
+    """
+    compiled = ast.parse(code, str(filename), 'exec')
+    ast.increment_lineno(compiled, lineno)  # line 1 of `code` is reported as line 1 + lineno
+    compiled = compile(compiled, str(filename), 'exec', optimize=0)  # optimize=0 keeps asserts
+    from io import StringIO
+    from contextlib import redirect_stdout
+
+    f = StringIO()
+    with redirect_stdout(f):
+        exec(compiled, globals_, locals_)  # NOTE: with globals_=None this function's scope is used
+    return f.getvalue()
+
+
+@pytest.mark.parametrize(
+        ('filename', 'codeblock'), 
+        [  # one test case per fenced code block; the list is built when this module is imported
+            (str(filename.relative_to(MEETEVAL_ROOT)), codeblock)
+            for filename in MEETEVAL_ROOT.glob('**/*.md')
+            for codeblock in get_fenced_code_blocks(filename.read_text())
+        ]
+)
+def test_readme(filename, codeblock, global_state, monkeypatch):
+    """Run (python) or parse (STM) one fenced code block of a markdown file below `MEETEVAL_ROOT`."""
+    # Some code blocks in the readme file must run in the meeteval root directory
+    # because they access the example files in `MEETEVAL_ROOT/example_files`
+    monkeypatch.chdir(MEETEVAL_ROOT)
+
+    lang, code, lineno = codeblock
+    if lang in LANG_BLACKLIST:
+        return  # not a testable language: the test passes without running anything
+
+    try:
+        if lang == 'python':
+            if filename in KEEP_CONTEXT:  # filename is the path relative to MEETEVAL_ROOT as a str
+                globals_ = global_state.setdefault(filename, {})  # shared by all blocks of the file
+            else:
+                globals_ = {}
+            for code, expected_output, line_offset in split_code_block_comment_output(code):  # rebinds `code`
+                output = exec_with_source(code, str(filename), lineno + line_offset, globals_)
+                if expected_output is not None:  # NOTE(review): the splitter yields '' here, never None
+                    # Check that the output is equal to the expected output, but we want to ignore whitespace
+                    # for formatting / clarity reasons.
+                    # This is a very basic check that ignores all whitespace, but it should be
+                    # sufficient for most cases.
+                    output_ = output.replace(' ', '').replace('\n', '')
+                    expected_output_ = expected_output.replace(' ', '').replace('\n', '')
+                    assert output_ == expected_output_, f'Output mismatch: {output} != {expected_output}'
+        elif lang == 'STM':
+            # Test if the STM code block is valid.
+            import meeteval
+            meeteval.io.STM.parse(code)
+        else:
+            raise ValueError(f'Unsupported language: {lang}')
+    except Exception:
+        print(f'Error in {lang} code block:\n', code)  # for python, `code` is the chunk that failed
+        raise