diff --git a/pyproject.toml b/pyproject.toml index e38a537..43fbb6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,10 +104,12 @@ ignore = [ "E402", # Module level import not at top of file "E731", # Do not assign a lambda expression, use a def "E741", # Ambiguous variable name + "PLR2004", # magic-value-comparison ] fix = true unfixable = [ "F401", # unused imports + "F841", # unused variables ] [tool.ruff.per-file-ignores] diff --git a/src/clean_notebook/clean.py b/src/clean_notebook/clean.py index a89b0fe..fc0935f 100644 --- a/src/clean_notebook/clean.py +++ b/src/clean_notebook/clean.py @@ -1,9 +1,12 @@ from __future__ import annotations import json +import uuid from pathlib import Path from typing import Any, AnyStr, Iterator +__all__ = ("clean_notebook", "clean_single_notebook") + def clean_notebook( paths: list[str | Path], @@ -29,6 +32,11 @@ def find_line_ending(s: AnyStr) -> AnyStr: return counter[max(counter)] +def _check_set_id(nb: dict[str, Any]) -> bool: + # https://jupyter.org/enhancement-proposals/62-cell-id/cell-id.html + return (nb["nbformat"] == 4 and nb["nbformat_minor"] >= 5) or nb["nbformat"] >= 5 + + def clean_single_notebook( file: Path, *, @@ -42,6 +50,7 @@ def clean_single_notebook( newline = find_line_ending(raw) nb = json.loads(raw) + set_id = _check_set_id(nb) cleaned = False for cell in nb["cells"].copy(): cleaned |= _update_value(cell, "outputs", []) @@ -53,6 +62,9 @@ def clean_single_notebook( if "attachments" in cell and len(cell["attachments"]) == 0: del cell["attachments"] cleaned = True + if set_id and cell.get("id") is None: + cell["id"] = str(uuid.uuid4()) + cleaned = True if not nb["cells"]: print(f"Notebook '{file}' does not have any valid cells.") @@ -63,7 +75,7 @@ def clean_single_notebook( if cleaned and not dryrun: with open(file, "w", encoding="utf8", newline=newline) as f: - json.dump(nb, f, indent=1, ensure_ascii=False) + json.dump(nb, f, indent=1, ensure_ascii=False, sort_keys=True) f.write(newline) # empty line at the end of the file print(f"Cleaned notebook: {file}") elif cleaned: diff --git a/tests/data/clean_colab.ipynb b/tests/data/clean_colab.ipynb index 38eb476..b83cb28 100644 --- a/tests/data/clean_colab.ipynb +++ b/tests/data/clean_colab.ipynb @@ -1,12 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, "cells": [ { "cell_type": "code", @@ -21,21 +13,29 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(\"saved from colab\")" - ], - "metadata": {}, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "" - ], - "metadata": {}, - "execution_count": null, - "outputs": [] + ] } - ] + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/tests/data/clean_id.ipynb b/tests/data/clean_id.ipynb new file mode 100644 index 0000000..3a83443 --- /dev/null +++ b/tests/data/clean_id.ipynb @@ -0,0 +1,38 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3d183bd1-509f-4758-9f2d-db94b23c58f9", + "metadata": {}, + "outputs": [], + "source": [ + "a = 2" + ] + }, + { + "cell_type": "markdown", + "id": "66cdc779-4931-4306-881a-4bf30cb0fdbb", + "metadata": {}, + "source": [ + "Markdown" + ] + }, + { + "cell_type": "raw", + "id": "5cbb8154-79ee-4290-953c-89a89b4276b7", + "metadata": {}, + "source": [ + "Raw" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/data/dirty_id.ipynb b/tests/data/dirty_id.ipynb new file mode 100644 index 0000000..b1fa9f7 --- /dev/null +++ b/tests/data/dirty_id.ipynb @@ -0,0 +1,42 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "a = 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Markdown" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "Raw" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/test_clean_notebook.py b/tests/test_clean_notebook.py index 6be68e1..8c874c6 100644 --- a/tests/test_clean_notebook.py +++ b/tests/test_clean_notebook.py @@ -1,5 +1,6 @@ from __future__ import annotations +import uuid from pathlib import Path from shutil import copy2, rmtree from typing import TYPE_CHECKING, Iterator @@ -10,6 +11,7 @@ if TYPE_CHECKING: from _pytest.capture import CaptureFixture + from _pytest.monkeypatch import MonkeyPatch from _pytest.tmpdir import TempPathFactory @@ -34,7 +36,7 @@ def temp_path(tmp_path_factory: TempPathFactory) -> Iterator[Path]: TESTS = ["ascii", "jupyterlab", "vscode", "colab", "empty_cell", "empty_multi_cell"] -@pytest.mark.parametrize("test", [*TESTS, "ignore_slideshow"]) +@pytest.mark.parametrize("test", [*TESTS, "ignore_slideshow", "id"]) def test_noclean_notebook(temp_path: Path, test: str) -> None: dirty = temp_path / f"dirty_{test}.ipynb" clean = temp_path / f"clean_{test}.ipynb" @@ -74,6 +76,31 @@ def test_ignore_metadata(temp_path: Path) -> None: assert clean_bytes != dirty_bytes +def test_notebook_id(temp_path: Path, monkeypatch: MonkeyPatch) -> None: + test = "id" + dirty = temp_path / f"dirty_{test}.ipynb" + clean = temp_path / f"clean_{test}.ipynb" + + ids = [ + "3d183bd1-509f-4758-9f2d-db94b23c58f9", + "66cdc779-4931-4306-881a-4bf30cb0fdbb", + "5cbb8154-79ee-4290-953c-89a89b4276b7", + ] + iterator = iter(ids) + monkeypatch.setattr(uuid, "uuid4", lambda: next(iterator)) + + clean_single_notebook(dirty) + clean_bytes = load_file(clean) + dirty_bytes = load_file(dirty) + assert clean_bytes == dirty_bytes + + +def test_notebook_no_overwrite_ids(temp_path: Path) -> None: + test = "id" + clean = temp_path / f"clean_{test}.ipynb" + assert not clean_single_notebook(clean) + + def test_empty_notebook(capsys: CaptureFixture[str], temp_path: Path) -> None: dirty = temp_path / "dirty_empty.ipynb"