diff --git a/docs/Untitled42.ipynb b/docs/Untitled42.ipynb
index ccebb40..3ed08c3 100644
--- a/docs/Untitled42.ipynb
+++ b/docs/Untitled42.ipynb
@@ -291,17 +291,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "i was printed from and my name is __main__\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from importnb import get_ipython\n",
     "from pathlib import Path\n",
@@ -333,7 +325,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -350,7 +342,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -367,7 +359,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -384,17 +376,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "i'm only show when cell magics are active.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%%python\n",
     "print(\"i'm only show when cell magics are active.\")"
@@ -402,7 +386,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -430,7 +414,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -443,7 +427,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -472,7 +456,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -482,7 +466,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -505,20 +489,29 @@
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "if get_ipython() and not where:\n",
     "    from importnb import loaders\n",
     "    display(loaders.Json.load_file(\"Untitled42.ipynb\"))"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:root] *",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "conda-root-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -530,11 +523,13 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.11.3"
   },
-  "vscode": {
-   "interpreter": {
-    "hash": "6624ee388a1c346f3d0811b591fe9e170807496b8a5fea1a5f5986a819dc2334"
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
    }
   }
  },
@@ -571,9 +566,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:root] *",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "conda-root-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -585,11 +580,13 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.11.3"
   },
-  "vscode": {
-   "interpreter": {
-    "hash": "6624ee388a1c346f3d0811b591fe9e170807496b8a5fea1a5f5986a819dc2334"
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
    }
   }
  },
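The notebook changes above only normalize saved state: execution counts are nulled, stream outputs are dropped, and the conda-specific kernelspec and vscode hash are replaced with a stock `python3` kernelspec and empty widget state. A minimal sketch of the same cleanup using only the stdlib (the helper name `strip_outputs` is hypothetical, not part of this change):

```python
import json
from pathlib import Path

def strip_outputs(path):
    # null execution counts and drop outputs so the notebook diffs cleanly
    nb = json.loads(Path(path).read_text())
    for cell in nb.get("cells", []):
        if cell.get("cell_type") == "code":
            cell["execution_count"] = None
            cell["outputs"] = []
    Path(path).write_text(json.dumps(nb, indent=1))

strip_outputs("docs/Untitled42.ipynb")
```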
"application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 } } }, diff --git a/docs/test_cli.py b/docs/test_cli.py index 6111e0e..1cd6607 100644 --- a/docs/test_cli.py +++ b/docs/test_cli.py @@ -2,7 +2,7 @@ from subprocess import check_call from sys import executable, path, version_info -from pytest import importorskip +from pytest import importorskip, mark from importnb import Notebook from importnb import __version__ as importnb_version @@ -92,7 +92,8 @@ def test_module(): """ -@cli_test("-m importnb -c '{}'") +@mark.xfail +@cli_test("""-m importnb -c '{"cells": []}'""") def test_empty_code(): """""" diff --git a/docs/test_importnb.py b/docs/test_importnb.py index 5ffc540..632a38a 100644 --- a/docs/test_importnb.py +++ b/docs/test_importnb.py @@ -124,7 +124,8 @@ def test_load_file(clean, ref): def test_load_code(clean): - assert Notebook.load_code(""), "can't load an empty notebook" + with raises(BaseException): + assert Notebook.load_code(""), "can't load an empty notebook" body = Path("docs/Untitled42.ipynb").read_text() m = Notebook.load_code(body) cant_reload(m) diff --git a/docs/test_parser.py b/docs/test_parser.py new file mode 100644 index 0000000..5217b7d --- /dev/null +++ b/docs/test_parser.py @@ -0,0 +1,43 @@ +from json import dumps +from pathlib import Path +from hypothesis_jsonschema import from_schema +from hypothesis import given +from pytest import mark, raises +from importnb.decoder import InvalidNotebook, parse_nbformat + +HERE = Path(__file__).parent + + +@mark.parametrize("file", [HERE / "test_in_notebook.ipynb"]) +def test_parse_notebook(file): + parse_nbformat(file.read_text()) + + +@mark.parametrize("file", [HERE / "ascript.ipy"]) +def test_parse_notebook_fail(file): + with raises(InvalidNotebook): + parse_nbformat(file.read_text()) + + +@given(from_schema(dict(dict(type=["object", "string"])))) +def test_random_objects_fail(object): + with raises(InvalidNotebook): + return parse_nbformat(dumps(object)) + + +@given( + from_schema( + dict( + properties=dict( + cells=dict( + type="array", + items=dict(dict(type=["object", "string"])), + ), + additionalProperties=False, + ) + ) + ) +) +def test_random_cells_fail(object): + with raises(InvalidNotebook): + return parse_nbformat(dumps(object)) diff --git a/pyproject.toml b/pyproject.toml index fb42e5f..27dd425 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,6 +91,7 @@ dependencies = [ "tomli", "ruamel.yaml", "tomli_w", + "hypothesis-jsonschema" ] [[tool.hatch.envs.test.matrix]] diff --git a/src/importnb/__init__.py b/src/importnb/__init__.py index 40f2709..9b8d697 100644 --- a/src/importnb/__init__.py +++ b/src/importnb/__init__.py @@ -23,10 +23,7 @@ def get_ipython(force=True): return None -import builtins - from ._version import __version__ from .entry_points import imports from .loader import Notebook, reload - -builtins.true, builtins.false, builtins.null = True, False, None +from .entry_points import imports diff --git a/src/importnb/decoder.py b/src/importnb/decoder.py index fa48bb0..80f703e 100644 --- a/src/importnb/decoder.py +++ b/src/importnb/decoder.py @@ -1,7 +1,10 @@ +import collections import json import linecache +import operator import textwrap -from functools import partial +from io import StringIO +from functools import lru_cache, partial def quote(object, *, quotes="'''"): @@ -10,10 +13,41 @@ def quote(object, *, quotes="'''"): return quotes + object + "\n" + quotes -from ._json_parser import Lark_StandAlone, Transformer, Tree +from ._json_parser 
diff --git a/src/importnb/decoder.py b/src/importnb/decoder.py
index fa48bb0..80f703e 100644
--- a/src/importnb/decoder.py
+++ b/src/importnb/decoder.py
@@ -1,7 +1,10 @@
+import collections
 import json
 import linecache
+import operator
 import textwrap
-from functools import partial
+from io import StringIO
+from functools import partial
 
 
 def quote(object, *, quotes="'''"):
@@ -10,10 +13,39 @@ def quote(object, *, quotes="'''"):
     return quotes + object + "\n" + quotes
 
 
-from ._json_parser import Lark_StandAlone, Transformer, Tree
+from ._json_parser import (
+    Lark_StandAlone,
+    Transformer as _Lark_Transformer,
+    UnexpectedCharacters,
+    UnexpectedToken,
+)
 
+Cell = collections.namedtuple("cell", "lineno cell_type source")
+Cell_getter = operator.itemgetter(*Cell._fields)
+
+
+class InvalidNotebook(BaseException):
+    """The notebook does not conform to the nbformat."""
+
+
+def parse_nbformat(source: str, parser=None):
+    if parser is None:
+        parser = LineCacheNotebookDecoder()._parser
+    try:
+        return parser.parse(source)
+    except (UnexpectedCharacters, UnexpectedToken) as e:
+        raise InvalidNotebook from e
+
+
+class _Transformer(_Lark_Transformer):
+    """a lark transformer for a grammar designed specifically for the nbformat.
+
+    it tokenizes notebook documents parsed with the nbformat-specific grammar.
+    features of the notebook are captured as nodes in the lexical analysis,
+    then massaged into a line-for-line representation of the json document
+    as source code.
+    """
 
-class Transformer(Transformer):
     def __init__(
         self,
         markdown=quote,
@@ -26,53 +58,53 @@ def __init__(
         for key in ("markdown", "code", "raw"):
             setattr(self, "transform_" + key, locals().get(key))
 
+    def nb(self, s):
+        # hide the nb node from the tree.
+        return s[0]
+
+    def cells(self, s):
+        # recombine the tokenized json document as line for line source code.
+        line, buffer = 0, StringIO()
+        for t in filter(bool, s):
+            # write any missing preceding lines.
+            buffer.write("\n" * (t.lineno - 2 - line))
+
+            # transform the source based on the cell_type.
+            body = getattr(self, f"transform_{t.cell_type}")("".join(t.source))
+            buffer.write(body)
+
+            if not body.endswith("\n"):
+                buffer.write("\n")
+                line += 1
+
+            # increment the line numbers that have been visited.
+            line += body.count("\n")
+        return buffer.getvalue()
+
+    def cell(self, s):
+        # we can't know the order of the cell_type and the source.
+        # we tokenize the cell parts as a dictionary IFF there is source.
+        data = dict(collections.ChainMap(*s))
+        if "source" in data:
+            return Cell(*Cell_getter(data))
+        # the None result is filtered out before the cells are combined.
+        return None
+
+    def cell_type(self, s):
+        # every cell needs a cell_type to dispatch the transformers.
+        # remove the quotes around the string.
+        return dict(cell_type=s[0][1][1:-1])
+
+    def source(self, s):
+        # return the line number and source lines.
+        return dict(lineno=s[0][0], source=[json.loads(x) for _, x in s]) if s else {}
+
     def string(self, s):
-        return s[0].line, json.loads(s[0])
-
-    def item(self, s):
-        key = s[0][-1]
-        if key == "cells":
-            if not isinstance(s[-1], Tree):
-                return self.render(list(map(dict, s[-1])))
-        elif key in {"source", "text"}:
-            return key, s[-1]
-        elif key == "cell_type":
-            if isinstance(s[-1], tuple):
-                return key, s[-1][-1]
-
-    def array(self, s):
-        if s:
-            return s
-        return []
-
-    def object(self, s):
-        return [x for x in s if x is not None]
-
-    def render_one(self, kind, lines):
-        s = "".join(lines)
-        if not s.endswith(("\n",)):
-            s += "\n"
-        return getattr(self, f"transform_{kind}")(s)
-
-    def render(self, x):
-        body = []
-        for token in x:
-            t = token.get("cell_type")
-            try:
-                s = token["source"]
-            except KeyError:
-                s = token.get("text")
-            if s:
-                if not isinstance(s, list):
-                    s = [s]
-                l, lines = s[0][0], [x[1] for x in s]
-                body.extend([""] * (l - len(body)))
-                lines = self.render_one(t, lines)
-                body.extend(lines.splitlines())
-        return "\n".join(body + [""])
-
-
-class LineCacheNotebookDecoder(Transformer):
+        # capture the line number of string values.
+        return s[0].line, str(s[0])
+
+
+class LineCacheNotebookDecoder(_Transformer):
     def __init__(
         self,
         markdown=quote,
@@ -85,13 +117,14 @@ def __init__(
         for key in ("markdown", "code", "raw"):
             setattr(self, "transform_" + key, locals().get(key))
 
+        self._parser = Lark_StandAlone(transformer=self)
+
     def source_from_json_grammar(self, object):
-        return Lark_StandAlone(transformer=self).parse(object)
+        return parse_nbformat(object, self._parser)
 
     def decode(self, object, filename):
-        s = self.source_from_json_grammar(object)
-        if s:
-            source = s[0]
+        source = self.source_from_json_grammar(object)
+        if source:
             linecache.updatecache(filename)
             if filename in linecache.cache:
                 linecache.cache[filename] = (
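The point of the `cells` transformer is that each generated source line stays aligned with its position in the JSON file, so `linecache` (and therefore tracebacks) resolve to the right notebook line. A sketch of that property, assuming `parse_nbformat` as added above; `indent=1` mirrors how nbformat files are laid out on disk:

```python
from json import dumps
from importnb.decoder import parse_nbformat

nb = dumps(
    {
        "cells": [
            {"cell_type": "markdown", "source": ["# a title"]},
            {"cell_type": "code", "source": ["x = 1\n", "y = x + 1"]},
        ]
    },
    indent=1,
)
source = parse_nbformat(nb)
# print the json and the generated source side by side to compare positions
for no, (left, right) in enumerate(zip(nb.splitlines(), source.splitlines()), 1):
    print(f"{no:>3} {left:<40} {right}")
```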
diff --git a/src/importnb/json.g b/src/importnb/json.g
index e453395..44f5661 100644
--- a/src/importnb/json.g
+++ b/src/importnb/json.g
@@ -1,29 +1,60 @@
+
 // a lark grammar for parsing notebooks into source
 // this grammar extracts a subset of nbformat (cells, cell_type, source)
 // to generate a line for line reconstruction of the source.
 
-?start: value
-?value: object
-      | array
-      | string
-      | SIGNED_NUMBER
-      | "true"
-      | "false"
-      | "null"
+%import common.ESCAPED_STRING -> STRING
+%import common.SIGNED_NUMBER -> _NUMBER
+%import common.WS
+%ignore WS
 
+_STRING: STRING
 
-COLON: ":"
-array : "[" [value ("," value)*] "]"
-object : "{" [_items] "}"
+// rules for the top level notebook.
+// notebook files are json encoded and backed by a json schema.
+// the schema allows us to create a more specific grammar,
+// one aware of the schema that validates sooner in the parsing stage.
+?start: nb
+?nb: "{" (_nb ("," _nb)*)+ "}"
+_nb: cells | _metadata | _nbformat | _nbformat_minor
+_cells: "[" cell ("," cell)* "]"
+cells: _pair{ "cells", _cells }
+_metadata: _pair{"metadata", _object}
+_nbformat: _pair{"nbformat", _NUMBER}
+_nbformat_minor: _pair{"nbformat_minor", _NUMBER}
 
+cell: "{" _cell ("," _cell)* "}"
 
-item: string COLON value
-
-_items: item ("," item)*
+// rules for the notebook cells
+_cell: source | _outputs | _attachments | _id | cell_type | _execution_count | _metadata
+_source: "[" (string ("," string)*)? "]" | string
+source: _pair{"source", _source}
+_outputs: _pair{"outputs", _array}
+_attachments: _pair{"attachments", _object}
+_id: _pair{"id", _string}
+cell_type: _pair{"cell_type", string}
+_execution_count_: "null" | _NUMBER
+_execution_count: _pair{"execution_count", _execution_count_}
+
+// terminals and rules for parsing generic json.
+_QUOTE: /"/
+_COLON: ":"
+_array : "[" [_any ("," _any)*] "]"
+_object : "{" _items* "}"
+_item: _string _COLON _any
+_items: _item ("," _item)*
+string: STRING
+_string: _STRING
+_T: "true"
+_F: "false"
+_N: "null"
+_any: _object
+    | _array
+    | _string
+    | _NUMBER
+    | _T
+    | _F
+    | _N
+
+_key{key}: _QUOTE key _QUOTE
+_pair{key, target}: _key{key} _COLON target
 
-string : ESCAPED_STRING
-
-%import common.ESCAPED_STRING
-%import common.SIGNED_NUMBER
-%import common.WS
-
-%ignore WS
\ No newline at end of file
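The grammar leans on lark rule templates: `_pair{key, target}` expands to a quoted key, a colon, and the target rule, so each nbformat field gets a named rule without repeating the json plumbing. A sketch of loading the grammar interactively with the `lark` package (the standalone `_json_parser` module is generated from this same file; the file path and the earley default used here are assumptions):

```python
from pathlib import Path
from lark import Lark

grammar = Path("src/importnb/json.g").read_text()
parser = Lark(grammar)  # picks up ?start: nb, earley parser by default

tree = parser.parse('{"cells": [{"cell_type": "code", "source": ["1 + 1"]}]}')
print(tree.pretty())
```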
"]" | string +source: _pair{"source", _source} +_outputs: _pair{"outputs", _array} +_attachments: _pair{"attachments", _object} +_id: _pair{"id", _string} +cell_type: _pair{"cell_type", string} +_execution_count_: "null" | _NUMBER +_execution_count: _pair{"execution_count", _execution_count_} + +// terminals and rules for parsing generic json. +_QUOTE: /"/ +_COLON: ":" +_array : "[" [_any ("," _any)*] "]" +_object : "{" _items* "}" +_item: _string _COLON _any +_items: _item ("," _item)* +string: STRING +_string: _STRING +_T: "true" +_F: "false" +_N: "null" +_any: _object + | _array + | _string + | _NUMBER + | _T + | _F + | _N + +_key{key}: _QUOTE key _QUOTE +_pair{key, target}: _key{key} _COLON target -string : ESCAPED_STRING - -%import common.ESCAPED_STRING -%import common.SIGNED_NUMBER -%import common.WS - -%ignore WS \ No newline at end of file diff --git a/src/importnb/loader.py b/src/importnb/loader.py index 7063a88..3c64d83 100644 --- a/src/importnb/loader.py +++ b/src/importnb/loader.py @@ -105,9 +105,9 @@ def finder(self): """Generate a new finder based on the state of an existing loader""" return self.include_fuzzy_finder and FuzzyFinder or FileFinder - def raw_to_source(self, source): - """Transform a string from a raw file to python source.""" - if self.path and self.path.endswith(".ipynb"): + def raw_to_source(self, source, force=False): + """transform a string from a raw file to python source.""" + if force or self.path and self.path.endswith(".ipynb"): # when we encounter notebooks we apply different transformers to the diff cell types return LineCacheNotebookDecoder( code=self.code, @@ -375,9 +375,11 @@ def load_code(cls, code, argv=None, mod_name=None, script_name=None, main=False) self = cls() name = main and "__main__" or mod_name or "" + self.path = "" + script_name = script_name or name return _dict_module( - _run_module_code(self.raw_to_source(code), mod_name=name, script_name=script_name), + _run_module_code(self.raw_to_source(code, True), mod_name=name, script_name=script_name) ) @staticmethod @@ -448,19 +450,6 @@ def source_to_nodes(self, source, path="", *, _optimize=-1): nodes = self.visit(nodes) return ast.fix_missing_locations(nodes) - def raw_to_source(self, source): - """Transform a string from a raw file to python source.""" - if self.path and self.path.endswith(".ipynb"): - # when we encounter notebooks we apply different transformers to the diff cell types - return LineCacheNotebookDecoder( - code=self.code, - raw=self.raw, - markdown=self.markdown, - ).decode(source, self.path) - - # for a normal file we just apply the code transformer. - return self.code(source) - def _dict_module(ns): m = ModuleType(ns.get("__name__"), ns.get("__doc__"))