diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index 3b9775e..feb4cb9 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -32,6 +32,42 @@ jobs:
           # code quality check, stop the build for any errors
           ruff check . --show-fixes --exit-non-zero-on-fix
 
+  test-minimal:
+    needs: fmt
+    name: minimal tests
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install pytest
+        run: |
+          python -m pip install pytest pytest-cov pytest-regressions pytest-xdist nbmake
+      - name: Install sharrow
+        run: |
+          python -m pip install .
+      - name: Initial simple tests
+        # tests that sharrow can be imported and that categorical tests can be run
+        run: |
+          python -m pytest sharrow/tests/test_categorical.py
+      - name: Install openmatrix
+        run: |
+          python -m pip install openmatrix
+      - name: Dataset tests
+        # tests that the datasets can be read and that the tests can be run
+        run: |
+          python -m pytest sharrow/tests/test_datasets.py
+      - name: Install zarr and dask-diagnostics
+        run: |
+          python -m pip install zarr "dask[diagnostics]"
+      - name: More complete test with pytest
+        run: |
+          python -m pytest -v --disable-warnings sharrow/tests
+
   test:
     needs: fmt
     name: ${{ matrix.os }} py${{ matrix.python-version }}
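Note on the test-minimal job: it installs optional dependencies in stages (pytest tooling only, then openmatrix, then zarr and dask) and runs a progressively larger slice of the test suite after each install. For the early stages to pass, any test touching a not-yet-installed package has to skip rather than fail. A minimal sketch of the guard this relies on; pytest.importorskip is the real pytest helper, while the test name and body here are illustrative only:

    import pytest


    def test_needs_openmatrix():
        # importorskip returns the module when it is importable and marks
        # the test as skipped (not failed) when it is not.
        omx = pytest.importorskip("openmatrix")
        assert omx is not None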
diff --git a/docs/walkthrough/encoding.ipynb b/docs/walkthrough/encoding.ipynb
index 4c9786d..8e1046b 100644
--- a/docs/walkthrough/encoding.ipynb
+++ b/docs/walkthrough/encoding.ipynb
@@ -63,9 +63,22 @@
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "id": "4",
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TEST\n",
+    "import pytest\n",
+    "\n",
+    "pytest.importorskip(\"sparse\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5",
+   "metadata": {},
    "source": [
     "## Example Data\n",
     "\n",
@@ -78,7 +91,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "5",
+   "id": "6",
    "metadata": {},
    "source": [
     "We'll load them as a multi-dimensional `xarray.Dataset` — or, more exactly, a \n",
@@ -89,7 +102,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6",
+   "id": "7",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -99,7 +112,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "7",
+   "id": "8",
    "metadata": {},
    "source": [
     "Because sharrow uses the `xarray.Dataset` format to work with data, individual \n",
@@ -113,7 +126,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8",
+   "id": "9",
    "metadata": {},
    "source": [
     "## Fixed Point Encoding\n",
@@ -144,7 +157,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9",
+   "id": "10",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -153,7 +166,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "10",
+   "id": "11",
    "metadata": {},
    "source": [
     "The distance data in the skims is a great candidate for fixed point\n",
@@ -163,7 +176,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "11",
+   "id": "12",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -172,7 +185,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "12",
+   "id": "13",
    "metadata": {},
    "source": [
     "The data are all small(ish) values with two decimal point fixed\n",
@@ -184,7 +197,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "13",
+   "id": "14",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -193,7 +206,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "14",
+   "id": "15",
    "metadata": {},
    "source": [
     "That's a really small range because this is only test data. But even \n",
@@ -207,7 +220,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "15",
+   "id": "16",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -218,7 +231,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "16",
+   "id": "17",
    "metadata": {
     "tags": [
      "remove-cell"
@@ -241,7 +254,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "17",
+   "id": "18",
    "metadata": {},
    "source": [
     "We can apply that function for any number of variables in the skims, and\n",
@@ -251,7 +264,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "18",
+   "id": "19",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -260,7 +273,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "19",
+   "id": "20",
    "metadata": {},
    "source": [
     "To manage the digital encodings across an entire dataset, sharrow implements\n",
@@ -271,7 +284,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "20",
+   "id": "21",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -282,7 +295,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "21",
+   "id": "22",
    "metadata": {},
    "source": [
     "And you can review the encodings for every variable in the dataset like this:"
@@ -291,7 +304,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "22",
+   "id": "23",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -301,7 +314,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "23",
+   "id": "24",
    "metadata": {
     "tags": [
      "remove-cell"
@@ -319,7 +332,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "24",
+   "id": "25",
    "metadata": {},
    "source": [
     "To demonstrate that the encoding works transparently with a `Flow`,\n",
@@ -332,7 +345,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "25",
+   "id": "26",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -352,7 +365,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "26",
+   "id": "27",
    "metadata": {},
    "source": [
     "We can do the same for the encoded skims, and we get exactly the\n",
@@ -362,7 +375,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "27",
+   "id": "28",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -382,7 +395,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "28",
+   "id": "29",
    "metadata": {
     "tags": [
      "remove-cell"
@@ -396,7 +409,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "29",
+   "id": "30",
    "metadata": {},
    "source": [
     "## Dictionary Encoding\n",
@@ -423,7 +436,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "30",
+   "id": "31",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -432,7 +445,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "31",
+   "id": "32",
    "metadata": {},
    "source": [
     "We can see various fares applied at different time periods if we\n",
@@ -442,7 +455,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "32",
+   "id": "33",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -451,7 +464,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "33",
+   "id": "34",
    "metadata": {},
    "source": [
     "Once encoded, the array itself only contains offset pointers (small integers),\n",
@@ -461,7 +474,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "34",
+   "id": "35",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -472,7 +485,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "35",
+   "id": "36",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -482,7 +495,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "36",
+   "id": "37",
    "metadata": {
     "tags": [
      "remove-cell"
@@ -510,7 +523,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "37",
+   "id": "38",
    "metadata": {},
    "source": [
     "If we want to recover the original data for analysis (other than in\n",
@@ -520,7 +533,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "38",
+   "id": "39",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -530,7 +543,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "39",
+   "id": "40",
    "metadata": {
     "tags": [
      "remove-cell"
@@ -544,7 +557,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "40",
+   "id": "41",
    "metadata": {},
    "source": [
     "## Joint Dict Encoding\n",
@@ -562,7 +575,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "41",
+   "id": "42",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -581,7 +594,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "42",
+   "id": "43",
    "metadata": {},
    "source": [
     "A unique name is automatically generated for the join when `joint_dict` is set\n",
@@ -594,7 +607,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "43",
+   "id": "44",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -606,7 +619,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "44",
+   "id": "45",
    "metadata": {},
    "source": [
     "The resulting dataset adds a variable for each created group, which contains \n",
@@ -617,7 +630,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "45",
+   "id": "46",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -626,7 +639,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "46",
+   "id": "47",
    "metadata": {},
    "source": [
     "Skims encoded in this manner can be fed into sharrow and will compile and return \n",
@@ -640,7 +653,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "47",
+   "id": "48",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -674,7 +687,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "48",
+   "id": "49",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -750,7 +763,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "49",
+   "id": "50",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -766,7 +779,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "50",
+   "id": "51",
    "metadata": {
     "tags": []
    },
@@ -784,7 +797,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "51",
+   "id": "52",
    "metadata": {
     "tags": []
    },
@@ -801,7 +814,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "52",
+   "id": "53",
    "metadata": {
     "tags": []
    },
@@ -812,7 +825,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "53",
+   "id": "54",
    "metadata": {},
    "source": [
     "We'll then create a Dataset using construct."
@@ -821,7 +834,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "54",
+   "id": "55",
    "metadata": {
     "tags": []
    },
@@ -833,7 +846,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "55",
+   "id": "56",
    "metadata": {},
    "source": [
     "Note that the \"income\" variable remains an integer as expected, but the \"income_grp\" variable, \n",
@@ -846,7 +859,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "56",
+   "id": "57",
    "metadata": {
     "tags": []
    },
@@ -858,7 +871,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "57",
+   "id": "58",
    "metadata": {
     "tags": []
    },
@@ -876,7 +889,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "58",
+   "id": "59",
    "metadata": {},
    "source": [
     "If you try to make the return trip to a pandas DataFrame using the regular \n",
@@ -887,7 +900,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "59",
+   "id": "60",
    "metadata": {
     "tags": []
    },
@@ -898,7 +911,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "60",
+   "id": "61",
    "metadata": {},
    "source": [
     "But, if you use the `single_dim` accessor on the dataset provided by sharrow,\n",
@@ -908,7 +921,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "61",
+   "id": "62",
    "metadata": {
     "tags": []
    },
@@ -920,7 +933,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "62",
+   "id": "63",
    "metadata": {
     "tags": []
    },
@@ -932,7 +945,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "63",
+   "id": "64",
    "metadata": {},
    "source": [
     "Note that this automatic handling of categorical data only applies when constructing\n",
@@ -944,7 +957,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "64",
+   "id": "65",
    "metadata": {
     "tags": []
    },
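Note on the encoding.ipynb changes: apart from the resulting cell renumbering, the only substantive change is the new leading test cell, which calls pytest.importorskip("sparse") so the notebook run can skip cleanly when sparse is not installed. Since the walkthrough itself teaches fixed point encoding, a hedged numeric sketch of that idea follows; the variable names are illustrative and this is not sharrow's API:

    import numpy as np

    # Fixed point encoding: store a float array as small integers plus a
    # scale factor, and decode by multiplying back.
    distance = np.array([0.12, 0.37, 6.55], dtype=np.float32)
    scale = 0.01  # two decimal digits of fixed precision
    encoded = np.round(distance / scale).astype(np.int16)  # [12, 37, 655]
    decoded = encoded * scale  # approximately the original values
    assert np.allclose(decoded, distance, atol=scale / 2)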
"cell_type": "code", "execution_count": null, - "id": "38", + "id": "39", "metadata": {}, "outputs": [], "source": [ @@ -530,7 +543,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39", + "id": "40", "metadata": { "tags": [ "remove-cell" @@ -544,7 +557,7 @@ }, { "cell_type": "markdown", - "id": "40", + "id": "41", "metadata": {}, "source": [ "## Joint Dict Encoding\n", @@ -562,7 +575,7 @@ { "cell_type": "code", "execution_count": null, - "id": "41", + "id": "42", "metadata": {}, "outputs": [], "source": [ @@ -581,7 +594,7 @@ }, { "cell_type": "markdown", - "id": "42", + "id": "43", "metadata": {}, "source": [ "A unique name is automatically generated for the join when `joint_dict` is set\n", @@ -594,7 +607,7 @@ { "cell_type": "code", "execution_count": null, - "id": "43", + "id": "44", "metadata": {}, "outputs": [], "source": [ @@ -606,7 +619,7 @@ }, { "cell_type": "markdown", - "id": "44", + "id": "45", "metadata": {}, "source": [ "The resulting dataset adds a variable for each created group, which contains \n", @@ -617,7 +630,7 @@ { "cell_type": "code", "execution_count": null, - "id": "45", + "id": "46", "metadata": {}, "outputs": [], "source": [ @@ -626,7 +639,7 @@ }, { "cell_type": "markdown", - "id": "46", + "id": "47", "metadata": {}, "source": [ "Skims encoded in this manner can be fed into sharrow and will compile and return \n", @@ -640,7 +653,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47", + "id": "48", "metadata": {}, "outputs": [], "source": [ @@ -674,7 +687,7 @@ { "cell_type": "code", "execution_count": null, - "id": "48", + "id": "49", "metadata": {}, "outputs": [], "source": [ @@ -750,7 +763,7 @@ { "cell_type": "code", "execution_count": null, - "id": "49", + "id": "50", "metadata": {}, "outputs": [], "source": [ @@ -766,7 +779,7 @@ }, { "cell_type": "markdown", - "id": "50", + "id": "51", "metadata": { "tags": [] }, @@ -784,7 +797,7 @@ { "cell_type": "code", "execution_count": null, - "id": "51", + "id": "52", "metadata": { "tags": [] }, @@ -801,7 +814,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52", + "id": "53", "metadata": { "tags": [] }, @@ -812,7 +825,7 @@ }, { "cell_type": "markdown", - "id": "53", + "id": "54", "metadata": {}, "source": [ "We'll then create a Dataset using construct." 
diff --git a/sharrow/relationships.py b/sharrow/relationships.py
index e86eb66..b03b08a 100644
--- a/sharrow/relationships.py
+++ b/sharrow/relationships.py
@@ -955,7 +955,7 @@ def dims(self):
         """Mapping from dimension names to lengths across all dataset nodes."""
         dims = {}
         for _k, v in self.subspaces_iter():
-            for name, length in v.dims.items():
+            for name, length in v.sizes.items():
                 if name in dims:
                     if dims[name] != length:
                         raise ValueError(
@@ -978,7 +978,7 @@ def dims_detail(self):
         s = ""
         for k, v in self.subspaces_iter():
             s += f"\n{k}:"
-            for name, length in v.dims.items():
+            for name, length in v.sizes.items():
                 s += f"\n   - {name}: {length}"
         return s[1:]
 
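Note on the relationships.py change: on an xarray Dataset, sizes is the stable mapping from dimension name to length, while dims is transitioning to a set-like collection of names only, so iterating name/length pairs belongs on .sizes. A small self-contained sketch (the dimension names here are illustrative):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"dist": (("otaz", "dtaz"), np.zeros((3, 4)))})
    # .sizes yields name -> length pairs; iterating .dims.items() on a
    # Dataset warns in recent xarray releases.
    for name, length in ds.sizes.items():
        print(name, length)  # prints: otaz 3, then dtaz 4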
diff --git a/sharrow/shared_memory.py b/sharrow/shared_memory.py
index d2b9e23..c8825b1 100644
--- a/sharrow/shared_memory.py
+++ b/sharrow/shared_memory.py
@@ -7,7 +7,6 @@
 import dask
 import dask.array as da
 import numpy as np
-import sparse
 import xarray as xr
 
 try:
@@ -15,6 +14,10 @@
 except ImportError:
     ShareableList, SharedMemory = None, None
 
+try:
+    import sparse
+except ImportError:
+    sparse = None
 
 __GLOBAL_MEMORY_ARRAYS = {}
 __GLOBAL_MEMORY_LISTS = {}
@@ -283,7 +286,7 @@ def to_shared_memory(self, key=None, mode="r+", _dupe=True):
 
         def emit(k, a, is_coord):
             nonlocal names, wrappers, sizes, position
-            if isinstance(a.data, sparse.GCXS):
+            if sparse is not None and isinstance(a.data, sparse.GCXS):
                 wrappers.append(
                     {
                         "sparse": True,
@@ -452,6 +455,8 @@ def from_shared_memory(cls, key, own_data=False, mode="r+"):
             nbytes = t.pop("nbytes")
             is_sparse = t.pop("sparse", False)
             if is_sparse:
+                if sparse is None:
+                    raise ImportError("sparse is not installed")
                 _size_d = t.pop("data.nbytes")
                 _size_i = t.pop("indices.nbytes")
                 _size_p = t.pop("indptr.nbytes")
diff --git a/sharrow/sparse.py b/sharrow/sparse.py
index d96035f..732735c 100644
--- a/sharrow/sparse.py
+++ b/sharrow/sparse.py
@@ -3,10 +3,18 @@
 import numba as nb
 import numpy as np
 import pandas as pd
-import scipy.sparse
-import sparse
 import xarray as xr
 
+try:
+    import sparse
+except ImportError:
+    sparse = None
+
+try:
+    import scipy.sparse
+except ImportError:
+    scipy = None
+
 
 @nb.njit
 def _get_idx(indices, indptr, data, i, j):
@@ -67,6 +75,8 @@
 
 class SparseArray2D:
     def __init__(self, i, j, data, shape=None):
+        if scipy is None:
+            raise ImportError("scipy is not installed")
         if isinstance(data, scipy.sparse.csr_matrix):
             self._sparse_data = data
         else:
@@ -149,6 +159,9 @@ def apply_mapper(x):
         i_ = i
         j_ = j
 
+        if sparse is None:
+            raise ImportError("sparse is not installed")
+
         sparse_data = sparse.GCXS(
             sparse.COO((i_, j_), data, shape=shape), compressed_axes=(0,)
         )
diff --git a/sharrow/tests/test_relationships.py b/sharrow/tests/test_relationships.py
index 7a76dfb..3bced69 100644
--- a/sharrow/tests/test_relationships.py
+++ b/sharrow/tests/test_relationships.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 import pandas as pd
+import pytest
 import xarray as xr
 from numpy.random import SeedSequence, default_rng
 from pytest import approx, fixture, mark, raises
@@ -216,6 +217,8 @@ def test_shared_data_reversible_by_label(dataframe_regression):
 
 
 def test_with_2d_base(dataframe_regression):
+    pytest.importorskip("scipy", minversion="0.16")
+
     data = example_data.get_data()
     skims = data["skims"]
     households = data["hhs"]
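Note on the optional-dependency handling: shared_memory.py and sparse.py now share the same guarded-import pattern, so sparse and scipy are imported when available, bound to None otherwise, and an ImportError is raised only at the point of actual use. A condensed standalone sketch of the pattern; the helper function is hypothetical:

    try:
        import sparse
    except ImportError:
        sparse = None


    def make_gcxs(coo_array):
        # Fail at first use, not at module import, when sparse is absent.
        if sparse is None:
            raise ImportError("sparse is not installed")
        # GCXS with compressed_axes=(0,) gives a CSR-like compressed layout.
        return sparse.GCXS(coo_array, compressed_axes=(0,))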