diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 1410c0f5..00000000 --- a/.flake8 +++ /dev/null @@ -1,22 +0,0 @@ -# Taken directly from https://github.com/ambv/black/blob/master/.flake8 -[flake8] -ignore = - C408, # ignore temporarily https://github.com/adamchainz/flake8-comprehensions/pull/282 - E203, - E266, # too many leading # for block comments - E501, # line too long, conflict with black - E731, - W503, # linebreak before binary operator, conflict with black - C901, - D104, - D100, - D105, # missing docstring in magic method, unecessary - D205, - D400 -max-line-length = 88 -max-complexity = 18 -select = B,C,E,F,W,T4,B9,D -enable-extensions = flake8-docstrings -per-file-ignores = - tests/**:D101,D102,D103,D401 -docstring-convention = numpy diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b75d0c33..11983790 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,46 +1,16 @@ repos: - - repo: https://github.com/Quantco/pre-commit-mirrors-black - rev: 23.9.1 - hooks: - - id: black-conda - additional_dependencies: [flake8-docstrings, flake8-rst-docstrings] - args: - - --safe - - --target-version=py39 - - repo: https://github.com/Quantco/pre-commit-mirrors-flake8 - rev: 6.1.0 - hooks: - - id: flake8-conda - additional_dependencies: [ - -c, - conda-forge, - flake8-bugbear=21.4.3, - flake8-builtins=1.5.3, - flake8-comprehensions=3.5.0, - flake8-docstrings=1.6.0, - flake8-print=4.0.0, - pep8-naming=0.11.1, - python<3.12, - ] - - repo: https://github.com/Quantco/pre-commit-mirrors-isort - rev: 5.12.0 - hooks: - - id: isort-conda - additional_dependencies: [toml] - - repo: https://github.com/Quantco/pre-commit-mirrors-mypy - rev: "1.5.1" - hooks: - - id: mypy-conda - additional_dependencies: - - python=3.9 - - repo: https://github.com/Quantco/pre-commit-mirrors-pyupgrade - rev: 3.15.0 - hooks: - - id: pyupgrade-conda - args: [--py39-plus] - - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint - rev: 0.15.0 - 
hooks: - - id: cython-lint-conda - args: [--no-pycodestyle] - - id: double-quote-cython-strings-conda + - repo: https://github.com/Quantco/pre-commit-mirrors-ruff + rev: 0.1.3 + hooks: + - id: ruff-conda + - id: ruff-format-conda + - repo: https://github.com/Quantco/pre-commit-mirrors-mypy + rev: "1.5.1" + hooks: + - id: mypy-conda + - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint + rev: 0.15.0 + hooks: + - id: cython-lint-conda + args: [--no-pycodestyle] + - id: double-quote-cython-strings-conda diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 730db78c..deae25e6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,13 @@ Changelog ========= +Unreleased +---------- + +**Other changes:** + +- Refactored the pre-commit hooks to use ruff. + 3.1.13 - 2023-10-17 ------------------- diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 00173ab2..b6eac9ab 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -39,7 +39,7 @@ conda install -y pre-commit git clone git@github.com:Quantco/tabmat.git cd tabmat -# Set up our pre-commit hooks for black, mypy, isort and flake8. +# Set up our pre-commit hooks for ruff, mypy, and cython-lint. 
pre-commit install # Set up a conda environment with name "tabmat" diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index e2385220..b2a77971 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -17,7 +17,7 @@ requirements: build: - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] - - cython !=3.0.4 # [build_platform != target_platform] + - cython !=3.0.4 # [build_platform != target_platform] - numpy # [build_platform != target_platform] - {{ compiler("c") }} - {{ compiler("cxx") }} diff --git a/environment-win.yml b/environment-win.yml index e21460d7..0fd73327 100644 --- a/environment-win.yml +++ b/environment-win.yml @@ -7,13 +7,13 @@ dependencies: - pandas # development tools - - black - - flake8 + - click - numpydoc - pip - pre-commit - pytest - pytest-xdist + - ruff - setuptools_scm # build tools diff --git a/environment.yml b/environment.yml index 8d59938e..28d45ac8 100644 --- a/environment.yml +++ b/environment.yml @@ -6,13 +6,13 @@ dependencies: - pandas # development tools - - black - - flake8 + - click - numpydoc - pip - pre-commit - pytest - pytest-xdist + - ruff - setuptools_scm # build tools diff --git a/pyproject.toml b/pyproject.toml index 07b1e56a..48e72167 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,25 +8,28 @@ requires = [ 'Cython != 3.0.4', ] -[tool.black] -exclude = ''' -/( - \.eggs - | \.git - | \.venv - | build - | dist -)/ -''' +[tool.ruff] +ignore = ["E731", "N802", "N803", "N806"] +line-length = 88 +select = [ + # pyflakes + "F", + # pycodestyle + "E", "W", + # isort + "I", + # pep8-naming + "N", + # pyupgrade + "UP", +] +target-version = "py39" + +[tool.ruff.isort] +known-first-party = ["tabmat"] -[tool.isort] -multi_line_output = 3 -include_trailing_comma = true -ensure_newline_before_comments = true -line_length = 88 -known_first_party = "tabmat" -skip_glob = '\.eggs/*,\.git/*,\.venv/*,build/*,dist/*' -default_section = 
'THIRDPARTY' +[tool.mypy] +python_version = '3.9' [tool.cibuildwheel] skip = [ diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 2a9acf13..00000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[bdist_wheel] -universal = 1 diff --git a/src/tabmat/benchmark/generate_matrices.py b/src/tabmat/benchmark/generate_matrices.py index 8d9cdeed..e5e770fe 100644 --- a/src/tabmat/benchmark/generate_matrices.py +++ b/src/tabmat/benchmark/generate_matrices.py @@ -47,9 +47,7 @@ def make_cat_matrices(n_rows: int, n_cat_cols_1: int, n_cat_cols_2: int) -> dict two_cat_matrices["scipy.sparse csr"] = sps.hstack( [elt.tocsr() for elt in two_cat_matrices["tabmat"].matrices] ) - two_cat_matrices["scipy.sparse csc"] = two_cat_matrices[ - "scipy.sparse csr" - ].tocsc() # type: ignore + two_cat_matrices["scipy.sparse csc"] = two_cat_matrices["scipy.sparse csr"].tocsc() # type: ignore return two_cat_matrices diff --git a/src/tabmat/benchmark/main.py b/src/tabmat/benchmark/main.py index ec56e1f9..3af8b280 100644 --- a/src/tabmat/benchmark/main.py +++ b/src/tabmat/benchmark/main.py @@ -142,8 +142,9 @@ def _to_standardized_mat(mat): "--operation_name", type=str, help=( - f"Specify a comma-separated list of operations you want to run. Leaving this blank " - f"will default to running all operations. Operation options: {get_op_names()}" + f"Specify a comma-separated list of operations you want to run. Leaving this " + f"blank will default to running all operations. Operation options: " + f"{get_op_names()}" ), ) @click.option( @@ -152,9 +153,9 @@ def _to_standardized_mat(mat): help=( f"Specify a comma-separated list of matrices you want to run or specify. " f"Leaving this blank will default to running all predefined matrices. " - f"Matrix options: {get_matrix_names()} OR custom. If custom, specify details using " - f"additional custom matrix options. 
See --dense, --sparse, --one_cat, --two_cat, " - f"and --dense_cat options for more details" + f"Matrix options: {get_matrix_names()} OR custom. If custom, specify details " + f"using additional custom matrix options. See --dense, --sparse, --one_cat, " + f"--two_cat, and --dense_cat options for more details" ), ) @click.option( @@ -214,7 +215,8 @@ def _to_standardized_mat(mat): help=( "Should we benchmark memory usage with tracemalloc. Turning this on will make " "the runtime benchmarks less useful due to memory benchmarking overhead. " - "Also, when memory benchmarking is on, debuggers like pdb and ipdb seem to fail." + "Also, when memory benchmarking is on, debuggers like pdb and ipdb seem to " + "fail." ), default=False, ) @@ -223,8 +225,9 @@ def _to_standardized_mat(mat): type=int, help=( "How many times to re-run the benchmark. The maximum memory usage and minimum " - "runtime will be reported. Higher numbers of iterations reduce noise. This defaults " - "to 100 unless memory benchmarking is turned on in which case it will be 1." + "runtime will be reported. Higher numbers of iterations reduce noise. This " + "defaults to 100 unless memory benchmarking is turned on in which case it will " + "be 1." ), default=None, ) @@ -323,7 +326,8 @@ def run_all_benchmarks( for params in two_cat: n_rows, n_cat_cols_1, n_cat_cols_2 = (int(float(x)) for x in params) benchmark_matrices[ - f"two_cat #rows:{n_rows}, #cats_1:{n_cat_cols_1}, #cats_2:{n_cat_cols_2}" + f"two_cat #rows:{n_rows}, #cats_1:{n_cat_cols_1}, " + f"#cats_2:{n_cat_cols_2}" ] = make_cat_matrices(n_rows, n_cat_cols_1, n_cat_cols_2) if dense_cat: for params in dense_cat: diff --git a/src/tabmat/benchmark/memory_tools.py b/src/tabmat/benchmark/memory_tools.py index 3714f9e0..095e6974 100644 --- a/src/tabmat/benchmark/memory_tools.py +++ b/src/tabmat/benchmark/memory_tools.py @@ -35,7 +35,9 @@ def __exit__(self, *excargs): def track_peak_mem(f, *args, **kwargs): - """Track peak memory. 
Used in benchmarks to track memory used during matrix operations.""" + """Track peak memory. Used in benchmarks to track memory used during matrix +operations. + """ with MemoryPoller() as mp: f(*args, **kwargs) for s in mp.snapshots: diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index fcd679ec..1d95c731 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -2,8 +2,8 @@ Categorical data. One-hot encoding a feature creates a sparse matrix that has some special properties: -All of its nonzero elements are ones, and since each element starts a new row, it's ``indptr``, -which indicates where rows start and end, will increment by 1 every time. +All of its nonzero elements are ones, and since each element starts a new row, its +``indptr``, which indicates where rows start and end, will increment by 1 every time. Storage ^^^^^^^ @@ -28,11 +28,11 @@ array([0, 1, 2, 3], dtype=int32) -The size of this matrix, if the original array is of length ``n``, is ``n`` bytes for the -data (stored as quarter-precision integers), ``4n`` for ``indices``, and ``4(n+1)`` for -``indptr``. However, if we know the matrix results from one-hot encoding, we only need to -store the ``indices``, so we can reduce memory usage to slightly less than 4/9 of the -original. +The size of this matrix, if the original array is of length ``n``, is ``n`` bytes for +the data (stored as quarter-precision integers), ``4n`` for ``indices``, and ``4(n+1)`` +for ``indptr``. However, if we know the matrix results from one-hot encoding, we only +need to store the ``indices``, so we can reduce memory usage to slightly less than 4/9 +of the original. 
csc storage ----------- @@ -62,7 +62,8 @@ ---------------------- A general sparse CSR matrix-vector products in pseudocode, -modeled on [scipy sparse](https://github.com/scipy/scipy/blob/1dc960a33b000b95b1e399582c154efc0360a576/scipy/sparse/sparsetools/csr.h#L1120): # noqa: +modeled on [scipy sparse](https://github.com/scipy/scipy/blob/1dc960a33b000b95b\ +1e399582c154efc0360a576/scipy/sparse/sparsetools/csr.h#L1120): :: @@ -74,8 +75,8 @@ def matvec(mat, vec): res[i] += mat.data[j] * vec[mat.indices[j]] return res -With a CSR categorical matrix, ``data`` is all 1 and ``j`` always equals ``i``, so we can -simplify this function to be +With a CSR categorical matrix, ``data`` is all 1 and ``j`` always equals ``i``, so we +can simplify this function to be :: @@ -86,9 +87,9 @@ def matvec(mat, vec): res[i] = vec[mat.indices[j]] return res -The original function involved ``6N`` lookups, ``N`` multiplications, and ``N`` additions, -while the new function involves only ``3N`` lookups. It thus has the potential to be -significantly faster. +The original function involved ``6N`` lookups, ``N`` multiplications, and ``N`` +additions, while the new function involves only ``3N`` lookups. It thus has the +potential to be significantly faster. sandwich: X.T @ diag(d) @ X --------------------------- @@ -612,7 +613,8 @@ def multiply(self, other) -> SparseMatrix: """ if self.shape[0] != other.shape[0]: raise ValueError( - f"Shapes do not match. Expected length of {self.shape[0]}. Got {len(other)}." + f"Shapes do not match. Expected length of {self.shape[0]}. Got " + f"{len(other)}." 
) if self.drop_first: diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py index 4d394b3c..17739012 100644 --- a/src/tabmat/matrix_base.py +++ b/src/tabmat/matrix_base.py @@ -124,7 +124,8 @@ def standardize( self, weights: np.ndarray, center_predictors: bool, scale_predictors: bool ) -> tuple[Any, np.ndarray, Optional[np.ndarray]]: """ - Return a StandardizedMatrix along with the column means and column standard deviations. + Return a StandardizedMatrix along with the column means and column standard + deviations. It is often useful to modify a dataset so that each column has mean zero and standard deviation one. This function does this "standardization" diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index e3d2c4b2..18f65a80 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -58,7 +58,8 @@ def _filter_out_empty(matrices, indices): def _combine_matrices(matrices, indices): """ - Combine multiple SparseMatrix and DenseMatrix objects into a single object of each type. + Combine multiple SparseMatrix and DenseMatrix objects into a single object of each + type. ``matrices`` is and ``indices`` marks which columns they correspond to. Categorical matrices remain unmodified by this function since categorical @@ -169,8 +170,8 @@ def __init__( if not mat.shape[0] == n_row: raise ValueError( "All matrices should have the same first dimension, " - f"but the first matrix has first dimension {n_row} and matrix {i} has " - f"first dimension {mat.shape[0]}." + f"but the first matrix has first dimension {n_row} and matrix {i} " + f"has first dimension {mat.shape[0]}." 
) if mat.ndim == 1: flatten_matrices[i] = mat[:, np.newaxis] diff --git a/tests/test_benchmark_cli.py b/tests/test_benchmark_cli.py index 06772dea..ed3914bc 100644 --- a/tests/test_benchmark_cli.py +++ b/tests/test_benchmark_cli.py @@ -16,7 +16,8 @@ "cli_input", [ "", - "dense,sparse,sparse_narrow, sparse_wide,one_cat,two_cat,dense_cat,dense_smallcat", + "dense,sparse,sparse_narrow, sparse_wide,one_cat,two_cat,dense_cat," + "dense_smallcat", ], ) def test_generate_matrices(cli_input: str):