Use ruff as our formatter/linter (#326)
* switch to ruff for formatter-linter

* remove reference to black/flake8

* remove bdist_wheel universal

* skip cython version

* add click to env

* Jan's comment

* [skipci] changelog
MarcAntoineSchmidtQC authored Oct 31, 2023
1 parent 4604b24 commit 4d3bbd8
Showing 16 changed files with 90 additions and 125 deletions.
22 changes: 0 additions & 22 deletions .flake8

This file was deleted.

60 changes: 15 additions & 45 deletions .pre-commit-config.yaml
@@ -1,46 +1,16 @@
 repos:
-  - repo: https://github.com/Quantco/pre-commit-mirrors-black
-    rev: 23.9.1
-    hooks:
-      - id: black-conda
-        additional_dependencies: [flake8-docstrings, flake8-rst-docstrings]
-        args:
-          - --safe
-          - --target-version=py39
-  - repo: https://github.com/Quantco/pre-commit-mirrors-flake8
-    rev: 6.1.0
-    hooks:
-      - id: flake8-conda
-        additional_dependencies: [
-          -c,
-          conda-forge,
-          flake8-bugbear=21.4.3,
-          flake8-builtins=1.5.3,
-          flake8-comprehensions=3.5.0,
-          flake8-docstrings=1.6.0,
-          flake8-print=4.0.0,
-          pep8-naming=0.11.1,
-          python<3.12,
-        ]
-  - repo: https://github.com/Quantco/pre-commit-mirrors-isort
-    rev: 5.12.0
-    hooks:
-      - id: isort-conda
-        additional_dependencies: [toml]
-  - repo: https://github.com/Quantco/pre-commit-mirrors-mypy
-    rev: "1.5.1"
-    hooks:
-      - id: mypy-conda
-        additional_dependencies:
-          - python=3.9
-  - repo: https://github.com/Quantco/pre-commit-mirrors-pyupgrade
-    rev: 3.15.0
-    hooks:
-      - id: pyupgrade-conda
-        args: [--py39-plus]
-  - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint
-    rev: 0.15.0
-    hooks:
-      - id: cython-lint-conda
-        args: [--no-pycodestyle]
-      - id: double-quote-cython-strings-conda
+  - repo: https://github.com/Quantco/pre-commit-mirrors-ruff
+    rev: 0.1.3
+    hooks:
+      - id: ruff-conda
+      - id: ruff-format-conda
+  - repo: https://github.com/Quantco/pre-commit-mirrors-mypy
+    rev: "1.5.1"
+    hooks:
+      - id: mypy-conda
+  - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint
+    rev: 0.15.0
+    hooks:
+      - id: cython-lint-conda
+        args: [--no-pycodestyle]
+      - id: double-quote-cython-strings-conda
7 changes: 7 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,13 @@
 Changelog
 =========
 
+Unreleased
+----------
+
+**Other changes:**
+
+- Refactored the pre-commit hooks to use ruff.
+
 3.1.13 - 2023-10-17
 -------------------
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -39,7 +39,7 @@ conda install -y pre-commit
 git clone git@github.com:Quantco/tabmat.git
 cd tabmat
 
-# Set up our pre-commit hooks for black, mypy, isort and flake8.
+# Set up our pre-commit hooks for ruff, mypy, and cython-lint.
 pre-commit install
 
 # Set up a conda environment with name "tabmat"
2 changes: 1 addition & 1 deletion conda.recipe/meta.yaml
@@ -17,7 +17,7 @@ requirements:
   build:
     - python                              # [build_platform != target_platform]
     - cross-python_{{ target_platform }}  # [build_platform != target_platform]
-    - cython !=3.0.4                      # [build_platform != target_platform]
+    - cython !=3.0.4  # [build_platform != target_platform]
     - numpy                               # [build_platform != target_platform]
     - {{ compiler("c") }}
     - {{ compiler("cxx") }}
4 changes: 2 additions & 2 deletions environment-win.yml
@@ -7,13 +7,13 @@ dependencies:
   - pandas
 
   # development tools
-  - black
-  - flake8
+  - click
   - numpydoc
   - pip
   - pre-commit
   - pytest
   - pytest-xdist
+  - ruff
   - setuptools_scm
 
   # build tools
4 changes: 2 additions & 2 deletions environment.yml
@@ -6,13 +6,13 @@ dependencies:
   - pandas
 
   # development tools
-  - black
-  - flake8
+  - click
   - numpydoc
   - pip
   - pre-commit
   - pytest
   - pytest-xdist
+  - ruff
   - setuptools_scm
 
   # build tools
39 changes: 21 additions & 18 deletions pyproject.toml
@@ -8,25 +8,28 @@ requires = [
     'Cython != 3.0.4',
 ]
 
-[tool.black]
-exclude = '''
-/(
-    \.eggs
-  | \.git
-  | \.venv
-  | build
-  | dist
-)/
-'''
+[tool.ruff]
+ignore = ["E731", "N802", "N803", "N806"]
+line-length = 88
+select = [
+    # pyflakes
+    "F",
+    # pycodestyle
+    "E", "W",
+    # isort
+    "I",
+    # pep8-naming
+    "N",
+    # pyupgrade
+    "UP",
+]
+target-version = "py39"
+
+[tool.ruff.isort]
+known-first-party = ["tabmat"]
 
-[tool.isort]
-multi_line_output = 3
-include_trailing_comma = true
-ensure_newline_before_comments = true
-line_length = 88
-known_first_party = "tabmat"
-skip_glob = '\.eggs/*,\.git/*,\.venv/*,build/*,dist/*'
-default_section = 'THIRDPARTY'
+[tool.mypy]
+python_version = '3.9'
 
 [tool.cibuildwheel]
 skip = [
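For context, the `select` list above maps onto the tools being replaced: "F"/"E"/"W" cover flake8, "I" covers isort, "N" covers pep8-naming, and "UP" covers pyupgrade. A small, hypothetical snippet (not from this repository) illustrating what these rule families flag; under the `ignore` list above, the E731 and N803 patterns would be allowed:

```python
import sys
import os  # F401 (pyflakes "F"): imported but unused
# I001 (isort "I"): `os` should sort before `sys`

# E731: lambda assigned to a name -- listed in `ignore`, so allowed here
square = lambda x: x * x


def process(X):  # N803 (pep8-naming): uppercase argument -- also ignored above
    # UP032 (pyupgrade "UP"): str.format() call that could be an f-string
    return "{}".format(X)


print(square(2), process(3))
```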
2 changes: 0 additions & 2 deletions setup.cfg

This file was deleted.

4 changes: 1 addition & 3 deletions src/tabmat/benchmark/generate_matrices.py
@@ -47,9 +47,7 @@ def make_cat_matrices(n_rows: int, n_cat_cols_1: int, n_cat_cols_2: int) -> dict
     two_cat_matrices["scipy.sparse csr"] = sps.hstack(
         [elt.tocsr() for elt in two_cat_matrices["tabmat"].matrices]
     )
-    two_cat_matrices["scipy.sparse csc"] = two_cat_matrices[
-        "scipy.sparse csr"
-    ].tocsc()  # type: ignore
+    two_cat_matrices["scipy.sparse csc"] = two_cat_matrices["scipy.sparse csr"].tocsc()  # type: ignore
     return two_cat_matrices
22 changes: 13 additions & 9 deletions src/tabmat/benchmark/main.py
@@ -142,8 +142,9 @@ def _to_standardized_mat(mat):
     "--operation_name",
     type=str,
     help=(
-        f"Specify a comma-separated list of operations you want to run. Leaving this blank "
-        f"will default to running all operations. Operation options: {get_op_names()}"
+        f"Specify a comma-separated list of operations you want to run. Leaving this "
+        f"blank will default to running all operations. Operation options: "
+        f"{get_op_names()}"
     ),
 )
 @click.option(
@@ -152,9 +153,9 @@ def _to_standardized_mat(mat):
     help=(
         f"Specify a comma-separated list of matrices you want to run or specify. "
         f"Leaving this blank will default to running all predefined matrices. "
-        f"Matrix options: {get_matrix_names()} OR custom. If custom, specify details using "
-        f"additional custom matrix options. See --dense, --sparse, --one_cat, --two_cat, "
-        f"and --dense_cat options for more details"
+        f"Matrix options: {get_matrix_names()} OR custom. If custom, specify details "
+        f"using additional custom matrix options. See --dense, --sparse, --one_cat, "
+        f"--two_cat, and --dense_cat options for more details"
     ),
 )
 @click.option(
@@ -214,7 +215,8 @@ def _to_standardized_mat(mat):
     help=(
         "Should we benchmark memory usage with tracemalloc. Turning this on will make "
         "the runtime benchmarks less useful due to memory benchmarking overhead. "
-        "Also, when memory benchmarking is on, debuggers like pdb and ipdb seem to fail."
+        "Also, when memory benchmarking is on, debuggers like pdb and ipdb seem to "
+        "fail."
     ),
     default=False,
 )
@@ -223,8 +225,9 @@ def _to_standardized_mat(mat):
     type=int,
     help=(
         "How many times to re-run the benchmark. The maximum memory usage and minimum "
-        "runtime will be reported. Higher numbers of iterations reduce noise. This defaults "
-        "to 100 unless memory benchmarking is turned on in which case it will be 1."
+        "runtime will be reported. Higher numbers of iterations reduce noise. This "
+        "defaults to 100 unless memory benchmarking is turned on in which case it will "
+        "be 1."
     ),
     default=None,
 )
@@ -323,7 +326,8 @@ def run_all_benchmarks(
     for params in two_cat:
         n_rows, n_cat_cols_1, n_cat_cols_2 = (int(float(x)) for x in params)
         benchmark_matrices[
-            f"two_cat #rows:{n_rows}, #cats_1:{n_cat_cols_1}, #cats_2:{n_cat_cols_2}"
+            f"two_cat #rows:{n_rows}, #cats_1:{n_cat_cols_1}, "
+            f"#cats_2:{n_cat_cols_2}"
         ] = make_cat_matrices(n_rows, n_cat_cols_1, n_cat_cols_2)
     if dense_cat:
         for params in dense_cat:
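The reflowed help strings above do not change the CLI's behavior. As a usage sketch only, the click command can be exercised programmatically; the entry-point name `run_all_benchmarks` is taken from the decorated function in this diff, while the operation and matrix names are illustrative values that `get_op_names()` / `get_matrix_names()` would have to contain:

```python
from click.testing import CliRunner

from tabmat.benchmark.main import run_all_benchmarks

# Flag names come from the @click.option declarations shown above.
runner = CliRunner()
result = runner.invoke(
    run_all_benchmarks,
    ["--operation_name", "matvec", "--matrix_name", "dense"],
)
print(result.output)
```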
4 changes: 3 additions & 1 deletion src/tabmat/benchmark/memory_tools.py
@@ -35,7 +35,9 @@ def __exit__(self, *excargs):
 
 
 def track_peak_mem(f, *args, **kwargs):
-    """Track peak memory. Used in benchmarks to track memory used during matrix operations."""
+    """Track peak memory. Used in benchmarks to track memory used during matrix
+    operations.
+    """
     with MemoryPoller() as mp:
         f(*args, **kwargs)
     for s in mp.snapshots:
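For readers unfamiliar with the mechanism, here is a minimal standalone sketch of peak-memory tracking with `tracemalloc`; it approximates the idea behind `MemoryPoller` but is not the tabmat implementation:

```python
import tracemalloc

import numpy as np


def peak_mem_bytes(f, *args, **kwargs):
    """Run f and return the peak traced allocation (in bytes) during the call."""
    tracemalloc.start()
    try:
        f(*args, **kwargs)
        _, peak = tracemalloc.get_traced_memory()
    finally:
        tracemalloc.stop()
    return peak


# Example: memory high-water mark of a matrix-vector product.
X = np.ones((1000, 100))
print(peak_mem_bytes(X.dot, np.ones(100)))
```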
30 changes: 16 additions & 14 deletions src/tabmat/categorical_matrix.py
@@ -2,8 +2,8 @@
 Categorical data.
 
 One-hot encoding a feature creates a sparse matrix that has some special properties:
-All of its nonzero elements are ones, and since each element starts a new row, it's ``indptr``,
-which indicates where rows start and end, will increment by 1 every time.
+All of its nonzero elements are ones, and since each element starts a new row, its
+``indptr``, which indicates where rows start and end, will increment by 1 every time.
 
 Storage
 ^^^^^^^
@@ -28,11 +28,11 @@
 
     array([0, 1, 2, 3], dtype=int32)
 
-The size of this matrix, if the original array is of length ``n``, is ``n`` bytes for the
-data (stored as quarter-precision integers), ``4n`` for ``indices``, and ``4(n+1)`` for
-``indptr``. However, if we know the matrix results from one-hot encoding, we only need to
-store the ``indices``, so we can reduce memory usage to slightly less than 4/9 of the
-original.
+The size of this matrix, if the original array is of length ``n``, is ``n`` bytes for
+the data (stored as quarter-precision integers), ``4n`` for ``indices``, and ``4(n+1)``
+for ``indptr``. However, if we know the matrix results from one-hot encoding, we only
+need to store the ``indices``, so we can reduce memory usage to slightly less than 4/9
+of the original.
 
 csc storage
 -----------
@@ -62,7 +62,8 @@
 ----------------------
 
 A general sparse CSR matrix-vector product in pseudocode,
-modeled on [scipy sparse](https://github.com/scipy/scipy/blob/1dc960a33b000b95b1e399582c154efc0360a576/scipy/sparse/sparsetools/csr.h#L1120):  # noqa:
+modeled on [scipy sparse](https://github.com/scipy/scipy/blob/1dc960a33b000b95b\
+1e399582c154efc0360a576/scipy/sparse/sparsetools/csr.h#L1120):
 
 ::
 
@@ -74,8 +75,8 @@ def matvec(mat, vec):
                 res[i] += mat.data[j] * vec[mat.indices[j]]
         return res
 
-With a CSR categorical matrix, ``data`` is all 1 and ``j`` always equals ``i``, so we can
-simplify this function to be
+With a CSR categorical matrix, ``data`` is all 1 and ``j`` always equals ``i``, so we
+can simplify this function to be
 
 ::
 
@@ -86,9 +87,9 @@ def matvec(mat, vec):
             res[i] = vec[mat.indices[j]]
         return res
 
-The original function involved ``6N`` lookups, ``N`` multiplications, and ``N`` additions,
-while the new function involves only ``3N`` lookups. It thus has the potential to be
-significantly faster.
+The original function involved ``6N`` lookups, ``N`` multiplications, and ``N``
+additions, while the new function involves only ``3N`` lookups. It thus has the
+potential to be significantly faster.
 
 sandwich: X.T @ diag(d) @ X
 ---------------------------
@@ -612,7 +613,8 @@ def multiply(self, other) -> SparseMatrix:
         """
         if self.shape[0] != other.shape[0]:
             raise ValueError(
-                f"Shapes do not match. Expected length of {self.shape[0]}. Got {len(other)}."
+                f"Shapes do not match. Expected length of {self.shape[0]}. Got "
+                f"{len(other)}."
             )
 
         if self.drop_first:
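The reflowed docstring above makes two quantitative claims: only ``indices`` carries information in a one-hot CSR matrix, and the categorical matvec reduces to a gather with ``3N`` lookups instead of ``6N``. Both can be checked with a self-contained sketch (plain NumPy/SciPy for illustration, not tabmat's optimized implementation):

```python
import numpy as np
from scipy import sparse as sps

codes = np.array([1, 0, 2, 1])  # e.g. pandas Categorical codes
n = len(codes)

# One-hot encode: row i has a single 1 in column codes[i].
one_hot = sps.csr_matrix(
    (np.ones(n, dtype=np.int8), codes, np.arange(n + 1)),
    shape=(n, codes.max() + 1),
)
assert (one_hot.data == 1).all()                   # data is redundant
assert (one_hot.indptr == np.arange(n + 1)).all()  # indptr is redundant
assert (one_hot.indices == codes).all()            # only indices carry information


def matvec_generic(mat, vec):
    """General CSR matvec: 6N lookups, N multiplications, N additions."""
    res = np.zeros(mat.shape[0])
    for i in range(mat.shape[0]):
        for j in range(mat.indptr[i], mat.indptr[i + 1]):
            res[i] += mat.data[j] * vec[mat.indices[j]]
    return res


def matvec_categorical(indices, vec):
    """Categorical matvec: data is all ones and j == i, so only 3N lookups."""
    res = np.empty(len(indices))
    for i in range(len(indices)):
        res[i] = vec[indices[i]]
    return res


vec = np.array([10.0, 20.0, 30.0])
assert (matvec_generic(one_hot, vec) == matvec_categorical(codes, vec)).all()
```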
3 changes: 2 additions & 1 deletion src/tabmat/matrix_base.py
@@ -124,7 +124,8 @@ def standardize(
         self, weights: np.ndarray, center_predictors: bool, scale_predictors: bool
     ) -> tuple[Any, np.ndarray, Optional[np.ndarray]]:
         """
-        Return a StandardizedMatrix along with the column means and column standard deviations.
+        Return a StandardizedMatrix along with the column means and column standard
+        deviations.
 
         It is often useful to modify a dataset so that each column has mean
         zero and standard deviation one. This function does this "standardization"
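To ground the reflowed docstring, here is a dense NumPy sketch of what standardization computes; this is illustrative only, since tabmat's StandardizedMatrix achieves the same result lazily without copying the data, and the weighted-moment formulas here are an assumption for the uniform-weights case:

```python
import numpy as np

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 60.0]])
weights = np.full(X.shape[0], 1.0 / X.shape[0])  # uniform row weights

means = weights @ X                         # weighted column means
stds = np.sqrt(weights @ (X - means) ** 2)  # weighted column standard deviations

X_std = (X - means) / stds  # center, then scale
print(X_std.mean(axis=0))   # ~0 in every column
print(X_std.std(axis=0))    # ~1 in every column
```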
7 changes: 4 additions & 3 deletions src/tabmat/split_matrix.py
@@ -58,7 +58,8 @@ def _filter_out_empty(matrices, indices):
 
 def _combine_matrices(matrices, indices):
     """
-    Combine multiple SparseMatrix and DenseMatrix objects into a single object of each type.
+    Combine multiple SparseMatrix and DenseMatrix objects into a single object of each
+    type.
 
     ``matrices`` is a list of matrices, and ``indices`` marks which columns they correspond to.
     Categorical matrices remain unmodified by this function since categorical
@@ -169,8 +170,8 @@ def __init__(
             if not mat.shape[0] == n_row:
                 raise ValueError(
                     "All matrices should have the same first dimension, "
-                    f"but the first matrix has first dimension {n_row} and matrix {i} has "
-                    f"first dimension {mat.shape[0]}."
+                    f"but the first matrix has first dimension {n_row} and matrix {i} "
+                    f"has first dimension {mat.shape[0]}."
                 )
             if mat.ndim == 1:
                 flatten_matrices[i] = mat[:, np.newaxis]
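A toy illustration of the idea behind ``_combine_matrices`` for dense blocks (hypothetical code, not the tabmat implementation): same-type blocks are stacked column-wise, while the index arrays remember which columns of the original matrix each block owned.

```python
import numpy as np

# Two dense blocks that originally occupied columns [0, 2] and [3].
blocks = [np.array([[1.0, 2.0], [3.0, 4.0]]), np.array([[5.0], [6.0]])]
indices = [np.array([0, 2]), np.array([3])]

combined = np.hstack(blocks)                # one combined dense block
combined_indices = np.concatenate(indices)  # its columns in the original matrix
print(combined.shape, combined_indices)     # (2, 3) [0 2 3]
```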
3 changes: 2 additions & 1 deletion tests/test_benchmark_cli.py
@@ -16,7 +16,8 @@
     "cli_input",
     [
         "",
-        "dense,sparse,sparse_narrow, sparse_wide,one_cat,two_cat,dense_cat,dense_smallcat",
+        "dense,sparse,sparse_narrow, sparse_wide,one_cat,two_cat,dense_cat,"
+        "dense_smallcat",
     ],
 )
 def test_generate_matrices(cli_input: str):
