diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2c0bad02..2dd10193 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -60,7 +60,7 @@ jobs: - name: Test with pytest if: ${{ matrix.os != 'macos-latest'}} run: | - pytest --cov=scib --cov-report=xml -vv --ignore=tests/integration/ --ignore=tests/metrics/rpy2 -vv + pytest --cov=scib --cov-report=xml -vv --ignore=tests/integration/ --ignore=tests/metrics/rpy2 -vv --durations 0 --durations-min=1.0 mv coverage.xml "$(echo 'coverage_metrics_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')" - name: Upload coverage to GitHub Actions @@ -98,7 +98,7 @@ jobs: - name: Test with pytest run: | - pytest --cov=scib --cov-report=xml -vv --tb=native -k rpy2 + pytest --cov=scib --cov-report=xml -vv --tb=native -k rpy2 --durations 0 --durations-min=1.0 mv coverage.xml "$(echo 'coverage_rpy2_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')" - name: Upload coverage to GitHub Actions @@ -129,7 +129,7 @@ jobs: - name: Test with pytest run: | - pytest --cov=scib --cov-report=xml -vv --tb=native -k integration + pytest --cov=scib --cov-report=xml -vv --tb=native -k integration --durations 0 --durations-min=1.0 mv coverage.xml "$(echo 'coverage_integration_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')" - name: Upload coverage to GitHub Actions diff --git a/pyproject.toml b/pyproject.toml index 95ce61c6..5da991d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,3 +9,4 @@ build-backend = "setuptools.build_meta" log_cli = 'True' log_cli_level = 'INFO' addopts = '-p no:warnings' +durations = 0 diff --git a/scib/metrics/cell_cycle.py b/scib/metrics/cell_cycle.py index b3d92845..6d024bf0 100644 --- a/scib/metrics/cell_cycle.py +++ b/scib/metrics/cell_cycle.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +from tqdm import tqdm from ..preprocessing import score_cell_cycle from ..utils import check_adata @@ -11,12 +12,14 @@ def cell_cycle( adata_post, batch_key, embed=None, - agg_func=np.mean, + agg_func=np.nanmean, organism="mouse", n_comps=50, recompute_cc=True, precompute_pcr_key=None, verbose=False, + linreg_method="numpy", + n_threads=1, ): """Cell cycle conservation score @@ -44,6 +47,7 @@ def cell_cycle( precomputed scores if available as 'S_score' and 'G2M_score' in ``adata_post.obs`` :param precompute_pcr_key: Key in adata_pre for precomputed PCR values for cell cycle scores. Ignores cell cycle scores in adata_pre if present. + :param n_threads: Number of threads for linear regressions per principle component :return: A score between 1 and 0. The larger the score, the stronger the cell cycle @@ -70,11 +74,6 @@ def cell_cycle( if embed == "X_pca": embed = None - batches = adata_pre.obs[batch_key].unique() - scores_final = [] - scores_before = [] - scores_after = [] - recompute_cc = ( recompute_cc or "S_score" not in adata_pre.obs_keys() @@ -84,7 +83,12 @@ def cell_cycle( precompute_pcr_key is None or precompute_pcr_key not in adata_pre.uns_keys() ) - for batch in batches: + batches = adata_pre.obs[batch_key].unique() + scores_before = [] + scores_after = [] + scores_final = [] + + for batch in tqdm(batches): before, after = get_pcr_before_after( adata_pre, adata_post, @@ -92,11 +96,13 @@ def cell_cycle( batch=batch, embed=embed, organism=organism, + pcr_key=precompute_pcr_key, recompute_cc=recompute_cc, recompute_pcr=recompute_pcr, - pcr_key=precompute_pcr_key, n_comps=n_comps, verbose=verbose, + n_threads=n_threads, + linreg_method=linreg_method, ) # scale result @@ -140,11 +146,13 @@ def get_pcr_before_after( batch, embed, organism, - recompute_cc, - recompute_pcr, pcr_key, - n_comps, - verbose, + recompute_cc=False, + recompute_pcr=False, + n_comps=50, + verbose=True, + n_threads=1, + linreg_method="numpy", ): """ Principle component regression value on cell cycle scores for one batch @@ -190,16 +198,28 @@ def get_pcr_before_after( covariate = raw_sub.obs[["S_score", "G2M_score"]] # PCR on adata before integration - if recompute_pcr: + if recompute_pcr: # TODO: does this work for precomputed values? before = pc_regression( - raw_sub.X, covariate, pca_var=None, n_comps=n_comps, verbose=verbose + raw_sub.X, + covariate, + pca_var=None, + n_comps=n_comps, + verbose=verbose, + n_threads=n_threads, + linreg_method=linreg_method, ) else: before = pd.Series(raw_sub.uns[pcr_key]) # PCR on adata after integration after = pc_regression( - int_sub, covariate, pca_var=None, n_comps=n_comps, verbose=verbose + int_sub, + covariate, + pca_var=None, + n_comps=n_comps, + verbose=verbose, + n_threads=n_threads, + linreg_method=linreg_method, ) return before, after diff --git a/scib/metrics/pcr.py b/scib/metrics/pcr.py index abb0c5a9..8c9c9d8c 100644 --- a/scib/metrics/pcr.py +++ b/scib/metrics/pcr.py @@ -2,13 +2,22 @@ import pandas as pd import scanpy as sc from scipy import sparse -from sklearn.linear_model import LinearRegression +from tqdm import tqdm from ..utils import check_adata, check_batch def pcr_comparison( - adata_pre, adata_post, covariate, embed=None, n_comps=50, scale=True, verbose=False + adata_pre, + adata_post, + covariate, + embed=None, + n_comps=50, + linreg_method="sklearn", + recompute_pca=False, + scale=True, + verbose=False, + n_threads=1, ): """Principal component regression score @@ -25,6 +34,8 @@ def pcr_comparison( If None, use the full expression matrix (``adata_post.X``), otherwise use the embedding provided in ``adata_post.obsm[embed]``. :param n_comps: Number of principal components to compute + :param recompute_pca: Whether to recompute PCA with default settings + :param linreg_method: Method for linear regression, either 'sklearn' or 'numpy' :param scale: If True, scale score between 0 and 1 (default) :param verbose: :return: @@ -52,8 +63,10 @@ def pcr_comparison( pcr_before = pcr( adata_pre, covariate=covariate, - recompute_pca=True, + recompute_pca=recompute_pca, n_comps=n_comps, + linreg_method=linreg_method, + n_threads=n_threads, verbose=verbose, ) @@ -61,8 +74,10 @@ def pcr_comparison( adata_post, covariate=covariate, embed=embed, - recompute_pca=True, + recompute_pca=recompute_pca, n_comps=n_comps, + linreg_method=linreg_method, + n_threads=n_threads, verbose=verbose, ) @@ -79,7 +94,16 @@ def pcr_comparison( return pcr_after - pcr_before -def pcr(adata, covariate, embed=None, n_comps=50, recompute_pca=True, verbose=False): +def pcr( + adata, + covariate, + embed=None, + n_comps=50, + recompute_pca=False, + linreg_method="sklearn", + verbose=False, + n_threads=1, +): """Principal component regression for anndata object Wraps :func:`~scib.metrics.pc_regression` while checking whether to: @@ -127,25 +151,48 @@ def pcr(adata, covariate, embed=None, n_comps=50, recompute_pca=True, verbose=Fa assert embed in adata.obsm if verbose: print(f"Compute PCR on embedding n_comps: {n_comps}") - return pc_regression(adata.obsm[embed], covariate_values, n_comps=n_comps) + return pc_regression( + adata.obsm[embed], + covariate_values, + n_comps=n_comps, + linreg_method=linreg_method, + n_threads=n_threads, + ) # use existing PCA computation elif (recompute_pca is False) and ("X_pca" in adata.obsm) and ("pca" in adata.uns): if verbose: print("using existing PCA") return pc_regression( - adata.obsm["X_pca"], covariate_values, pca_var=adata.uns["pca"]["variance"] + adata.obsm["X_pca"], + covariate_values, + pca_var=adata.uns["pca"]["variance"], + linreg_method=linreg_method, + n_threads=n_threads, ) # recompute PCA else: if verbose: print(f"compute PCA n_comps: {n_comps}") - return pc_regression(adata.X, covariate_values, n_comps=n_comps) + return pc_regression( + adata.X, + covariate_values, + n_comps=n_comps, + linreg_method=linreg_method, + n_threads=n_threads, + ) def pc_regression( - data, covariate, pca_var=None, n_comps=50, svd_solver="arpack", verbose=False + data, + covariate, + pca_var=None, + n_comps=50, + svd_solver="arpack", + linreg_method="sklearn", + verbose=False, + n_threads=1, ): """Principal component regression @@ -172,7 +219,6 @@ def pc_regression( :return: Variance contribution of regression """ - if isinstance(data, (np.ndarray, sparse.csr_matrix, sparse.csc_matrix)): matrix = data else: @@ -180,6 +226,13 @@ def pc_regression( f"invalid type: {data.__class__} is not a numpy array or sparse matrix" ) + if linreg_method == "sklearn": + linreg_method = linreg_sklearn + elif linreg_method == "numpy": + linreg_method = linreg_np + else: + raise ValueError(f"invalid linreg_method: {linreg_method}") + # perform PCA if no variance contributions are given if pca_var is None: @@ -193,7 +246,7 @@ def pc_regression( matrix = matrix.toarray() if verbose: - print("compute PCA") + print("compute PCA...") X_pca, _, _, pca_var = sc.tl.pca( matrix, n_comps=n_comps, @@ -216,18 +269,51 @@ def pc_regression( else: if verbose: print("one-hot encode categorical values") - covariate = pd.get_dummies(covariate) + covariate = pd.get_dummies(covariate).to_numpy() # fit linear model for n_comps PCs - r2 = [] - for i in range(n_comps): - pc = X_pca[:, [i]] - lm = LinearRegression() - lm.fit(covariate, pc) - r2_score = np.maximum(0, lm.score(covariate, pc)) - r2.append(r2_score) - - Var = pca_var / sum(pca_var) * 100 - R2Var = sum(r2 * Var) / 100 + if verbose: + print(f"Use {n_threads} threads for regression...") + if n_threads == 1: + r2 = [] + for i in tqdm(range(n_comps), total=n_comps): + r2_score = linreg_method(X=covariate, y=X_pca[:, [i]]) + r2.append(r2_score) + else: + from concurrent.futures import ThreadPoolExecutor, as_completed + + r2 = [] + # parallelise over all principal components + with tqdm(total=n_comps) as pbar: + with ThreadPoolExecutor(max_workers=n_threads) as executor: + futures = [ + executor.submit(linreg_method, X=covariate, y=pc) for pc in X_pca.T + ] + for future in as_completed(futures): + r2.append(future.result()) + pbar.update(1) + + Var = pca_var / sum(pca_var) # * 100 + R2Var = sum(r2 * Var) # / 100 return R2Var + + +def linreg_sklearn(X, y): + from sklearn.linear_model import LinearRegression + + lm = LinearRegression() + lm.fit(X, y) + r2_score = lm.score(X, y) + np.maximum(0, r2_score) + return np.maximum(0, r2_score) + + +def linreg_np(X, y): + _, residuals, _, _ = np.linalg.lstsq(X, y, rcond=None) + if residuals.size == 0: + residuals = np.array([0]) + rss = residuals[0] if len(residuals) > 0 else 0 + tss = np.sum((y - y.mean()) ** 2) + r2_score = 1 - (rss / tss) + return np.maximum(0, r2_score) diff --git a/scib/preprocessing.py b/scib/preprocessing.py index cbcec346..b80aceb3 100644 --- a/scib/preprocessing.py +++ b/scib/preprocessing.py @@ -686,7 +686,7 @@ def score_cell_cycle(adata, organism="mouse"): f"cell cycle genes not in adata\n organism: {organism}\n varnames: {rand_genes}" ) - sc.tl.score_genes_cell_cycle(adata, s_genes, g2m_genes) + sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes) def save_seurat(adata, path, batch, hvgs=None): diff --git a/tests/metrics/test_beyond_label_metrics.py b/tests/metrics/test_beyond_label_metrics.py index b27ba2d7..b056d0b2 100644 --- a/tests/metrics/test_beyond_label_metrics.py +++ b/tests/metrics/test_beyond_label_metrics.py @@ -1,22 +1,29 @@ import pandas as pd +import pytest from scipy.sparse import csr_matrix import scib from tests.common import LOGGER, assert_near_exact -def test_cell_cycle(adata_paul15): +@pytest.mark.parametrize("n_threads", [1, 2]) +def test_cell_cycle(adata_paul15, n_threads): adata = adata_paul15 + # import anndata as ad + # adata = ad.concat([adata_paul15] * 50) + # adata.obs_names_make_unique() + # adata.obs['batch'] = 'batch' adata_int = adata.copy() - # only final score score = scib.me.cell_cycle( - adata, - adata_int, + adata_pre=adata, + adata_post=adata_int, batch_key="batch", organism="mouse", # recompute_cc=True, - verbose=True, + verbose=False, + n_threads=n_threads, + linreg_method="numpy", ) LOGGER.info(f"score: {score}") assert_near_exact(score, 1, diff=1e-12) diff --git a/tests/metrics/test_pcr_metrics.py b/tests/metrics/test_pcr_metrics.py index 667bb200..e9841583 100644 --- a/tests/metrics/test_pcr_metrics.py +++ b/tests/metrics/test_pcr_metrics.py @@ -1,24 +1,61 @@ +import pytest from scipy.sparse import csr_matrix import scib from tests.common import LOGGER, add_embed, assert_near_exact -def test_pc_regression(adata): - score = scib.me.pc_regression(adata.X, adata.obs["batch"]) - LOGGER.info(f"using embedding: {score}") - assert_near_exact(score, 0, diff=1e-4) +@pytest.mark.parametrize("sparse", [False, True]) +def test_pc_regression(adata, sparse): + if sparse: + adata.X = csr_matrix(adata.X) + score = scib.me.pc_regression( + adata.X, + covariate=adata.obs["batch"], + n_comps=adata.n_vars, + ) + LOGGER.info(score) + assert_near_exact(score, 0, diff=1e-3) + + +@pytest.mark.parametrize("n_threads", [1, 2]) +@pytest.mark.parametrize("linreg_method", ["numpy", "sklearn"]) +def test_pcr_timing(adata_pca, n_threads, linreg_method): + import timeit + + import anndata as ad + # scale up anndata + adata = ad.concat([adata_pca] * 100) + adata.uns = adata_pca.uns -def test_pc_regression_sparse(adata): - x = csr_matrix(adata.X) - score = scib.me.pc_regression(x, adata.obs["batch"], n_comps=x.shape[1]) - LOGGER.info(f"using embedding: {score}") - assert_near_exact(score, 0, diff=1e-4) + runs = 5 + timing = timeit.timeit( + lambda: scib.me.pcr( + adata, + covariate="celltype", + linreg_method=linreg_method, + verbose=False, + n_threads=n_threads, + recompute_pca=False, + ), + number=runs, + ) + LOGGER.info(f"timeit: {timing:.2f}s for {runs} runs") + + # test pcr value + score = scib.me.pcr( + adata_pca, + covariate="celltype", + linreg_method=linreg_method, + verbose=True, + n_threads=1, + ) + LOGGER.info(score) + assert_near_exact(score, 0.33401529220865844, diff=1e-3) -def test_pcr_batch(adata): - # no PCA precomputed +def test_pcr_comparison_batch(adata): score = scib.me.pcr_comparison( adata, adata, covariate="batch", n_comps=50, scale=True ) @@ -26,13 +63,13 @@ def test_pcr_batch(adata): assert_near_exact(score, 0, diff=1e-6) -def test_pcr_batch_precomputed(adata_pca): +def test_pcr_comparison_batch_precomputed(adata_pca): score = scib.me.pcr_comparison(adata_pca, adata_pca, covariate="batch", scale=True) LOGGER.info(f"precomputed PCA: {score}") assert_near_exact(score, 0, diff=1e-6) -def test_pcr_batch_embedding(adata): +def test_pcr_comparison_batch_embedding(adata): # use different embedding score = scib.me.pcr_comparison( adata_pre=adata, @@ -42,5 +79,5 @@ def test_pcr_batch_embedding(adata): n_comps=50, scale=True, ) - LOGGER.info(f"using embedding: {score}") + LOGGER.info(f"using X_emb: {score}") assert_near_exact(score, 0, diff=1e-6)