From 2cb86978d44d9727da72cbf94ea643fdc887328f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=BCller-Dott?= Date: Fri, 19 Jul 2024 08:59:29 +0200 Subject: [PATCH 1/5] implement z-score method --- decoupler/method_zscore.py | 108 +++++++++++++++++++++++++++++++++ decoupler/tests/test_zscore.py | 31 ++++++++++ 2 files changed, 139 insertions(+) create mode 100644 decoupler/method_zscore.py create mode 100644 decoupler/tests/test_zscore.py diff --git a/decoupler/method_zscore.py b/decoupler/method_zscore.py new file mode 100644 index 0000000..e1940bc --- /dev/null +++ b/decoupler/method_zscore.py @@ -0,0 +1,108 @@ +""" +Method zscore. +Code to run the z-score (RoKAI, KSEA) method. +""" + +import numpy as np +import pandas as pd +from scipy.sparse import csr_matrix +from scipy.sparse import isspmatrix_csr + +from scipy.stats import t +from scipy.stats import norm + +from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data + +from tqdm import tqdm + +def zscore(m, net, flavor='RoKAI', verbose=False): + + # Get dims + n_samples = m.shape[0] + n_features, n_fsets = net.shape + + es = np.zeros((n_samples, n_fsets)) + pv = np.zeros((n_samples, n_fsets)) + + # Compute each element of Matrix3 + for i in range(n_samples): + for f in range(n_fsets): + m_f = m[:, i] + if isspmatrix_csr(m_f): + m_f = m_f.toarray() + m_f = m_f.reshape(1, -1) + net_i = net[:, f] + + mean_product = np.sum(m_f * net_i) / np.sum(abs(net_i)) + mean_m_f = 0 if flavor == "RoKAI" else np.mean(m_f) + std_m_f = np.std(m_f) + count_non_zeros = np.count_nonzero(net_i) + + es[i, f] = (mean_product - mean_m_f) * np.sqrt(count_non_zeros) / std_m_f + pv[i, f] = norm.cdf(-abs(es[i, f])) + + return es, pv + + +def run_zscore(mat, net, source='source', target='target', weight='weight', batch_size=10000, flavor='RoKAI', + min_n=5, verbose=False, use_raw=True): + """ + z-score. + + Calculates regulatory activities using a z-score as descibed in KSEA or RoKAI. The z-score calculates the mean of the molecular features of the + known targets for each regulator and adjusts it for the number of identified targets for the regulator, the standard deviation of all molecular + features (RoKAI), as well as the mean of all moleculare features (KSEA). + + Parameters + ---------- + mat : list, DataFrame + List of [features, matrix], dataframe (samples x features). + net : DataFrame + Network in long format. + source : str + Column name in net with source nodes. + target : str + Column name in net with target nodes. + weight : str + Column name in net with weights. + flavor : int + Whether to use the implementation of RoKAI (default) or KSEA. + min_n : int + Minimum of targets per source. If less, sources are removed. + verbose : bool + Whether to show progress. + use_raw : bool + Use raw attribute of mat if present. + + Returns + ------- + estimate : DataFrame + Z-scores. + pvals : DataFrame + Obtained p-values. + """ + + # Extract sparse matrix and array of genes + m, r, c = extract(mat, use_raw=use_raw, verbose=verbose) + + # Transform net + net = rename_net(net, source=source, target=target, weight=weight) + net = filt_min_n(c, net, min_n=min_n) + sources, targets, net = get_net_mat(net) + + # Match arrays + net = match(c, targets, net) + + if verbose: + print('Running zscore on mat with {0} samples and {1} targets for {2} sources.'.format(m.shape[0], len(c), net.shape[1])) + + # Run ULM + estimate, pvals = zscore(m, net, flavor=flavor) + + # Transform to df + estimate = pd.DataFrame(estimate, index=r, columns=sources) + estimate.name = 'zscore_estimate' + pvals = pd.DataFrame(pvals, index=r, columns=sources) + pvals.name = 'zscore_pvals' + + return return_data(mat=mat, results=(estimate, pvals)) \ No newline at end of file diff --git a/decoupler/tests/test_zscore.py b/decoupler/tests/test_zscore.py new file mode 100644 index 0000000..8ff491c --- /dev/null +++ b/decoupler/tests/test_zscore.py @@ -0,0 +1,31 @@ +import numpy as np +import pandas as pd +from scipy.sparse import csr_matrix +from scipy.stats import norm +from anndata import AnnData +from numpy.testing import assert_almost_equal +from ..method_zscore import zscore, run_zscore + +def test_zscore(): + m = csr_matrix(np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.]], dtype=np.float32)) + net = np.array([[1., 0.], [1, 0.], [0., -1.], [0., -1.]], dtype=np.float32) + act, pvl = zscore(m, net) + assert act[0, 0] < 0 + assert act[1, 0] < 0 + assert act[2, 0] > 0 + assert act[3, 0] > 0 + assert np.all((0. <= pvl) * (pvl <= 1.)) + +def test_run_zscore(): + m = np.array([[7., 1., 1., 1.], [4., 2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.]]) + r = np.array(['S1', 'S2', 'S3', 'S4']) + c = np.array(['G1', 'G2', 'G3', 'G4']) + df = pd.DataFrame(m, index=r, columns=c) + net = pd.DataFrame([['T1', 'G1', 1], ['T1', 'G2', 2], ['T2', 'G3', -3], ['T2', 'G4', 4]], + columns=['source', 'target', 'weight']) + res = run_zscore(df, net, verbose=True, use_raw=False, min_n=0) + assert res[0].loc['S1', 'T2'] > 0 + assert res[0].loc['S2', 'T2'] < 0 + assert res[0].loc['S3', 'T2'] > 0 + assert res[0].loc['S4', 'T2'] > 0 + assert res[1].map(lambda x: 0 <= x <= 1).all().all() \ No newline at end of file From b1fb97cabd918efc1b7726f6c360226632618d6f Mon Sep 17 00:00:00 2001 From: Pau Badia i Mompel <44896790+PauBadiaM@users.noreply.github.com> Date: Tue, 23 Jul 2024 16:09:54 +0200 Subject: [PATCH 2/5] zscore use matrix operations --- decoupler/method_zscore.py | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/decoupler/method_zscore.py b/decoupler/method_zscore.py index e1940bc..c48c6ce 100644 --- a/decoupler/method_zscore.py +++ b/decoupler/method_zscore.py @@ -15,32 +15,17 @@ from tqdm import tqdm -def zscore(m, net, flavor='RoKAI', verbose=False): - - # Get dims - n_samples = m.shape[0] - n_features, n_fsets = net.shape - - es = np.zeros((n_samples, n_fsets)) - pv = np.zeros((n_samples, n_fsets)) - - # Compute each element of Matrix3 - for i in range(n_samples): - for f in range(n_fsets): - m_f = m[:, i] - if isspmatrix_csr(m_f): - m_f = m_f.toarray() - m_f = m_f.reshape(1, -1) - net_i = net[:, f] - - mean_product = np.sum(m_f * net_i) / np.sum(abs(net_i)) - mean_m_f = 0 if flavor == "RoKAI" else np.mean(m_f) - std_m_f = np.std(m_f) - count_non_zeros = np.count_nonzero(net_i) - - es[i, f] = (mean_product - mean_m_f) * np.sqrt(count_non_zeros) / std_m_f - pv[i, f] = norm.cdf(-abs(es[i, f])) +def zscore(m, net, flavor='RoKAI', verbose=False): + stds = np.std(m, axis=1, ddof=1) + if flavor != 'RoKAI': + mean_all = np.mean(m, axis=1) + else: + mean_all = np.zeros(stds.shape) + n = np.sqrt(np.count_nonzero(net, axis=0)) + mean = m.dot(net) / np.sum(np.abs(net), axis=0) + es = ((mean - mean_all.reshape(-1, 1)) * n) / stds.reshape(-1, 1) + pv = norm.cdf(-np.abs(es)) return es, pv @@ -105,4 +90,4 @@ def run_zscore(mat, net, source='source', target='target', weight='weight', batc pvals = pd.DataFrame(pvals, index=r, columns=sources) pvals.name = 'zscore_pvals' - return return_data(mat=mat, results=(estimate, pvals)) \ No newline at end of file + return return_data(mat=mat, results=(estimate, pvals)) From 6d997c6ea36addce7c288374b67dacab5e363a65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=BCller-Dott?= Date: Wed, 24 Jul 2024 10:55:07 +0200 Subject: [PATCH 3/5] clean up packages and description --- decoupler/method_zscore.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/decoupler/method_zscore.py b/decoupler/method_zscore.py index c48c6ce..927566b 100644 --- a/decoupler/method_zscore.py +++ b/decoupler/method_zscore.py @@ -5,16 +5,11 @@ import numpy as np import pandas as pd -from scipy.sparse import csr_matrix -from scipy.sparse import isspmatrix_csr -from scipy.stats import t from scipy.stats import norm from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data -from tqdm import tqdm - def zscore(m, net, flavor='RoKAI', verbose=False): stds = np.std(m, axis=1, ddof=1) @@ -40,8 +35,8 @@ def run_zscore(mat, net, source='source', target='target', weight='weight', batc Parameters ---------- - mat : list, DataFrame - List of [features, matrix], dataframe (samples x features). + mat : list, DataFrame or AnnData + List of [features, matrix], dataframe (samples x features) or an AnnData instance. net : DataFrame Network in long format. source : str @@ -50,8 +45,8 @@ def run_zscore(mat, net, source='source', target='target', weight='weight', batc Column name in net with target nodes. weight : str Column name in net with weights. - flavor : int - Whether to use the implementation of RoKAI (default) or KSEA. + batch_size : int + Size of the samples to use for each batch. Increasing this will consume more memmory but it will run faster. min_n : int Minimum of targets per source. If less, sources are removed. verbose : bool @@ -62,9 +57,9 @@ def run_zscore(mat, net, source='source', target='target', weight='weight', batc Returns ------- estimate : DataFrame - Z-scores. + Z-scores. Stored in `.obsm['zscore_estimate']` if `mat` is AnnData. pvals : DataFrame - Obtained p-values. + Obtained p-values. Stored in `.obsm['zscore_pvals']` if `mat` is AnnData. """ # Extract sparse matrix and array of genes From 4fb77f2d5e04575b4855e898b55120595e5ff9f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=BCller-Dott?= Date: Wed, 24 Jul 2024 10:55:31 +0200 Subject: [PATCH 4/5] refine z-score tests --- decoupler/tests/test_zscore.py | 36 +++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/decoupler/tests/test_zscore.py b/decoupler/tests/test_zscore.py index 8ff491c..72f68ee 100644 --- a/decoupler/tests/test_zscore.py +++ b/decoupler/tests/test_zscore.py @@ -1,31 +1,49 @@ import numpy as np import pandas as pd -from scipy.sparse import csr_matrix -from scipy.stats import norm from anndata import AnnData -from numpy.testing import assert_almost_equal from ..method_zscore import zscore, run_zscore def test_zscore(): - m = csr_matrix(np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.]], dtype=np.float32)) + m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.], [-8., -7., 1., 1.]], dtype=np.float32) net = np.array([[1., 0.], [1, 0.], [0., -1.], [0., -1.]], dtype=np.float32) act, pvl = zscore(m, net) assert act[0, 0] < 0 assert act[1, 0] < 0 assert act[2, 0] > 0 assert act[3, 0] > 0 + assert act[4, 0] < 0 assert np.all((0. <= pvl) * (pvl <= 1.)) + + act2, pvl2 = zscore(m, net, flavor='KSEA') + assert act2[0, 0] < 0 + assert act2[1, 0] < 0 + assert act2[2, 0] < 0 + assert act2[3, 0] < 0 + assert act2[4, 0] < 0 + assert np.all((0. <= pvl2) * (pvl2 <= 1.)) def test_run_zscore(): - m = np.array([[7., 1., 1., 1.], [4., 2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.]]) + m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., -6., -8.], [-8., -7., 1., 1.]]) r = np.array(['S1', 'S2', 'S3', 'S4']) c = np.array(['G1', 'G2', 'G3', 'G4']) df = pd.DataFrame(m, index=r, columns=c) - net = pd.DataFrame([['T1', 'G1', 1], ['T1', 'G2', 2], ['T2', 'G3', -3], ['T2', 'G4', 4]], + net = pd.DataFrame([['T1', 'G1', 1], ['T1', 'G2', 1], ['T2', 'G3', -1], ['T2', 'G4', -1]], columns=['source', 'target', 'weight']) res = run_zscore(df, net, verbose=True, use_raw=False, min_n=0) - assert res[0].loc['S1', 'T2'] > 0 + assert res[0].loc['S1', 'T2'] < 0 assert res[0].loc['S2', 'T2'] < 0 - assert res[0].loc['S3', 'T2'] > 0 + assert res[0].loc['S3', 'T2'] < 0 assert res[0].loc['S4', 'T2'] > 0 - assert res[1].map(lambda x: 0 <= x <= 1).all().all() \ No newline at end of file + assert res[0].loc['S5', 'T2'] < 0 + assert res[1].map(lambda x: 0 <= x <= 1).all().all() + + res2 = run_zscore(df, net, verbose=True, use_raw=False, min_n=0, flavor='KSEA') + assert res2[0].loc['S1', 'T2'] > 0 + assert res2[0].loc['S2', 'T2'] < 0 + assert res2[0].loc['S3', 'T2'] < 0 + assert res2[0].loc['S4', 'T2'] > 0 + assert res2[0].loc['S5', 'T2'] > 0 + assert res2[1].map(lambda x: 0 <= x <= 1).all().all() + + adata = AnnData(df.astype(np.float32)) + run_zscore(adata, net, verbose=True, use_raw=False, min_n=0) \ No newline at end of file From 5955256a437e633aaf5ac73fcd427208766b039a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=BCller-Dott?= Date: Wed, 24 Jul 2024 11:10:54 +0200 Subject: [PATCH 5/5] fixed column names --- decoupler/tests/test_zscore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/decoupler/tests/test_zscore.py b/decoupler/tests/test_zscore.py index 72f68ee..724e931 100644 --- a/decoupler/tests/test_zscore.py +++ b/decoupler/tests/test_zscore.py @@ -24,7 +24,7 @@ def test_zscore(): def test_run_zscore(): m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., -6., -8.], [-8., -7., 1., 1.]]) - r = np.array(['S1', 'S2', 'S3', 'S4']) + r = np.array(['S1', 'S2', 'S3', 'S4', 'S5']) c = np.array(['G1', 'G2', 'G3', 'G4']) df = pd.DataFrame(m, index=r, columns=c) net = pd.DataFrame([['T1', 'G1', 1], ['T1', 'G2', 1], ['T2', 'G3', -1], ['T2', 'G4', -1]],