diff --git a/decoupler/method_zscore.py b/decoupler/method_zscore.py new file mode 100644 index 0000000..927566b --- /dev/null +++ b/decoupler/method_zscore.py @@ -0,0 +1,88 @@ +""" +Method zscore. +Code to run the z-score (RoKAI, KSEA) method. +""" + +import numpy as np +import pandas as pd + +from scipy.stats import norm + +from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data + + +def zscore(m, net, flavor='RoKAI', verbose=False): + stds = np.std(m, axis=1, ddof=1) + if flavor != 'RoKAI': + mean_all = np.mean(m, axis=1) + else: + mean_all = np.zeros(stds.shape) + n = np.sqrt(np.count_nonzero(net, axis=0)) + mean = m.dot(net) / np.sum(np.abs(net), axis=0) + es = ((mean - mean_all.reshape(-1, 1)) * n) / stds.reshape(-1, 1) + pv = norm.cdf(-np.abs(es)) + return es, pv + + +def run_zscore(mat, net, source='source', target='target', weight='weight', batch_size=10000, flavor='RoKAI', + min_n=5, verbose=False, use_raw=True): + """ + z-score. + + Calculates regulatory activities using a z-score as descibed in KSEA or RoKAI. The z-score calculates the mean of the molecular features of the + known targets for each regulator and adjusts it for the number of identified targets for the regulator, the standard deviation of all molecular + features (RoKAI), as well as the mean of all moleculare features (KSEA). + + Parameters + ---------- + mat : list, DataFrame or AnnData + List of [features, matrix], dataframe (samples x features) or an AnnData instance. + net : DataFrame + Network in long format. + source : str + Column name in net with source nodes. + target : str + Column name in net with target nodes. + weight : str + Column name in net with weights. + batch_size : int + Size of the samples to use for each batch. Increasing this will consume more memmory but it will run faster. + min_n : int + Minimum of targets per source. If less, sources are removed. + verbose : bool + Whether to show progress. + use_raw : bool + Use raw attribute of mat if present. + + Returns + ------- + estimate : DataFrame + Z-scores. Stored in `.obsm['zscore_estimate']` if `mat` is AnnData. + pvals : DataFrame + Obtained p-values. Stored in `.obsm['zscore_pvals']` if `mat` is AnnData. + """ + + # Extract sparse matrix and array of genes + m, r, c = extract(mat, use_raw=use_raw, verbose=verbose) + + # Transform net + net = rename_net(net, source=source, target=target, weight=weight) + net = filt_min_n(c, net, min_n=min_n) + sources, targets, net = get_net_mat(net) + + # Match arrays + net = match(c, targets, net) + + if verbose: + print('Running zscore on mat with {0} samples and {1} targets for {2} sources.'.format(m.shape[0], len(c), net.shape[1])) + + # Run ULM + estimate, pvals = zscore(m, net, flavor=flavor) + + # Transform to df + estimate = pd.DataFrame(estimate, index=r, columns=sources) + estimate.name = 'zscore_estimate' + pvals = pd.DataFrame(pvals, index=r, columns=sources) + pvals.name = 'zscore_pvals' + + return return_data(mat=mat, results=(estimate, pvals)) diff --git a/decoupler/tests/test_zscore.py b/decoupler/tests/test_zscore.py new file mode 100644 index 0000000..724e931 --- /dev/null +++ b/decoupler/tests/test_zscore.py @@ -0,0 +1,49 @@ +import numpy as np +import pandas as pd +from anndata import AnnData +from ..method_zscore import zscore, run_zscore + +def test_zscore(): + m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.], [-8., -7., 1., 1.]], dtype=np.float32) + net = np.array([[1., 0.], [1, 0.], [0., -1.], [0., -1.]], dtype=np.float32) + act, pvl = zscore(m, net) + assert act[0, 0] < 0 + assert act[1, 0] < 0 + assert act[2, 0] > 0 + assert act[3, 0] > 0 + assert act[4, 0] < 0 + assert np.all((0. <= pvl) * (pvl <= 1.)) + + act2, pvl2 = zscore(m, net, flavor='KSEA') + assert act2[0, 0] < 0 + assert act2[1, 0] < 0 + assert act2[2, 0] < 0 + assert act2[3, 0] < 0 + assert act2[4, 0] < 0 + assert np.all((0. <= pvl2) * (pvl2 <= 1.)) + +def test_run_zscore(): + m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., -6., -8.], [-8., -7., 1., 1.]]) + r = np.array(['S1', 'S2', 'S3', 'S4', 'S5']) + c = np.array(['G1', 'G2', 'G3', 'G4']) + df = pd.DataFrame(m, index=r, columns=c) + net = pd.DataFrame([['T1', 'G1', 1], ['T1', 'G2', 1], ['T2', 'G3', -1], ['T2', 'G4', -1]], + columns=['source', 'target', 'weight']) + res = run_zscore(df, net, verbose=True, use_raw=False, min_n=0) + assert res[0].loc['S1', 'T2'] < 0 + assert res[0].loc['S2', 'T2'] < 0 + assert res[0].loc['S3', 'T2'] < 0 + assert res[0].loc['S4', 'T2'] > 0 + assert res[0].loc['S5', 'T2'] < 0 + assert res[1].map(lambda x: 0 <= x <= 1).all().all() + + res2 = run_zscore(df, net, verbose=True, use_raw=False, min_n=0, flavor='KSEA') + assert res2[0].loc['S1', 'T2'] > 0 + assert res2[0].loc['S2', 'T2'] < 0 + assert res2[0].loc['S3', 'T2'] < 0 + assert res2[0].loc['S4', 'T2'] > 0 + assert res2[0].loc['S5', 'T2'] > 0 + assert res2[1].map(lambda x: 0 <= x <= 1).all().all() + + adata = AnnData(df.astype(np.float32)) + run_zscore(adata, net, verbose=True, use_raw=False, min_n=0) \ No newline at end of file