Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement z-score method #145

Merged
merged 5 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions decoupler/method_zscore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""
Method zscore.
Code to run the z-score (RoKAI, KSEA) method.
"""

import numpy as np
import pandas as pd

from scipy.stats import norm

from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data


def zscore(m, net, flavor='RoKAI', verbose=False):
stds = np.std(m, axis=1, ddof=1)
if flavor != 'RoKAI':
mean_all = np.mean(m, axis=1)
else:
mean_all = np.zeros(stds.shape)
n = np.sqrt(np.count_nonzero(net, axis=0))
mean = m.dot(net) / np.sum(np.abs(net), axis=0)
es = ((mean - mean_all.reshape(-1, 1)) * n) / stds.reshape(-1, 1)
pv = norm.cdf(-np.abs(es))
return es, pv


def run_zscore(mat, net, source='source', target='target', weight='weight', batch_size=10000, flavor='RoKAI',
min_n=5, verbose=False, use_raw=True):
"""
z-score.

Calculates regulatory activities using a z-score as descibed in KSEA or RoKAI. The z-score calculates the mean of the molecular features of the
known targets for each regulator and adjusts it for the number of identified targets for the regulator, the standard deviation of all molecular
features (RoKAI), as well as the mean of all moleculare features (KSEA).

Parameters
----------
mat : list, DataFrame or AnnData
List of [features, matrix], dataframe (samples x features) or an AnnData instance.
net : DataFrame
Network in long format.
source : str
Column name in net with source nodes.
target : str
Column name in net with target nodes.
weight : str
Column name in net with weights.
batch_size : int
Size of the samples to use for each batch. Increasing this will consume more memmory but it will run faster.
min_n : int
Minimum of targets per source. If less, sources are removed.
verbose : bool
Whether to show progress.
use_raw : bool
Use raw attribute of mat if present.

Returns
-------
estimate : DataFrame
Z-scores. Stored in `.obsm['zscore_estimate']` if `mat` is AnnData.
pvals : DataFrame
Obtained p-values. Stored in `.obsm['zscore_pvals']` if `mat` is AnnData.
"""

# Extract sparse matrix and array of genes
m, r, c = extract(mat, use_raw=use_raw, verbose=verbose)

# Transform net
net = rename_net(net, source=source, target=target, weight=weight)
net = filt_min_n(c, net, min_n=min_n)
sources, targets, net = get_net_mat(net)

# Match arrays
net = match(c, targets, net)

if verbose:
print('Running zscore on mat with {0} samples and {1} targets for {2} sources.'.format(m.shape[0], len(c), net.shape[1]))

# Run ULM
estimate, pvals = zscore(m, net, flavor=flavor)

# Transform to df
estimate = pd.DataFrame(estimate, index=r, columns=sources)
estimate.name = 'zscore_estimate'
pvals = pd.DataFrame(pvals, index=r, columns=sources)
pvals.name = 'zscore_pvals'

return return_data(mat=mat, results=(estimate, pvals))
49 changes: 49 additions & 0 deletions decoupler/tests/test_zscore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import numpy as np
import pandas as pd
from anndata import AnnData
from ..method_zscore import zscore, run_zscore

def test_zscore():
m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.], [-8., -7., 1., 1.]], dtype=np.float32)
net = np.array([[1., 0.], [1, 0.], [0., -1.], [0., -1.]], dtype=np.float32)
act, pvl = zscore(m, net)
assert act[0, 0] < 0
assert act[1, 0] < 0
assert act[2, 0] > 0
assert act[3, 0] > 0
assert act[4, 0] < 0
assert np.all((0. <= pvl) * (pvl <= 1.))

act2, pvl2 = zscore(m, net, flavor='KSEA')
assert act2[0, 0] < 0
assert act2[1, 0] < 0
assert act2[2, 0] < 0
assert act2[3, 0] < 0
assert act2[4, 0] < 0
assert np.all((0. <= pvl2) * (pvl2 <= 1.))

def test_run_zscore():
m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., -6., -8.], [-8., -7., 1., 1.]])
r = np.array(['S1', 'S2', 'S3', 'S4', 'S5'])
c = np.array(['G1', 'G2', 'G3', 'G4'])
df = pd.DataFrame(m, index=r, columns=c)
net = pd.DataFrame([['T1', 'G1', 1], ['T1', 'G2', 1], ['T2', 'G3', -1], ['T2', 'G4', -1]],
columns=['source', 'target', 'weight'])
res = run_zscore(df, net, verbose=True, use_raw=False, min_n=0)
assert res[0].loc['S1', 'T2'] < 0
assert res[0].loc['S2', 'T2'] < 0
assert res[0].loc['S3', 'T2'] < 0
assert res[0].loc['S4', 'T2'] > 0
assert res[0].loc['S5', 'T2'] < 0
assert res[1].map(lambda x: 0 <= x <= 1).all().all()

res2 = run_zscore(df, net, verbose=True, use_raw=False, min_n=0, flavor='KSEA')
assert res2[0].loc['S1', 'T2'] > 0
assert res2[0].loc['S2', 'T2'] < 0
assert res2[0].loc['S3', 'T2'] < 0
assert res2[0].loc['S4', 'T2'] > 0
assert res2[0].loc['S5', 'T2'] > 0
assert res2[1].map(lambda x: 0 <= x <= 1).all().all()

adata = AnnData(df.astype(np.float32))
run_zscore(adata, net, verbose=True, use_raw=False, min_n=0)
Loading