From 2cb86978d44d9727da72cbf94ea643fdc887328f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophia=20M=C3=BCller-Dott?= <sophia.md@hotmail.de>
Date: Fri, 19 Jul 2024 08:59:29 +0200
Subject: [PATCH 1/5] implement z-score method

---
 decoupler/method_zscore.py     | 108 +++++++++++++++++++++++++++++++++
 decoupler/tests/test_zscore.py |  31 ++++++++++
 2 files changed, 139 insertions(+)
 create mode 100644 decoupler/method_zscore.py
 create mode 100644 decoupler/tests/test_zscore.py

diff --git a/decoupler/method_zscore.py b/decoupler/method_zscore.py
new file mode 100644
index 0000000..e1940bc
--- /dev/null
+++ b/decoupler/method_zscore.py
@@ -0,0 +1,108 @@
+"""
+Method zscore.
+Code to run the z-score (RoKAI, KSEA) method.
+"""
+
+import numpy as np
+import pandas as pd
+from scipy.sparse import csr_matrix
+from scipy.sparse import isspmatrix_csr
+
+from scipy.stats import t
+from scipy.stats import norm
+
+from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data
+
+from tqdm import tqdm
+
+def zscore(m, net, flavor='RoKAI', verbose=False):
+
+    # Get dims
+    n_samples = m.shape[0]
+    n_features, n_fsets = net.shape
+    
+    es = np.zeros((n_samples, n_fsets))
+    pv = np.zeros((n_samples, n_fsets))
+                     
+    # Compute each element of Matrix3
+    for i in range(n_samples):
+        for f in range(n_fsets):
+            m_f = m[:, i]
+            if isspmatrix_csr(m_f):
+                m_f = m_f.toarray()
+                m_f = m_f.reshape(1, -1)
+            net_i = net[:, f]
+
+            mean_product = np.sum(m_f * net_i) / np.sum(abs(net_i))
+            mean_m_f = 0 if flavor == "RoKAI" else np.mean(m_f)
+            std_m_f = np.std(m_f)
+            count_non_zeros = np.count_nonzero(net_i)
+
+            es[i, f] = (mean_product - mean_m_f) * np.sqrt(count_non_zeros) / std_m_f
+            pv[i, f] = norm.cdf(-abs(es[i, f]))
+
+    return es, pv
+
+
+def run_zscore(mat, net, source='source', target='target', weight='weight', batch_size=10000, flavor='RoKAI',
+             min_n=5, verbose=False, use_raw=True):
+    """
+    z-score.
+
+    Calculates regulatory activities using a z-score as descibed in KSEA or RoKAI. The z-score calculates the mean of the molecular features of the 
+    known targets for each regulator and adjusts it for the number of identified targets for the regulator, the standard deviation of all molecular 
+    features (RoKAI), as well as the mean of all moleculare features (KSEA).
+
+    Parameters
+    ----------
+    mat : list, DataFrame
+        List of [features, matrix], dataframe (samples x features).
+    net : DataFrame
+        Network in long format.
+    source : str
+        Column name in net with source nodes.
+    target : str
+        Column name in net with target nodes.
+    weight : str
+        Column name in net with weights.
+    flavor : int
+        Whether to use the implementation of RoKAI (default) or KSEA.
+    min_n : int
+        Minimum of targets per source. If less, sources are removed.
+    verbose : bool
+        Whether to show progress.
+    use_raw : bool
+        Use raw attribute of mat if present.
+
+    Returns
+    -------
+    estimate : DataFrame
+        Z-scores.
+    pvals : DataFrame
+        Obtained p-values.
+    """
+
+    # Extract sparse matrix and array of genes
+    m, r, c = extract(mat, use_raw=use_raw, verbose=verbose)
+
+    # Transform net
+    net = rename_net(net, source=source, target=target, weight=weight)
+    net = filt_min_n(c, net, min_n=min_n)
+    sources, targets, net = get_net_mat(net)
+
+    # Match arrays
+    net = match(c, targets, net)
+
+    if verbose:
+        print('Running zscore on mat with {0} samples and {1} targets for {2} sources.'.format(m.shape[0], len(c), net.shape[1]))
+
+    # Run ULM
+    estimate, pvals = zscore(m, net, flavor=flavor)
+
+    # Transform to df
+    estimate = pd.DataFrame(estimate, index=r, columns=sources)
+    estimate.name = 'zscore_estimate'
+    pvals = pd.DataFrame(pvals, index=r, columns=sources)
+    pvals.name = 'zscore_pvals'
+
+    return return_data(mat=mat, results=(estimate, pvals))
\ No newline at end of file
diff --git a/decoupler/tests/test_zscore.py b/decoupler/tests/test_zscore.py
new file mode 100644
index 0000000..8ff491c
--- /dev/null
+++ b/decoupler/tests/test_zscore.py
@@ -0,0 +1,31 @@
+import numpy as np
+import pandas as pd
+from scipy.sparse import csr_matrix
+from scipy.stats import norm
+from anndata import AnnData
+from numpy.testing import assert_almost_equal
+from ..method_zscore import zscore, run_zscore
+
+def test_zscore():
+    m = csr_matrix(np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.]], dtype=np.float32))
+    net = np.array([[1., 0.], [1, 0.], [0., -1.], [0., -1.]], dtype=np.float32)
+    act, pvl = zscore(m, net)
+    assert act[0, 0] < 0
+    assert act[1, 0] < 0
+    assert act[2, 0] > 0
+    assert act[3, 0] > 0
+    assert np.all((0. <= pvl) * (pvl <= 1.))
+
+def test_run_zscore():
+    m = np.array([[7., 1., 1., 1.], [4., 2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.]])
+    r = np.array(['S1', 'S2', 'S3', 'S4'])
+    c = np.array(['G1', 'G2', 'G3', 'G4'])
+    df = pd.DataFrame(m, index=r, columns=c)
+    net = pd.DataFrame([['T1', 'G1', 1], ['T1', 'G2', 2], ['T2', 'G3', -3], ['T2', 'G4', 4]],
+                       columns=['source', 'target', 'weight'])
+    res = run_zscore(df, net, verbose=True, use_raw=False, min_n=0)
+    assert res[0].loc['S1', 'T2'] > 0
+    assert res[0].loc['S2', 'T2'] < 0
+    assert res[0].loc['S3', 'T2'] > 0
+    assert res[0].loc['S4', 'T2'] > 0
+    assert res[1].map(lambda x: 0 <= x <= 1).all().all()
\ No newline at end of file

From b1fb97cabd918efc1b7726f6c360226632618d6f Mon Sep 17 00:00:00 2001
From: Pau Badia i Mompel <44896790+PauBadiaM@users.noreply.github.com>
Date: Tue, 23 Jul 2024 16:09:54 +0200
Subject: [PATCH 2/5] zscore use matrix operations

---
 decoupler/method_zscore.py | 37 +++++++++++--------------------------
 1 file changed, 11 insertions(+), 26 deletions(-)

diff --git a/decoupler/method_zscore.py b/decoupler/method_zscore.py
index e1940bc..c48c6ce 100644
--- a/decoupler/method_zscore.py
+++ b/decoupler/method_zscore.py
@@ -15,32 +15,17 @@
 
 from tqdm import tqdm
 
-def zscore(m, net, flavor='RoKAI', verbose=False):
-
-    # Get dims
-    n_samples = m.shape[0]
-    n_features, n_fsets = net.shape
-    
-    es = np.zeros((n_samples, n_fsets))
-    pv = np.zeros((n_samples, n_fsets))
-                     
-    # Compute each element of Matrix3
-    for i in range(n_samples):
-        for f in range(n_fsets):
-            m_f = m[:, i]
-            if isspmatrix_csr(m_f):
-                m_f = m_f.toarray()
-                m_f = m_f.reshape(1, -1)
-            net_i = net[:, f]
-
-            mean_product = np.sum(m_f * net_i) / np.sum(abs(net_i))
-            mean_m_f = 0 if flavor == "RoKAI" else np.mean(m_f)
-            std_m_f = np.std(m_f)
-            count_non_zeros = np.count_nonzero(net_i)
-
-            es[i, f] = (mean_product - mean_m_f) * np.sqrt(count_non_zeros) / std_m_f
-            pv[i, f] = norm.cdf(-abs(es[i, f]))
 
+def zscore(m, net, flavor='RoKAI', verbose=False):
+    stds = np.std(m, axis=1, ddof=1)
+    if flavor != 'RoKAI':
+        mean_all = np.mean(m, axis=1)
+    else:
+        mean_all = np.zeros(stds.shape)
+    n = np.sqrt(np.count_nonzero(net, axis=0))
+    mean = m.dot(net) / np.sum(np.abs(net), axis=0)
+    es = ((mean - mean_all.reshape(-1, 1)) * n) / stds.reshape(-1, 1)
+    pv = norm.cdf(-np.abs(es))
     return es, pv
 
 
@@ -105,4 +90,4 @@ def run_zscore(mat, net, source='source', target='target', weight='weight', batc
     pvals = pd.DataFrame(pvals, index=r, columns=sources)
     pvals.name = 'zscore_pvals'
 
-    return return_data(mat=mat, results=(estimate, pvals))
\ No newline at end of file
+    return return_data(mat=mat, results=(estimate, pvals))

From 6d997c6ea36addce7c288374b67dacab5e363a65 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophia=20M=C3=BCller-Dott?= <sophia.md@hotmail.de>
Date: Wed, 24 Jul 2024 10:55:07 +0200
Subject: [PATCH 3/5] clean up packages and description

---
 decoupler/method_zscore.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/decoupler/method_zscore.py b/decoupler/method_zscore.py
index c48c6ce..927566b 100644
--- a/decoupler/method_zscore.py
+++ b/decoupler/method_zscore.py
@@ -5,16 +5,11 @@
 
 import numpy as np
 import pandas as pd
-from scipy.sparse import csr_matrix
-from scipy.sparse import isspmatrix_csr
 
-from scipy.stats import t
 from scipy.stats import norm
 
 from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data
 
-from tqdm import tqdm
-
 
 def zscore(m, net, flavor='RoKAI', verbose=False):
     stds = np.std(m, axis=1, ddof=1)
@@ -40,8 +35,8 @@ def run_zscore(mat, net, source='source', target='target', weight='weight', batc
 
     Parameters
     ----------
-    mat : list, DataFrame
-        List of [features, matrix], dataframe (samples x features).
+    mat : list, DataFrame or AnnData
+        List of [features, matrix], dataframe (samples x features) or an AnnData instance.
     net : DataFrame
         Network in long format.
     source : str
@@ -50,8 +45,8 @@ def run_zscore(mat, net, source='source', target='target', weight='weight', batc
         Column name in net with target nodes.
     weight : str
         Column name in net with weights.
-    flavor : int
-        Whether to use the implementation of RoKAI (default) or KSEA.
+    batch_size : int
+        Size of the samples to use for each batch. Increasing this will consume more memmory but it will run faster.
     min_n : int
         Minimum of targets per source. If less, sources are removed.
     verbose : bool
@@ -62,9 +57,9 @@ def run_zscore(mat, net, source='source', target='target', weight='weight', batc
     Returns
     -------
     estimate : DataFrame
-        Z-scores.
+        Z-scores. Stored in `.obsm['zscore_estimate']` if `mat` is AnnData.
     pvals : DataFrame
-        Obtained p-values.
+        Obtained p-values. Stored in `.obsm['zscore_pvals']` if `mat` is AnnData.
     """
 
     # Extract sparse matrix and array of genes

From 4fb77f2d5e04575b4855e898b55120595e5ff9f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophia=20M=C3=BCller-Dott?= <sophia.md@hotmail.de>
Date: Wed, 24 Jul 2024 10:55:31 +0200
Subject: [PATCH 4/5] refine z-score tests

---
 decoupler/tests/test_zscore.py | 36 +++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/decoupler/tests/test_zscore.py b/decoupler/tests/test_zscore.py
index 8ff491c..72f68ee 100644
--- a/decoupler/tests/test_zscore.py
+++ b/decoupler/tests/test_zscore.py
@@ -1,31 +1,49 @@
 import numpy as np
 import pandas as pd
-from scipy.sparse import csr_matrix
-from scipy.stats import norm
 from anndata import AnnData
-from numpy.testing import assert_almost_equal
 from ..method_zscore import zscore, run_zscore
 
 def test_zscore():
-    m = csr_matrix(np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.]], dtype=np.float32))
+    m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.], [-8., -7., 1., 1.]], dtype=np.float32)
     net = np.array([[1., 0.], [1, 0.], [0., -1.], [0., -1.]], dtype=np.float32)
     act, pvl = zscore(m, net)
     assert act[0, 0] < 0
     assert act[1, 0] < 0
     assert act[2, 0] > 0
     assert act[3, 0] > 0
+    assert act[4, 0] < 0
     assert np.all((0. <= pvl) * (pvl <= 1.))
+    
+    act2, pvl2 = zscore(m, net, flavor='KSEA')
+    assert act2[0, 0] < 0
+    assert act2[1, 0] < 0
+    assert act2[2, 0] < 0
+    assert act2[3, 0] < 0
+    assert act2[4, 0] < 0
+    assert np.all((0. <= pvl2) * (pvl2 <= 1.))
 
 def test_run_zscore():
-    m = np.array([[7., 1., 1., 1.], [4., 2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.]])
+    m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., -6., -8.], [-8., -7., 1., 1.]])
     r = np.array(['S1', 'S2', 'S3', 'S4'])
     c = np.array(['G1', 'G2', 'G3', 'G4'])
     df = pd.DataFrame(m, index=r, columns=c)
-    net = pd.DataFrame([['T1', 'G1', 1], ['T1', 'G2', 2], ['T2', 'G3', -3], ['T2', 'G4', 4]],
+    net = pd.DataFrame([['T1', 'G1', 1], ['T1', 'G2', 1], ['T2', 'G3', -1], ['T2', 'G4', -1]],
                        columns=['source', 'target', 'weight'])
     res = run_zscore(df, net, verbose=True, use_raw=False, min_n=0)
-    assert res[0].loc['S1', 'T2'] > 0
+    assert res[0].loc['S1', 'T2'] < 0
     assert res[0].loc['S2', 'T2'] < 0
-    assert res[0].loc['S3', 'T2'] > 0
+    assert res[0].loc['S3', 'T2'] < 0
     assert res[0].loc['S4', 'T2'] > 0
-    assert res[1].map(lambda x: 0 <= x <= 1).all().all()
\ No newline at end of file
+    assert res[0].loc['S5', 'T2'] < 0
+    assert res[1].map(lambda x: 0 <= x <= 1).all().all()
+    
+    res2 = run_zscore(df, net, verbose=True, use_raw=False, min_n=0, flavor='KSEA')
+    assert res2[0].loc['S1', 'T2'] > 0
+    assert res2[0].loc['S2', 'T2'] < 0
+    assert res2[0].loc['S3', 'T2'] < 0
+    assert res2[0].loc['S4', 'T2'] > 0
+    assert res2[0].loc['S5', 'T2'] > 0
+    assert res2[1].map(lambda x: 0 <= x <= 1).all().all()
+    
+    adata = AnnData(df.astype(np.float32))
+    run_zscore(adata, net, verbose=True, use_raw=False, min_n=0)
\ No newline at end of file

From 5955256a437e633aaf5ac73fcd427208766b039a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophia=20M=C3=BCller-Dott?= <sophia.md@hotmail.de>
Date: Wed, 24 Jul 2024 11:10:54 +0200
Subject: [PATCH 5/5] fixed column names

---
 decoupler/tests/test_zscore.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/decoupler/tests/test_zscore.py b/decoupler/tests/test_zscore.py
index 72f68ee..724e931 100644
--- a/decoupler/tests/test_zscore.py
+++ b/decoupler/tests/test_zscore.py
@@ -24,7 +24,7 @@ def test_zscore():
 
 def test_run_zscore():
     m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., -6., -8.], [-8., -7., 1., 1.]])
-    r = np.array(['S1', 'S2', 'S3', 'S4'])
+    r = np.array(['S1', 'S2', 'S3', 'S4', 'S5'])
     c = np.array(['G1', 'G2', 'G3', 'G4'])
     df = pd.DataFrame(m, index=r, columns=c)
     net = pd.DataFrame([['T1', 'G1', 1], ['T1', 'G2', 1], ['T2', 'G3', -1], ['T2', 'G4', -1]],