Merge pull request saezlab#58 from demian1/main

association between embeddings and sample metadata using simple anova model
robinfallegger · Jul 21, 2023 · 459b235 · 459b235
2 parents 80fb175 + 02cb3a3
commit 459b235
Show file tree

Hide file tree

Showing 4 changed files with 936 additions and 506 deletions.
diff --git a/decoupler/__init__.py b/decoupler/__init__.py
@@ -6,6 +6,7 @@
 from .utils import dense_run, p_adjust_fdr, shuffle_net, read_gmt  # noqa: F401
 from .utils_anndata import get_acts, swap_layer, get_pseudobulk, get_contrast, rank_sources_groups  # noqa: F401
 from .utils_anndata import get_top_targets, format_contrast_results, filter_by_expr, filter_by_prop  # noqa: F401
+from .utils_anndata import get_metadata_associations # noqa: F401
 from .method_wmean import run_wmean  # noqa: F401
 from .method_wsum import run_wsum  # noqa: F401
 from .method_ulm import run_ulm  # noqa: F401
@@ -25,6 +26,7 @@
 from .plotting import plot_metrics_scatter_cols, plot_psbulk_samples, plot_filter_by_expr, plot_filter_by_prop  # noqa: F401
 from .plotting import plot_volcano_df, plot_targets, plot_running_score  # noqa: F401
 from .plotting import plot_dotplot, plot_barplot_df  # noqa: F401
+from .plotting import plot_associations # noqa: F401
 from .benchmark import benchmark, format_benchmark_inputs, get_performances  # noqa: F401
 from .utils_benchmark import get_toy_benchmark_data, show_metrics  # noqa: F401
 from .metrics import metric_auroc, metric_auprc, metric_mcauroc, metric_mcauprc  # noqa: F401
diff --git a/decoupler/plotting.py b/decoupler/plotting.py
@@ -1458,3 +1458,140 @@ def plot_dotplot(df, x, y, c, s, scale=5, cmap='viridis_r', title=None, figsize=
 
     if return_fig:
         return fig
+
+def _check_assoc_plot_intputs(data, associations, cols, uns_key, obsm_key, use_X, layer, stat_col):
+    # check that the data is a panda frame or has an .uns attribute
+    assert hasattr(data, 'uns') and (hasattr(data, 'obsm') or hasattr(data, 'X') or hasattr(data, 'layers')), 'data sould be an AnnData/MuData object with an .uns, and .obsm, .X, or .layers attribute'
+    assert uns_key is not None, 'uns_key must be specified'
+    assert uns_key in data.uns.keys(), 'uns_key not found in data.uns'
+
+    stats = data.uns[uns_key]
+    assert stat_col in stats.columns, 'stat_col must be one of the columns in the anova results'
+
+    if associations is not None:
+        stats = stats[stats.variable.isin(associations)]
+    if cols is not None:
+        stats = stats[stats.factor.isin(cols)]
+
+    #specify at least but only one of obsm_key, use_X, or layer
+    assert obsm_key is not None or use_X is True or layer is not None, 'one of obsm_key, use_X, or layer must be specified'
+    if (obsm_key is not None and use_X) or (obsm_key is not None and layer is not None) or (use_X and layer is not None):
+        raise ValueError('only one of obsm_key, use_X, or layer should be specified')
+
+    # get activities for main clusterplot
+    if obsm_key is not None:
+        assert hasattr(data, 'obsm'), 'the data does not have an .obsm attribute'
+        assert obsm_key in data.obsm.keys(), 'obsm_key not found in data.obsm'
+        if isinstance(data.obsm[obsm_key], pd.DataFrame):
+            acts = data.obsm[obsm_key]
+        else:
+            column_name = obsm_key.replace('X_', '').replace('pca', 'PC').replace('mofa', 'Factor').replace('umap', 'UMAP')
+            columns = ['{0}{1}'.format(column_name, 1 + x) for x in range(data.obsm[obsm_key].shape[1])]
+            acts = pd.DataFrame(data.obsm[obsm_key],
+                                index = data.obs.index,
+                                columns=columns)
+    elif layer is not None:
+        assert hasattr(data, 'layers'), 'the data does not have a .layers attribute'
+        assert layer in data.layers.keys(), 'layer not found in data.layers'
+        acts = pd.DataFrame(data.layers[layer], index = data.obs.index, columns=data.var.index)
+    elif use_X:
+        assert hasattr(data, 'X'), 'the data does not have a .X attribute'
+        acts = pd.DataFrame(data.X, index = data.obs.index, columns=data.var.index)
+
+    return stats, acts
+
+
+
+def plot_associations(data, uns_key, associations = None, cols = None, obs_annotation_cols = None, obsm_key=None, use_X = False, layer= None, stat_col = 'p_adj', titles = ['Scores', 'Stats'], scores_kwargs = {}, stats_kwargs = {}):
+    """
+    Create a composite plot displaying association results between scores (bottom) and summary statistics (top) using a ClusterMap.
+    Requires PyComplexHeatmap to be installed.
+    
+    Parameters:
+    ------------
+        data : AnnData or MuData
+            The input data containing the association results in .uns[uns_key] and the underlying data in .obsm[obsm_key], .X or .layers[layer].
+        uns_key : str, optional
+            Key in `data.uns` where the association statistics are stored.
+        associations : list, optional
+            List of association names to be plotted. If None, all associations will be plotted (default is None).
+        cols : list, optional
+            List of columns to be plotted (i.e. columns of .obsm/rows of .var). If None, all columns will be plotted (default is None).
+        obsm_key : str, optional
+            Key of `data.obsm` used to plot the bottom clustermap. Either `obsm_key`, `use_X`, or `layer` must be specified.
+        use_X : bool, optional
+            Boolean indicating whether to use the data in `.X` for the bottom clustermap. Either `obsm_key`, `use_X`, or `layer` must be specified.
+        layer : str, optional
+            Key of `data.layers` used to plot the bottom clustermap. Either `obsm_key`, `use_X`, or `layer` must be specified.
+        stat_col : str, optional
+            Name of the summary statistic column in `data.uns[uns_key]` to be shown in the top clustermap (default is 'p_adj').
+        titles : list, optional
+            A list of two strings representing the titles for the ClusterMap for scores and statistics, respectively (default is ['Scores', 'Stats']).
+        scores_kwargs : dict, optional
+            A dictionary of additional keyword arguments for customizing the ClusterMap for scores. See PyComplexHeatmap.ClusterMapPlotter for available options.
+        stats_kwargs : dict, optional
+            A dictionary of additional keyword arguments for customizing the ClusterMap for statistics. See PyComplexHeatmap.ClusterMapPlotter for available options.
+
+    Returns:
+        ax : matplotlib.Axes
+            The main axis of the composite plot.
+        legend_axes : list
+            A list of matplotlib.Axes containing the legend(s) associated with the ClusterMap(s).
+    """
+
+    try:
+        import PyComplexHeatmap as pch
+    except ImportError:
+        raise ImportError('PyComplexHeatmap is not installed. Please install it using "pip install PyComplexHeatmap"')
+
+    stats, acts = _check_assoc_plot_intputs(data, associations, cols, uns_key, obsm_key, use_X, layer, stat_col)
+
+    #subset acts columns with those in stats['factor']
+    acts = acts[stats.factor.unique()]
+
+    # go from long to wide format using only the selected summary statistic
+    stats = stats.pivot(index='factor', columns='variable', values=stat_col).T
+    stats.index.name = None
+    stats.columns.name = None
+    if stat_col in ['pval', 'p_adj']: #do log transform for pvalues
+        stats = -np.log10(stats)
+
+    #defining defaults for clustermaps
+    score_defaults = {'col_cluster': False, 'row_cluster': True, 'label': '{0}scores'.format('' if obsm_key is None else obsm_key.replace('X_', '') + ' '),
+                      'row_dendrogram': True, 'col_dendrogram' :  False, 'show_rownames': False, 'show_colnames': True, 
+                      'verbose': 0, 'legend_gap': 5, 'cmap' :  'RdBu_r', 'center' :  0}
+    score_defaults.update(scores_kwargs)
+
+    stats_label = '-log10({0})'.format(stat_col) if stat_col in ['pval', 'p_adj'] else stat_col
+    stats_defaults = {'col_cluster': False, 'row_cluster' :  True, 'label' :  stats_label,
+                      'row_dendrogram' :  True, 'col_dendrogram' :  False, 'show_rownames' :  True, 'show_colnames' :  False,
+                      'verbose': 0, 'legend_gap': 5, 'cmap' :  'Reds' if stat_col in ['pval', 'p_adj'] else 'Greens'}
+    stats_defaults.update(stats_kwargs)
+
+    if obs_annotation_cols is not None:
+        simples = {}
+        for col in obs_annotation_cols:
+            if data.obs[col].dtype == 'object' or data.obs[col].dtype == 'category':
+                annot = data.obs[col].astype('object')
+                simples[col]=pch.annotations.anno_simple(annot,label= True)
+            else:
+                raise ValueError('Column {0} is not of object or category dtype. These are the only formats supported for now. If you want more complex types annotations, please build your own HeatMapAnnotation object from PyComplexHeatmap and pass it to `score_defaults` under right_annotation'.format(col))
+        score_defaults['right_annotation'] = pch.HeatmapAnnotation(**simples, wgap = 2,
+                            legend_width=10,
+                            label_side= 'bottom',
+                            legend = False,
+                            axis = 0)
+
+    #making both clustermaps
+    cm_scores = pch.ClusterMapPlotter(data=acts, plot=False, **score_defaults)
+    cm_stats = pch.ClusterMapPlotter(data=stats, plot=False, **stats_defaults)
+
+    #combine clustermaps vertically
+    ax, legend_axes = pch.composite(cmlist = [cm_stats, cm_scores], main = 1, axis = 0, height_ratios=[1,3], row_gap=7, legend_gap=15)
+
+    #add titles
+    if titles is not None:
+        cm_scores.ax.set_title(titles[0])
+        cm_stats.ax.set_title(titles[1])
+
+    return ax, legend_axes
diff --git a/decoupler/utils_anndata.py b/decoupler/utils_anndata.py
@@ -9,6 +9,7 @@
 import sys
 
 from anndata import AnnData
+from tqdm import tqdm
 
 from .utils import melt, p_adjust_fdr
 from .pre import rename_net
@@ -897,3 +898,178 @@ def rank_sources_groups(adata, groupby, reference='rest', method='t-test_overest
     results = pd.concat(results)
 
     return results.reset_index(drop=True)
+
+def _check_anova_inputs(data, obs_keys = None, obsm_key=None, use_X = False, layer = None):
+
+# check that data is not None
+    assert data is not None, 'data cannot be None'
+
+    # check whether data is a list, of size 2
+    if isinstance(data, list):
+        assert len(data) == 2, 'data must be a list of size 2'
+        #both elements in data must be pd.Dataframe instances
+        assert isinstance(data[0], pd.DataFrame), 'data[0] must be a pd.DataFrame instance'
+        assert isinstance(data[1], pd.DataFrame), 'data[1] must be a pd.DataFrame instance'
+        #both data[0] and data[1] must have the same index
+        assert data[0].index.equals(data[1].index), 'data[0] and data[1] must have the same index'
+
+        dependent_variables = data[0].columns
+        explanatory_variables = data[1].columns
+        scores = data[0]
+        obs = data[1]
+
+    elif hasattr(data, 'obs'):
+        assert isinstance(data.obs, pd.DataFrame), 'data.obs must be a pd.DataFrame instance'
+        obs = data.obs
+        if obsm_key is None and not use_X and layer is None:
+            raise ValueError('When providing an AnnData or MuData object, either obsm_key, use_X or layer must be specified')
+        elif (obsm_key is not None and use_X) or (obsm_key is not None and layer is not None) or (use_X and layer is not None):
+            raise ValueError('When providing an AnnData or MuData object, only one of obsm_key, use_X or layer must be specified')
+        elif obsm_key is not None:
+            assert hasattr(data, 'obsm'), 'data must have an .obsm attribute'
+            assert obsm_key in data.obsm.keys(), 'obsm_key must be a key in data.obsm'
+            column_name = obsm_key.replace('X_', '').replace('pca', 'PC').replace('mofa', 'Factor').replace('umap', 'UMAP')
+            scores = pd.DataFrame(data.obsm[obsm_key], index=data.obs.index, columns=['{0}{1}'.format(column_name, 1 + x) for x in range(data.obsm[obsm_key].shape[1])])
+        elif use_X:
+            assert hasattr(data, 'X'), 'data must have a .X attribute'
+            scores = pd.DataFrame(data.X, index=data.obs.index, columns=data.var.index)
+        elif layer is not None:
+            assert hasattr(data, 'layers'), 'data must have a .layers attribute'
+            assert layer in data.layers.keys(), 'layer must be a key in data.layers'
+            scores = pd.DataFrame(data.layers[layer], index=data.obs.index, columns=data.var.index)
+
+        assert scores.index.equals(obs.index), 'scores and obs must have the same index'
+        dependent_variables = scores.columns
+        explanatory_variables = obs.columns
+    else:
+        raise TypeError('data must be a list of size 2 or an AnnData or MuData object with .obs dataframe')
+
+    # select only the columns in obs_keys
+    if obs_keys is None:
+        obs_keys = explanatory_variables
+    assert all([var in explanatory_variables for var in obs_keys]), 'obs_keys must be a subset of the obs columns'
+    obs = obs.filter(obs_keys, axis=1)
+    explanatory_variables = obs_keys
+
+    #check that there are no . in the dependent and explanatory variables
+    assert all(['.' not in var for var in dependent_variables]), 'column names of dependent variables cannot contain .'
+    assert all(['.' not in var for var in explanatory_variables]), 'column names of explanatory variables (obs) cannot contain .'
+    # check there is no overlap of column names between dependent and explanatory variables
+    assert len([value for value in dependent_variables if value in explanatory_variables]) == 0, 'dependent and explanatory variables cannot have the same column names'
+
+    # merge scores and obs
+    scores = pd.merge(scores, obs, left_index=True, right_index=True)
+
+    return scores, list(dependent_variables), list(explanatory_variables)
+
+
+
+
+
+
+
+
+def get_metadata_associations(data, obs_keys = None, obsm_key=None, use_X = False, layer = None,  uns_key = None, inplace = False, alpha = 0.05, method = 'fdr_bh'):
+
+    """
+    Associate the data to sample metadata using ANOVA.
+    
+    Requires statsmodels to be installed
+
+    Parameters
+    ----------
+    
+        data : list, AnnData or MuData
+            The input data for ANOVA testing. It can be either a list of two pandas DataFrames [data, obs], an AnnData or MuData object.
+        obs_keys: list, optional
+            Column names of obs (sample metadata) which should be tested. If not provided, all columns in obs will be used.
+        obsm_key : str, optional
+            A key specifying where in obsm the data is located when providing an AnnData/MuData object. Either `obsm_key`, `use_X`, or `layer` must be specified.
+        use_X : bool, optional
+            A boolean flag indicating whether to use the data in `.X` from the AnnData/MuData object when providing the `data`. Either `obsm_key`, `use_X`, or `layer` must be specified.
+        layer : str, optional
+            Which layer to use when providing an AnnData/MuData object. Either `obsm_key`, `use_X`, or `layer` must be specified.
+        uns_key : str, optional 
+            Where results will be stored the AnnData/MuData object.
+        inplace : bool, optional
+            Whether to store the results in the AnnData or MuData object. If `False`, the function returns a pandas DataFrame with the results.
+        alpha : float, optional
+            The significance level for multiple testing correction (default is 0.05).
+        method : (str, optional): 
+            The method used for multiple testing correction. It can be one of the methods supported by `statsmodels.stats.multitest.multipletests` (default is `'fdr_bh'`, i.e., Benjamini-Hochberg method).
+
+    Returns:
+        results: pd.DataFrame
+            DataFrame with ANOVA results. If `data` is an AnnData or MuData object and `inplace` is `True`, the results are stored in `data.uns[uns_key]`.
+
+    """
+
+    try:
+        import statsmodels.api as sm
+        from statsmodels.formula.api import ols
+        from statsmodels.stats.multitest import multipletests
+    except Exception:
+        raise ImportError(
+            'statsmodels is not installed. Please install it.'
+        )
+
+    scores, dependent_variables, explanatory_variables = _check_anova_inputs(data, obs_keys = obs_keys, obsm_key=obsm_key, use_X = use_X, layer = layer)
+
+    # for each dependent variable, test the association with the other columns in scores using an ANOVA, collect p-values and eta_sq in a dataframe
+    for dependent in tqdm(dependent_variables):
+        stats = []
+        for explainer in explanatory_variables:
+            # check the data type of the column
+            if scores[explainer].dtype == 'object' or scores[explainer].dtype == 'category':
+                # if it is a string, the explanatory variable is a categorical variable
+                var_name = 'C({0})'.format(explainer)
+            # if it is a float or integer, the explanatory variable is a continuous variable
+            elif scores[explainer].dtype == 'float' or scores[explainer].dtype == 'int':
+                var_name = '{0}'.format(explainer)
+            # else raise an error
+            else:
+                raise ValueError('Column {0} has an unknown data type. Make sure it is either categorical/object or float/int'.format(explainer))
+
+            # create the formula
+            formula = '{0} ~ {1}'.format(dependent, var_name)
+
+            # fit the model
+            mod = ols(formula , data = scores.filter([dependent, explainer], axis=1)).fit()
+            # get the ANOVA table
+            try:
+                aov_table = sm.stats.anova_lm(mod, typ=2)
+            except ValueError:
+                print('WARNING: could not compute ANOVA for {0} and {1}'.format(dependent, explainer))
+                row = [explainer, np.nan, np.nan]
+            else:
+                # compute eta squared from anova table
+                eta_sq = aov_table.loc[var_name, 'sum_sq']/aov_table.sum(axis=0)['sum_sq'] # eta squared = SS from explanatory variable / SStotal (i.e. sum of all SS from variables plus residual SS)
+                row = [explainer, aov_table['PR(>F)'][0], eta_sq]
+            stats.append(row)
+
+        stats = pd.DataFrame(stats, columns=['variable', 'pval', 'eta_sq']).set_index('variable')
+        stats['factor'] = dependent
+        if dependent_variables.index(dependent) == 0:
+            stats_df = stats.copy()
+        else:
+            stats_df = pd.concat([stats_df, stats], axis=0, join='outer', sort=False)
+
+    stats_df = stats_df.reset_index()
+    stats_df['p_adj'] = np.stack(stats_df.groupby('factor').apply(lambda df: multipletests(df['pval'], alpha = alpha, method = method, returnsorted=False)[1]).values).flatten()
+
+    if isinstance(data, list):
+        return stats_df
+    elif inplace:
+        # which of obsm_key, use_X, or layer is not None?
+        if obsm_key is not None:
+            key = obsm_key.replace('X_', '') + '_anova' if uns_key is None else uns_key
+        elif use_X:
+            key = 'X_anova' if uns_key is None else uns_key
+        elif layer is not None:
+            key = layer + '_anova' if uns_key is None else uns_key
+        if not hasattr(data, 'uns'):
+            data.uns = {key: stats_df}
+        else:
+            data.uns[key] = stats_df
+    else:
+        return stats_df