Skip to content

Commit

Permalink
Merge pull request saezlab#58 from demian1/main
Browse files Browse the repository at this point in the history
association between embeddings and sample metadata using simple anova model
  • Loading branch information
PauBadiaM authored Jul 21, 2023
2 parents 80fb175 + 02cb3a3 commit 459b235
Show file tree
Hide file tree
Showing 4 changed files with 936 additions and 506 deletions.
2 changes: 2 additions & 0 deletions decoupler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .utils import dense_run, p_adjust_fdr, shuffle_net, read_gmt # noqa: F401
from .utils_anndata import get_acts, swap_layer, get_pseudobulk, get_contrast, rank_sources_groups # noqa: F401
from .utils_anndata import get_top_targets, format_contrast_results, filter_by_expr, filter_by_prop # noqa: F401
from .utils_anndata import get_metadata_associations # noqa: F401
from .method_wmean import run_wmean # noqa: F401
from .method_wsum import run_wsum # noqa: F401
from .method_ulm import run_ulm # noqa: F401
Expand All @@ -25,6 +26,7 @@
from .plotting import plot_metrics_scatter_cols, plot_psbulk_samples, plot_filter_by_expr, plot_filter_by_prop # noqa: F401
from .plotting import plot_volcano_df, plot_targets, plot_running_score # noqa: F401
from .plotting import plot_dotplot, plot_barplot_df # noqa: F401
from .plotting import plot_associations # noqa: F401
from .benchmark import benchmark, format_benchmark_inputs, get_performances # noqa: F401
from .utils_benchmark import get_toy_benchmark_data, show_metrics # noqa: F401
from .metrics import metric_auroc, metric_auprc, metric_mcauroc, metric_mcauprc # noqa: F401
137 changes: 137 additions & 0 deletions decoupler/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -1458,3 +1458,140 @@ def plot_dotplot(df, x, y, c, s, scale=5, cmap='viridis_r', title=None, figsize=

if return_fig:
return fig

def _check_assoc_plot_intputs(data, associations, cols, uns_key, obsm_key, use_X, layer, stat_col):
# check that the data is a panda frame or has an .uns attribute
assert hasattr(data, 'uns') and (hasattr(data, 'obsm') or hasattr(data, 'X') or hasattr(data, 'layers')), 'data sould be an AnnData/MuData object with an .uns, and .obsm, .X, or .layers attribute'
assert uns_key is not None, 'uns_key must be specified'
assert uns_key in data.uns.keys(), 'uns_key not found in data.uns'

stats = data.uns[uns_key]
assert stat_col in stats.columns, 'stat_col must be one of the columns in the anova results'

if associations is not None:
stats = stats[stats.variable.isin(associations)]
if cols is not None:
stats = stats[stats.factor.isin(cols)]

#specify at least but only one of obsm_key, use_X, or layer
assert obsm_key is not None or use_X is True or layer is not None, 'one of obsm_key, use_X, or layer must be specified'
if (obsm_key is not None and use_X) or (obsm_key is not None and layer is not None) or (use_X and layer is not None):
raise ValueError('only one of obsm_key, use_X, or layer should be specified')

# get activities for main clusterplot
if obsm_key is not None:
assert hasattr(data, 'obsm'), 'the data does not have an .obsm attribute'
assert obsm_key in data.obsm.keys(), 'obsm_key not found in data.obsm'
if isinstance(data.obsm[obsm_key], pd.DataFrame):
acts = data.obsm[obsm_key]
else:
column_name = obsm_key.replace('X_', '').replace('pca', 'PC').replace('mofa', 'Factor').replace('umap', 'UMAP')
columns = ['{0}{1}'.format(column_name, 1 + x) for x in range(data.obsm[obsm_key].shape[1])]
acts = pd.DataFrame(data.obsm[obsm_key],
index = data.obs.index,
columns=columns)
elif layer is not None:
assert hasattr(data, 'layers'), 'the data does not have a .layers attribute'
assert layer in data.layers.keys(), 'layer not found in data.layers'
acts = pd.DataFrame(data.layers[layer], index = data.obs.index, columns=data.var.index)
elif use_X:
assert hasattr(data, 'X'), 'the data does not have a .X attribute'
acts = pd.DataFrame(data.X, index = data.obs.index, columns=data.var.index)

return stats, acts



def plot_associations(data, uns_key, associations = None, cols = None, obs_annotation_cols = None, obsm_key=None, use_X = False, layer= None, stat_col = 'p_adj', titles = ['Scores', 'Stats'], scores_kwargs = {}, stats_kwargs = {}):
"""
Create a composite plot displaying association results between scores (bottom) and summary statistics (top) using a ClusterMap.
Requires PyComplexHeatmap to be installed.
Parameters:
------------
data : AnnData or MuData
The input data containing the association results in .uns[uns_key] and the underlying data in .obsm[obsm_key], .X or .layers[layer].
uns_key : str, optional
Key in `data.uns` where the association statistics are stored.
associations : list, optional
List of association names to be plotted. If None, all associations will be plotted (default is None).
cols : list, optional
List of columns to be plotted (i.e. columns of .obsm/rows of .var). If None, all columns will be plotted (default is None).
obsm_key : str, optional
Key of `data.obsm` used to plot the bottom clustermap. Either `obsm_key`, `use_X`, or `layer` must be specified.
use_X : bool, optional
Boolean indicating whether to use the data in `.X` for the bottom clustermap. Either `obsm_key`, `use_X`, or `layer` must be specified.
layer : str, optional
Key of `data.layers` used to plot the bottom clustermap. Either `obsm_key`, `use_X`, or `layer` must be specified.
stat_col : str, optional
Name of the summary statistic column in `data.uns[uns_key]` to be shown in the top clustermap (default is 'p_adj').
titles : list, optional
A list of two strings representing the titles for the ClusterMap for scores and statistics, respectively (default is ['Scores', 'Stats']).
scores_kwargs : dict, optional
A dictionary of additional keyword arguments for customizing the ClusterMap for scores. See PyComplexHeatmap.ClusterMapPlotter for available options.
stats_kwargs : dict, optional
A dictionary of additional keyword arguments for customizing the ClusterMap for statistics. See PyComplexHeatmap.ClusterMapPlotter for available options.
Returns:
ax : matplotlib.Axes
The main axis of the composite plot.
legend_axes : list
A list of matplotlib.Axes containing the legend(s) associated with the ClusterMap(s).
"""

try:
import PyComplexHeatmap as pch
except ImportError:
raise ImportError('PyComplexHeatmap is not installed. Please install it using "pip install PyComplexHeatmap"')

stats, acts = _check_assoc_plot_intputs(data, associations, cols, uns_key, obsm_key, use_X, layer, stat_col)

#subset acts columns with those in stats['factor']
acts = acts[stats.factor.unique()]

# go from long to wide format using only the selected summary statistic
stats = stats.pivot(index='factor', columns='variable', values=stat_col).T
stats.index.name = None
stats.columns.name = None
if stat_col in ['pval', 'p_adj']: #do log transform for pvalues
stats = -np.log10(stats)

#defining defaults for clustermaps
score_defaults = {'col_cluster': False, 'row_cluster': True, 'label': '{0}scores'.format('' if obsm_key is None else obsm_key.replace('X_', '') + ' '),
'row_dendrogram': True, 'col_dendrogram' : False, 'show_rownames': False, 'show_colnames': True,
'verbose': 0, 'legend_gap': 5, 'cmap' : 'RdBu_r', 'center' : 0}
score_defaults.update(scores_kwargs)

stats_label = '-log10({0})'.format(stat_col) if stat_col in ['pval', 'p_adj'] else stat_col
stats_defaults = {'col_cluster': False, 'row_cluster' : True, 'label' : stats_label,
'row_dendrogram' : True, 'col_dendrogram' : False, 'show_rownames' : True, 'show_colnames' : False,
'verbose': 0, 'legend_gap': 5, 'cmap' : 'Reds' if stat_col in ['pval', 'p_adj'] else 'Greens'}
stats_defaults.update(stats_kwargs)

if obs_annotation_cols is not None:
simples = {}
for col in obs_annotation_cols:
if data.obs[col].dtype == 'object' or data.obs[col].dtype == 'category':
annot = data.obs[col].astype('object')
simples[col]=pch.annotations.anno_simple(annot,label= True)
else:
raise ValueError('Column {0} is not of object or category dtype. These are the only formats supported for now. If you want more complex types annotations, please build your own HeatMapAnnotation object from PyComplexHeatmap and pass it to `score_defaults` under right_annotation'.format(col))
score_defaults['right_annotation'] = pch.HeatmapAnnotation(**simples, wgap = 2,
legend_width=10,
label_side= 'bottom',
legend = False,
axis = 0)

#making both clustermaps
cm_scores = pch.ClusterMapPlotter(data=acts, plot=False, **score_defaults)
cm_stats = pch.ClusterMapPlotter(data=stats, plot=False, **stats_defaults)

#combine clustermaps vertically
ax, legend_axes = pch.composite(cmlist = [cm_stats, cm_scores], main = 1, axis = 0, height_ratios=[1,3], row_gap=7, legend_gap=15)

#add titles
if titles is not None:
cm_scores.ax.set_title(titles[0])
cm_stats.ax.set_title(titles[1])

return ax, legend_axes
176 changes: 176 additions & 0 deletions decoupler/utils_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import sys

from anndata import AnnData
from tqdm import tqdm

from .utils import melt, p_adjust_fdr
from .pre import rename_net
Expand Down Expand Up @@ -897,3 +898,178 @@ def rank_sources_groups(adata, groupby, reference='rest', method='t-test_overest
results = pd.concat(results)

return results.reset_index(drop=True)

def _check_anova_inputs(data, obs_keys = None, obsm_key=None, use_X = False, layer = None):

# check that data is not None
assert data is not None, 'data cannot be None'

# check whether data is a list, of size 2
if isinstance(data, list):
assert len(data) == 2, 'data must be a list of size 2'
#both elements in data must be pd.Dataframe instances
assert isinstance(data[0], pd.DataFrame), 'data[0] must be a pd.DataFrame instance'
assert isinstance(data[1], pd.DataFrame), 'data[1] must be a pd.DataFrame instance'
#both data[0] and data[1] must have the same index
assert data[0].index.equals(data[1].index), 'data[0] and data[1] must have the same index'

dependent_variables = data[0].columns
explanatory_variables = data[1].columns
scores = data[0]
obs = data[1]

elif hasattr(data, 'obs'):
assert isinstance(data.obs, pd.DataFrame), 'data.obs must be a pd.DataFrame instance'
obs = data.obs
if obsm_key is None and not use_X and layer is None:
raise ValueError('When providing an AnnData or MuData object, either obsm_key, use_X or layer must be specified')
elif (obsm_key is not None and use_X) or (obsm_key is not None and layer is not None) or (use_X and layer is not None):
raise ValueError('When providing an AnnData or MuData object, only one of obsm_key, use_X or layer must be specified')
elif obsm_key is not None:
assert hasattr(data, 'obsm'), 'data must have an .obsm attribute'
assert obsm_key in data.obsm.keys(), 'obsm_key must be a key in data.obsm'
column_name = obsm_key.replace('X_', '').replace('pca', 'PC').replace('mofa', 'Factor').replace('umap', 'UMAP')
scores = pd.DataFrame(data.obsm[obsm_key], index=data.obs.index, columns=['{0}{1}'.format(column_name, 1 + x) for x in range(data.obsm[obsm_key].shape[1])])
elif use_X:
assert hasattr(data, 'X'), 'data must have a .X attribute'
scores = pd.DataFrame(data.X, index=data.obs.index, columns=data.var.index)
elif layer is not None:
assert hasattr(data, 'layers'), 'data must have a .layers attribute'
assert layer in data.layers.keys(), 'layer must be a key in data.layers'
scores = pd.DataFrame(data.layers[layer], index=data.obs.index, columns=data.var.index)

assert scores.index.equals(obs.index), 'scores and obs must have the same index'
dependent_variables = scores.columns
explanatory_variables = obs.columns
else:
raise TypeError('data must be a list of size 2 or an AnnData or MuData object with .obs dataframe')

# select only the columns in obs_keys
if obs_keys is None:
obs_keys = explanatory_variables
assert all([var in explanatory_variables for var in obs_keys]), 'obs_keys must be a subset of the obs columns'
obs = obs.filter(obs_keys, axis=1)
explanatory_variables = obs_keys

#check that there are no . in the dependent and explanatory variables
assert all(['.' not in var for var in dependent_variables]), 'column names of dependent variables cannot contain .'
assert all(['.' not in var for var in explanatory_variables]), 'column names of explanatory variables (obs) cannot contain .'
# check there is no overlap of column names between dependent and explanatory variables
assert len([value for value in dependent_variables if value in explanatory_variables]) == 0, 'dependent and explanatory variables cannot have the same column names'

# merge scores and obs
scores = pd.merge(scores, obs, left_index=True, right_index=True)

return scores, list(dependent_variables), list(explanatory_variables)








def get_metadata_associations(data, obs_keys = None, obsm_key=None, use_X = False, layer = None, uns_key = None, inplace = False, alpha = 0.05, method = 'fdr_bh'):

"""
Associate the data to sample metadata using ANOVA.
Requires statsmodels to be installed
Parameters
----------
data : list, AnnData or MuData
The input data for ANOVA testing. It can be either a list of two pandas DataFrames [data, obs], an AnnData or MuData object.
obs_keys: list, optional
Column names of obs (sample metadata) which should be tested. If not provided, all columns in obs will be used.
obsm_key : str, optional
A key specifying where in obsm the data is located when providing an AnnData/MuData object. Either `obsm_key`, `use_X`, or `layer` must be specified.
use_X : bool, optional
A boolean flag indicating whether to use the data in `.X` from the AnnData/MuData object when providing the `data`. Either `obsm_key`, `use_X`, or `layer` must be specified.
layer : str, optional
Which layer to use when providing an AnnData/MuData object. Either `obsm_key`, `use_X`, or `layer` must be specified.
uns_key : str, optional
Where results will be stored the AnnData/MuData object.
inplace : bool, optional
Whether to store the results in the AnnData or MuData object. If `False`, the function returns a pandas DataFrame with the results.
alpha : float, optional
The significance level for multiple testing correction (default is 0.05).
method : (str, optional):
The method used for multiple testing correction. It can be one of the methods supported by `statsmodels.stats.multitest.multipletests` (default is `'fdr_bh'`, i.e., Benjamini-Hochberg method).
Returns:
results: pd.DataFrame
DataFrame with ANOVA results. If `data` is an AnnData or MuData object and `inplace` is `True`, the results are stored in `data.uns[uns_key]`.
"""

try:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multitest import multipletests
except Exception:
raise ImportError(
'statsmodels is not installed. Please install it.'
)

scores, dependent_variables, explanatory_variables = _check_anova_inputs(data, obs_keys = obs_keys, obsm_key=obsm_key, use_X = use_X, layer = layer)

# for each dependent variable, test the association with the other columns in scores using an ANOVA, collect p-values and eta_sq in a dataframe
for dependent in tqdm(dependent_variables):
stats = []
for explainer in explanatory_variables:
# check the data type of the column
if scores[explainer].dtype == 'object' or scores[explainer].dtype == 'category':
# if it is a string, the explanatory variable is a categorical variable
var_name = 'C({0})'.format(explainer)
# if it is a float or integer, the explanatory variable is a continuous variable
elif scores[explainer].dtype == 'float' or scores[explainer].dtype == 'int':
var_name = '{0}'.format(explainer)
# else raise an error
else:
raise ValueError('Column {0} has an unknown data type. Make sure it is either categorical/object or float/int'.format(explainer))

# create the formula
formula = '{0} ~ {1}'.format(dependent, var_name)

# fit the model
mod = ols(formula , data = scores.filter([dependent, explainer], axis=1)).fit()
# get the ANOVA table
try:
aov_table = sm.stats.anova_lm(mod, typ=2)
except ValueError:
print('WARNING: could not compute ANOVA for {0} and {1}'.format(dependent, explainer))
row = [explainer, np.nan, np.nan]
else:
# compute eta squared from anova table
eta_sq = aov_table.loc[var_name, 'sum_sq']/aov_table.sum(axis=0)['sum_sq'] # eta squared = SS from explanatory variable / SStotal (i.e. sum of all SS from variables plus residual SS)
row = [explainer, aov_table['PR(>F)'][0], eta_sq]
stats.append(row)

stats = pd.DataFrame(stats, columns=['variable', 'pval', 'eta_sq']).set_index('variable')
stats['factor'] = dependent
if dependent_variables.index(dependent) == 0:
stats_df = stats.copy()
else:
stats_df = pd.concat([stats_df, stats], axis=0, join='outer', sort=False)

stats_df = stats_df.reset_index()
stats_df['p_adj'] = np.stack(stats_df.groupby('factor').apply(lambda df: multipletests(df['pval'], alpha = alpha, method = method, returnsorted=False)[1]).values).flatten()

if isinstance(data, list):
return stats_df
elif inplace:
# which of obsm_key, use_X, or layer is not None?
if obsm_key is not None:
key = obsm_key.replace('X_', '') + '_anova' if uns_key is None else uns_key
elif use_X:
key = 'X_anova' if uns_key is None else uns_key
elif layer is not None:
key = layer + '_anova' if uns_key is None else uns_key
if not hasattr(data, 'uns'):
data.uns = {key: stats_df}
else:
data.uns[key] = stats_df
else:
return stats_df
Loading

0 comments on commit 459b235

Please sign in to comment.