Skip to content

Commit

Permalink
Merge pull request #332 from MannLabs/decouple-dataset-from-plots
Browse files Browse the repository at this point in the history
Decouple dataset from plots
  • Loading branch information
mschwoer authored Sep 20, 2024
2 parents 198cbdc + 1dcfd53 commit 963a91f
Show file tree
Hide file tree
Showing 11 changed files with 591 additions and 454 deletions.
299 changes: 279 additions & 20 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,21 @@
import logging
import warnings
import plotly
import scipy

from alphastats import BaseLoader


from alphastats.DataSet_Plot import Plot
from alphastats.DataSet_Preprocess import Preprocess, PreprocessingStateKeys
from alphastats.DataSet_Preprocess import Preprocess
from alphastats.DataSet_Statistics import Statistics
from alphastats.utils import LoaderError
from alphastats.utils import LoaderError, check_for_missing_values, ignore_warning
from alphastats.statistics.tukey_test import tukey_test
from alphastats.plots.ClusterMap import ClusterMap
from alphastats.plots.DimensionalityReduction import DimensionalityReduction
from alphastats.plots.IntensityPlot import IntensityPlot
from alphastats.plots.SampleHistogram import SampleHistogram
from alphastats.plots.VolcanoPlot import VolcanoPlot

plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template(
layout=plotly.graph_objects.Layout(
Expand All @@ -36,8 +42,8 @@
plotly.io.templates.default = "simple_white+alphastats_colors"


class DataSet(Plot):
"""Analysis Object"""
class DataSet:
"""The main object of AlphaPeptStats, providing an interface to all relevant functionality and data."""

def __init__(
self,
Expand Down Expand Up @@ -160,11 +166,11 @@ def batch_correction(self, batch: str) -> None:
def _get_statistics(self) -> Statistics:
"""Return instance of the Statistics object."""
return Statistics(
self.mat,
self.metadata,
self.index_column,
self.sample,
self.preprocessing_info,
mat=self.mat,
metadata=self.metadata,
index_column=self.index_column,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
)

def diff_expression_analysis(
Expand Down Expand Up @@ -208,6 +214,270 @@ def ancova(
"""A wrapper for Statistics.ancova(), see documentation there."""
return self._get_statistics().ancova(protein_id, covar, between)

@check_for_missing_values
def plot_pca(self, group: Optional[str] = None, circle: bool = False):
"""Plot Principal Component Analysis (PCA)
Args:
group (str, optional): column in metadata that should be used for coloring. Defaults to None.
circle (bool, optional): draw circle around each group. Defaults to False.
Returns:
plotly.graph_objects._figure.Figure: PCA plot
"""
dimensionality_reduction = DimensionalityReduction(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group=group,
circle=circle,
method="pca",
)
return dimensionality_reduction.plot

@check_for_missing_values
def plot_tsne(
self,
group: Optional[str] = None,
circle: bool = False,
perplexity: int = 5,
n_iter: int = 1000,
):
"""Plot t-distributed stochastic neighbor embedding (t-SNE)
Args:
group (str, optional): column in metadata that should be used for coloring. Defaults to None.
circle (bool, optional): draw circle around each group. Defaults to False.
Returns:
plotly.graph_objects._figure.Figure: t-SNE plot
"""
dimensionality_reduction = DimensionalityReduction(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group=group,
method="tsne",
circle=circle,
perplexity=perplexity,
n_iter=n_iter,
)
return dimensionality_reduction.plot

@check_for_missing_values
def plot_umap(self, group: Optional[str] = None, circle: bool = False):
"""Plot Uniform Manifold Approximation and Projection for Dimension Reduction
Args:
group (str, optional): column in metadata that should be used for coloring. Defaults to None.
circle (bool, optional): draw circle around each group. Defaults to False.
Returns:
plotly.graph_objects._figure.Figure: UMAP plot
"""
dimensionality_reduction = DimensionalityReduction(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group=group,
method="umap",
circle=circle,
)
return dimensionality_reduction.plot

@ignore_warning(RuntimeWarning)
def plot_volcano(
self,
group1: Union[str, list],
group2: Union[str, list],
column: str = None,
method: str = "ttest",
labels: bool = False,
min_fc: float = 1.0,
alpha: float = 0.05,
draw_line: bool = True,
perm: int = 100,
fdr: float = 0.05,
# compare_preprocessing_modes: bool = False, # TODO reimplement
color_list: list = [],
):
"""Plot Volcano Plot
Args:
column (str): column name in the metadata file with the two groups to compare
group1 (str/list): name of group to compare needs to be present in column or list of sample names to compare
group2 (str/list): name of group to compare needs to be present in column or list of sample names to compare
method (str): "anova", "wald", "ttest", "SAM" Defaul ttest.
labels (bool): Add text labels to significant Proteins, Default False.
alpha(float,optional): p-value cut off.
min_fc (float): Minimum fold change.
draw_line(boolean): whether to draw cut off lines.
perm(float,optional): number of permutations when using SAM as method. Defaults to 100.
fdr(float,optional): FDR cut off when using SAM as method. Defaults to 0.05.
color_list (list): list with ProteinIDs that should be highlighted.
compare_preprocessing_modes(bool): Will iterate through normalization and imputation modes and return a list of VolcanoPlots in different settings, Default False.
Returns:
plotly.graph_objects._figure.Figure: Volcano Plot
"""

# TODO this needs to orchestrated from outside this method
# if compare_preprocessing_modes:
# params_for_func = locals()
# results = self._compare_preprocessing_modes(
# func=VolcanoPlot, params_for_func=params_for_func
# )
# return results
#
# else:
volcano_plot = VolcanoPlot(
mat=self.mat,
rawinput=self.rawinput,
metadata=self.metadata,
sample=self.sample,
index_column=self.index_column,
gene_names=self.gene_names,
preprocessing_info=self.preprocessing_info,
group1=group1,
group2=group2,
column=column,
method=method,
labels=labels,
min_fc=min_fc,
alpha=alpha,
draw_line=draw_line,
perm=perm,
fdr=fdr,
color_list=color_list,
)

return volcano_plot.plot

def plot_intensity(
self,
protein_id: str,
group: str = None,
subgroups: list = None,
method: str = "box",
add_significance: bool = False,
log_scale: bool = False,
# compare_preprocessing_modes: bool = False, TODO reimplement
):
"""Plot Intensity of individual Protein/ProteinGroup
Args:
protein_id (str): ProteinGroup ID
group (str, optional): A metadata column used for grouping. Defaults to None.
subgroups (list, optional): Select variables from the group column. Defaults to None.
method (str, optional): Violinplot = "violin", Boxplot = "box", Scatterplot = "scatter" or "all". Defaults to "box".
add_significance (bool, optional): add p-value bar, only possible when two groups are compared. Defaults False.
log_scale (bool, optional): yaxis in logarithmic scale. Defaults to False.
Returns:
plotly.graph_objects._figure.Figure: Plotly Plot
"""
# TODO this needs to orchestrated from outside this method
# if compare_preprocessing_modes:
# params_for_func = locals()
# results = self._compare_preprocessing_modes(
# func=IntensityPlot, params_for_func=params_for_func
# )
# return results

intensity_plot = IntensityPlot(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
intensity_column=self.intensity_column,
preprocessing_info=self.preprocessing_info,
protein_id=protein_id,
group=group,
subgroups=subgroups,
method=method,
add_significance=add_significance,
log_scale=log_scale,
)

return intensity_plot.plot

@ignore_warning(UserWarning)
@check_for_missing_values
def plot_clustermap(
self,
label_bar: str = None,
only_significant: bool = False,
group: str = None,
subgroups: list = None,
):
"""Plot a matrix dataset as a hierarchically-clustered heatmap
Args:
label_bar (str, optional): column/variable name described in the metadata. Will be plotted as bar above the heatmap to see wheteher groups are clustering together. Defaults to None.. Defaults to None.
only_significant (bool, optional): performs ANOVA and only signficantly different proteins will be clustered (p<0.05). Defaults to False.
group (str, optional): group containing subgroups that should be clustered. Defaults to None.
subgroups (list, optional): variables in group that should be plotted. Defaults to None.
Returns:
ClusterGrid: Clustermap
"""

clustermap = ClusterMap(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
index_column=self.index_column,
preprocessing_info=self.preprocessing_info,
label_bar=label_bar,
only_significant=only_significant,
group=group,
subgroups=subgroups,
)
return clustermap.plot

def plot_samplehistograms(self):
"""Plots the density distribution of each sample
Returns:
plotly: Plotly Graph Object
"""
return SampleHistogram(mat=self.mat).plot()

def _get_plot(self) -> Plot:
"""Get instance of the Plot object."""
return Plot(
self.mat,
self.rawmat,
self.metadata,
self.sample,
self.preprocessing_info,
)

def plot_correlation_matrix(self, method: str = "pearson"): # TODO unused
"""A wrapper for Plot.plot_correlation_matrix(), see documentation there."""
return self._get_plot().plot_correlation_matrix(method)

def plot_sampledistribution(
self,
method: str = "violin",
color: str = None,
log_scale: bool = False,
use_raw: bool = False,
):
"""A wrapper for Plot.plot_sampledistribution(), see documentation there."""
return self._get_plot().plot_sampledistribution(
method, color, log_scale, use_raw
)

def plot_dendrogram(
self, linkagefun=lambda x: scipy.cluster.hierarchy.linkage(x, "complete")
):
"""A wrapper for Plot.plot_dendrogram(), see documentation there."""
return self._get_plot().plot_dendrogram(linkagefun)

def _check_loader(self, loader):
"""Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader
Expand Down Expand Up @@ -247,17 +517,6 @@ def _remove_misc_samples_in_metadata(self):
"are removed from the metadata."
)

# TODO this is implemented in both preprocessing and here
# This is only needed in the DimensionalityReduction class and only if the step was not run during preprocessing.
# idea: replace the step in DimensionalityReduction with something like:
# mat = self.data.mat.loc[sample_names,:] after creating sample_names.
def _subset(self):
# filter matrix so only samples that are described in metadata are also found in matrix
self.preprocessing_info.update(
{PreprocessingStateKeys.NUM_SAMPLES: self.metadata.shape[0]}
)
return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())]

def _create_matrix(self):
"""
Creates a matrix of the Outputfile, with columns displaying features (Proteins) and
Expand Down
Loading

0 comments on commit 963a91f

Please sign in to comment.