Merge pull request #332 from MannLabs/decouple-dataset-from-plots

Decouple dataset from plots
MannLabs · Sep 20, 2024 · 963a91f · 963a91f
2 parents 198cbdc + 1dcfd53
commit 963a91f
Show file tree

Hide file tree

Showing 11 changed files with 591 additions and 454 deletions.
diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py
@@ -5,15 +5,21 @@
 import logging
 import warnings
 import plotly
+import scipy
 
 from alphastats import BaseLoader
 
 
 from alphastats.DataSet_Plot import Plot
-from alphastats.DataSet_Preprocess import Preprocess, PreprocessingStateKeys
+from alphastats.DataSet_Preprocess import Preprocess
 from alphastats.DataSet_Statistics import Statistics
-from alphastats.utils import LoaderError
+from alphastats.utils import LoaderError, check_for_missing_values, ignore_warning
 from alphastats.statistics.tukey_test import tukey_test
+from alphastats.plots.ClusterMap import ClusterMap
+from alphastats.plots.DimensionalityReduction import DimensionalityReduction
+from alphastats.plots.IntensityPlot import IntensityPlot
+from alphastats.plots.SampleHistogram import SampleHistogram
+from alphastats.plots.VolcanoPlot import VolcanoPlot
 
 plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template(
     layout=plotly.graph_objects.Layout(
@@ -36,8 +42,8 @@
 plotly.io.templates.default = "simple_white+alphastats_colors"
 
 
-class DataSet(Plot):
-    """Analysis Object"""
+class DataSet:
+    """The main object of AlphaPeptStats, providing an interface to all relevant functionality and data."""
 
     def __init__(
         self,
@@ -160,11 +166,11 @@ def batch_correction(self, batch: str) -> None:
     def _get_statistics(self) -> Statistics:
         """Return instance of the Statistics object."""
         return Statistics(
-            self.mat,
-            self.metadata,
-            self.index_column,
-            self.sample,
-            self.preprocessing_info,
+            mat=self.mat,
+            metadata=self.metadata,
+            index_column=self.index_column,
+            sample=self.sample,
+            preprocessing_info=self.preprocessing_info,
         )
 
     def diff_expression_analysis(
@@ -208,6 +214,270 @@ def ancova(
         """A wrapper for Statistics.ancova(), see documentation there."""
         return self._get_statistics().ancova(protein_id, covar, between)
 
+    @check_for_missing_values
+    def plot_pca(self, group: Optional[str] = None, circle: bool = False):
+        """Plot Principal Component Analysis (PCA)
+
+        Args:
+            group (str, optional): column in metadata that should be used for coloring. Defaults to None.
+            circle (bool, optional): draw circle around each group. Defaults to False.
+
+        Returns:
+            plotly.graph_objects._figure.Figure: PCA plot
+        """
+        dimensionality_reduction = DimensionalityReduction(
+            mat=self.mat,
+            metadata=self.metadata,
+            sample=self.sample,
+            preprocessing_info=self.preprocessing_info,
+            group=group,
+            circle=circle,
+            method="pca",
+        )
+        return dimensionality_reduction.plot
+
+    @check_for_missing_values
+    def plot_tsne(
+        self,
+        group: Optional[str] = None,
+        circle: bool = False,
+        perplexity: int = 5,
+        n_iter: int = 1000,
+    ):
+        """Plot t-distributed stochastic neighbor embedding (t-SNE)
+
+        Args:
+            group (str, optional): column in metadata that should be used for coloring. Defaults to None.
+            circle (bool, optional): draw circle around each group. Defaults to False.
+
+        Returns:
+            plotly.graph_objects._figure.Figure: t-SNE plot
+        """
+        dimensionality_reduction = DimensionalityReduction(
+            mat=self.mat,
+            metadata=self.metadata,
+            sample=self.sample,
+            preprocessing_info=self.preprocessing_info,
+            group=group,
+            method="tsne",
+            circle=circle,
+            perplexity=perplexity,
+            n_iter=n_iter,
+        )
+        return dimensionality_reduction.plot
+
+    @check_for_missing_values
+    def plot_umap(self, group: Optional[str] = None, circle: bool = False):
+        """Plot Uniform Manifold Approximation and Projection for Dimension Reduction
+
+        Args:
+            group (str, optional): column in metadata that should be used for coloring. Defaults to None.
+            circle (bool, optional): draw circle around each group. Defaults to False.
+
+        Returns:
+            plotly.graph_objects._figure.Figure: UMAP plot
+        """
+        dimensionality_reduction = DimensionalityReduction(
+            mat=self.mat,
+            metadata=self.metadata,
+            sample=self.sample,
+            preprocessing_info=self.preprocessing_info,
+            group=group,
+            method="umap",
+            circle=circle,
+        )
+        return dimensionality_reduction.plot
+
+    @ignore_warning(RuntimeWarning)
+    def plot_volcano(
+        self,
+        group1: Union[str, list],
+        group2: Union[str, list],
+        column: str = None,
+        method: str = "ttest",
+        labels: bool = False,
+        min_fc: float = 1.0,
+        alpha: float = 0.05,
+        draw_line: bool = True,
+        perm: int = 100,
+        fdr: float = 0.05,
+        # compare_preprocessing_modes: bool = False, # TODO reimplement
+        color_list: list = [],
+    ):
+        """Plot Volcano Plot
+
+        Args:
+            column (str): column name in the metadata file with the two groups to compare
+            group1 (str/list): name of group to compare needs to be present in column or list of sample names to compare
+            group2 (str/list): name of group to compare needs to be present in column  or list of sample names to compare
+            method (str): "anova", "wald", "ttest", "SAM" Defaul ttest.
+            labels (bool): Add text labels to significant Proteins, Default False.
+            alpha(float,optional): p-value cut off.
+            min_fc (float): Minimum fold change.
+            draw_line(boolean): whether to draw cut off lines.
+            perm(float,optional): number of permutations when using SAM as method. Defaults to 100.
+            fdr(float,optional): FDR cut off when using SAM as method. Defaults to 0.05.
+            color_list (list): list with ProteinIDs that should be highlighted.
+            compare_preprocessing_modes(bool): Will iterate through normalization and imputation modes and return a list of VolcanoPlots in different settings, Default False.
+
+
+        Returns:
+            plotly.graph_objects._figure.Figure: Volcano Plot
+        """
+
+        # TODO this needs to orchestrated from outside this method
+        # if compare_preprocessing_modes:
+        #     params_for_func = locals()
+        #     results = self._compare_preprocessing_modes(
+        #         func=VolcanoPlot, params_for_func=params_for_func
+        #     )
+        #     return results
+        #
+        # else:
+        volcano_plot = VolcanoPlot(
+            mat=self.mat,
+            rawinput=self.rawinput,
+            metadata=self.metadata,
+            sample=self.sample,
+            index_column=self.index_column,
+            gene_names=self.gene_names,
+            preprocessing_info=self.preprocessing_info,
+            group1=group1,
+            group2=group2,
+            column=column,
+            method=method,
+            labels=labels,
+            min_fc=min_fc,
+            alpha=alpha,
+            draw_line=draw_line,
+            perm=perm,
+            fdr=fdr,
+            color_list=color_list,
+        )
+
+        return volcano_plot.plot
+
+    def plot_intensity(
+        self,
+        protein_id: str,
+        group: str = None,
+        subgroups: list = None,
+        method: str = "box",
+        add_significance: bool = False,
+        log_scale: bool = False,
+        # compare_preprocessing_modes: bool = False, TODO reimplement
+    ):
+        """Plot Intensity of individual Protein/ProteinGroup
+
+        Args:
+            protein_id (str): ProteinGroup ID
+            group (str, optional): A metadata column used for grouping. Defaults to None.
+            subgroups (list, optional): Select variables from the group column. Defaults to None.
+            method (str, optional):  Violinplot = "violin", Boxplot = "box", Scatterplot = "scatter" or "all". Defaults to "box".
+            add_significance (bool, optional): add p-value bar, only possible when two groups are compared. Defaults False.
+            log_scale (bool, optional): yaxis in logarithmic scale. Defaults to False.
+
+        Returns:
+            plotly.graph_objects._figure.Figure: Plotly Plot
+        """
+        # TODO this needs to orchestrated from outside this method
+        # if compare_preprocessing_modes:
+        #     params_for_func = locals()
+        #     results = self._compare_preprocessing_modes(
+        #         func=IntensityPlot, params_for_func=params_for_func
+        #     )
+        #     return results
+
+        intensity_plot = IntensityPlot(
+            mat=self.mat,
+            metadata=self.metadata,
+            sample=self.sample,
+            intensity_column=self.intensity_column,
+            preprocessing_info=self.preprocessing_info,
+            protein_id=protein_id,
+            group=group,
+            subgroups=subgroups,
+            method=method,
+            add_significance=add_significance,
+            log_scale=log_scale,
+        )
+
+        return intensity_plot.plot
+
+    @ignore_warning(UserWarning)
+    @check_for_missing_values
+    def plot_clustermap(
+        self,
+        label_bar: str = None,
+        only_significant: bool = False,
+        group: str = None,
+        subgroups: list = None,
+    ):
+        """Plot a matrix dataset as a hierarchically-clustered heatmap
+
+        Args:
+            label_bar (str, optional): column/variable name described in the metadata. Will be plotted as bar above the heatmap to see wheteher groups are clustering together. Defaults to None.. Defaults to None.
+            only_significant (bool, optional): performs ANOVA and only signficantly different proteins will be clustered (p<0.05). Defaults to False.
+            group (str, optional): group containing subgroups that should be clustered. Defaults to None.
+            subgroups (list, optional): variables in group that should be plotted. Defaults to None.
+
+        Returns:
+             ClusterGrid: Clustermap
+        """
+
+        clustermap = ClusterMap(
+            mat=self.mat,
+            metadata=self.metadata,
+            sample=self.sample,
+            index_column=self.index_column,
+            preprocessing_info=self.preprocessing_info,
+            label_bar=label_bar,
+            only_significant=only_significant,
+            group=group,
+            subgroups=subgroups,
+        )
+        return clustermap.plot
+
+    def plot_samplehistograms(self):
+        """Plots the density distribution of each sample
+
+        Returns:
+            plotly: Plotly Graph Object
+        """
+        return SampleHistogram(mat=self.mat).plot()
+
+    def _get_plot(self) -> Plot:
+        """Get instance of the Plot object."""
+        return Plot(
+            self.mat,
+            self.rawmat,
+            self.metadata,
+            self.sample,
+            self.preprocessing_info,
+        )
+
+    def plot_correlation_matrix(self, method: str = "pearson"):  # TODO unused
+        """A wrapper for Plot.plot_correlation_matrix(), see documentation there."""
+        return self._get_plot().plot_correlation_matrix(method)
+
+    def plot_sampledistribution(
+        self,
+        method: str = "violin",
+        color: str = None,
+        log_scale: bool = False,
+        use_raw: bool = False,
+    ):
+        """A wrapper for Plot.plot_sampledistribution(), see documentation there."""
+        return self._get_plot().plot_sampledistribution(
+            method, color, log_scale, use_raw
+        )
+
+    def plot_dendrogram(
+        self, linkagefun=lambda x: scipy.cluster.hierarchy.linkage(x, "complete")
+    ):
+        """A wrapper for Plot.plot_dendrogram(), see documentation there."""
+        return self._get_plot().plot_dendrogram(linkagefun)
+
     def _check_loader(self, loader):
         """Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader
 
@@ -247,17 +517,6 @@ def _remove_misc_samples_in_metadata(self):
                 "are removed from the metadata."
             )
 
-    # TODO this is implemented in both preprocessing and here
-    #  This is only needed in the DimensionalityReduction class and only if the step was not run during preprocessing.
-    #  idea: replace the step in DimensionalityReduction with something like:
-    #  mat = self.data.mat.loc[sample_names,:] after creating sample_names.
-    def _subset(self):
-        # filter matrix so only samples that are described in metadata are also found in matrix
-        self.preprocessing_info.update(
-            {PreprocessingStateKeys.NUM_SAMPLES: self.metadata.shape[0]}
-        )
-        return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())]
-
     def _create_matrix(self):
         """
         Creates a matrix of the Outputfile, with columns displaying features (Proteins) and