Added support for Spectronaut wide formatted input

MannLabs · Jan 26, 2024 · 56c63e6 · 56c63e6
1 parent 89843be
commit 56c63e6
Show file tree

Hide file tree

Showing 2 changed files with 68 additions and 18 deletions.
diff --git a/alphastats/loader/BaseLoader.py b/alphastats/loader/BaseLoader.py
@@ -83,3 +83,23 @@ def _add_contamination_column(self):
             + "The contaminant library was created by Frankenfield et al."
             + ":https://www.biorxiv.org/content/10.1101/2022.04.27.489766v2.full"
         )
+    def read_uploaded_file_into_df(self, file):
+        filename = file.name
+
+        if filename.endswith(".xlsx"):
+            df = pd.read_excel(file)
+
+        elif filename.endswith(".txt") or filename.endswith(".tsv"):
+            df = pd.read_csv(file, delimiter="\t", low_memory=False)
+
+        elif filename.endswith(".csv"):
+            df = pd.read_csv(file, low_memory=False)
+
+        else:
+            df = None
+            logging.warning(
+                "WARNING: File could not be read. \nFile has to be a .xlsx, .tsv, .csv or .txt file"
+            )
+            return
+
+        return df
diff --git a/alphastats/loader/SpectronautLoader.py b/alphastats/loader/SpectronautLoader.py
@@ -1,19 +1,20 @@
+import copy
 from alphastats.loader.BaseLoader import BaseLoader
 import pandas as pd
 import numpy as np
 import logging
 
 
 class SpectronautLoader(BaseLoader):
-    """Loader for Spectronaut outputfiles 
-    """
+    """Loader for Spectronaut outputfiles"""
 
     def __init__(
         self,
         file,
         intensity_column="PG.Quantity",
         index_column="PG.ProteinGroups",
-        sample_column="R.FileName",
+        # sample_column="R.FileName",
+        sample_column="experiment", # 
         gene_names_column="PG.Genes",
         filter_qvalue=True,
         qvalue_cutoff=0.01,
@@ -22,13 +23,13 @@ def __init__(
         """Loads Spectronaut output. Will add contamination column for further analysis.
 
         Args:
-            file (str): path to Spectronaut outputfile or pandas.DataFrame 
+            file (str): path to Spectronaut outputfile or pandas.DataFrame
             intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "PG.Quantity".
             index_column (str, optional): column indicating the protein groups. Defaults to "PG.ProteinGroups".
             sample_column (str, optional): column that contains sample names used for downstream analysis. Defaults to "experiment".
             gene_names_column (str, optional): column with gene names. Defaults to "PG.Genes".
             filter_qvalue (bool, optional): will filter out the intensities that have greater than qvalue_cutoff in EG.Qvalue column. Those intensities will be replaced with zero and will be considered as censored missing values for imputation purpose.. Defaults to True.
-            qvalue_cutoff (float, optional): cut off vaéie. Defaults to 0.01.
+            qvalue_cutoff (float, optional): cut off value. Defaults to 0.01.
             sep (str, optional): file separation of file. Defaults to "\t".
         """
 
@@ -42,12 +43,16 @@ def __init__(
 
         self._read_spectronaut_file(file=file, sep=sep)
 
-        if filter_qvalue:
+        is_long = self._check_if_long(self.rawinput)
+
+        if filter_qvalue and is_long:
             self._filter_qvalue(qvalue_cutoff=qvalue_cutoff)
 
-        self._reshape_spectronaut(
-            sample_column=sample_column, gene_names_column=gene_names_column
-        )
+        if is_long:
+            self._reshape_spectronaut(
+                sample_column=sample_column, gene_names_column=gene_names_column
+            )
+
         self._add_contamination_column()
         self._read_all_columns_as_string()
 
@@ -59,26 +64,38 @@ def _reshape_spectronaut(self, sample_column, gene_names_column):
         self.rawinput["sample"] = (
             self.rawinput[sample_column] + "_" + self.intensity_column
         )
-
+        print("self.rawinput[sample_column]", (
+            self.rawinput[sample_column] + "_" + self.intensity_column
+        ))
         indexing_columns = [self.index_column]
-
+        print("print(indexing_columns)", indexing_columns)
         if gene_names_column in self.rawinput.columns.to_list():
             self.gene_names = gene_names_column
-            indexing_columns += [self.gene_names]
+            indexing_columns.append(self.gene_names)
 
         keep_columns = [self.intensity_column, "sample"] + indexing_columns
-
+        print("keep_columns", keep_columns)
         df = self.rawinput[keep_columns].drop_duplicates()
         df = df.pivot(
             columns="sample", index=indexing_columns, values=self.intensity_column
         )
         df.reset_index(inplace=True)
 
         self.rawinput = df
+        print(self.rawinput.columns.to_list())
+        df.to_csv("~/Downloads/wide_test.tsv", sep="\t", index=False)
 
         self.intensity_column = "[sample]_" + self.intensity_column
 
+    def _check_if_long(self, df):
+        for colname in df.columns.to_list():
+            if colname.startswith('PG.Quantity'):
+                return True
+            elif 'PG.Quantity' in colname:
+                return False
+
     def _filter_qvalue(self, qvalue_cutoff):
+        print(self.rawinput.columns.to_list())
         if "EG.Qvalue" not in self.rawinput.columns.to_list():
             raise Warning(
                 "Column EG.Qvalue not found in file. File will not be filtered according to q-value."
@@ -97,12 +114,25 @@ def _read_spectronaut_file(self, file, sep):
         # some spectronaut files include european decimal separators
         if isinstance(file, pd.DataFrame):
             df = file
+            for column in df.columns:
+                try:
+                    if df[column].dtype == np.float64:
+                        continue
+                    df[column] = df[column].str.replace(',', '.').astype(float)
+                    print("converted", column, df[column].dtype)
+                except (ValueError, AttributeError) as e:
+                    print("failed", column, df[column].dtype)
         else:
-            df = pd.read_csv(file, sep=sep, low_memory=False)
-
-            if df[self.intensity_column].dtype != np.float64:
-                # load european
-                df = pd.read_csv(file, sep=sep, decimal=",")
+            df = self.read_uploaded_file_into_df(file)
+            # convert European decimal commas to dots so numeric columns parse as float
+            for column in df.columns:
+                try:
+                    if df[column].dtype == np.float64:
+                        continue
+                    df[column] = df[column].str.replace(',', '.').astype(float)
+                    print("converted", column, df[column].dtype)
+                except (ValueError, AttributeError) as e:
+                    print("failed", column, df[column].dtype)
 
         self.rawinput = df