From 56c63e6f55f6be8bf3cf6172f09a9f8160d013e8 Mon Sep 17 00:00:00 2001 From: Mikhail Lebedev Date: Fri, 26 Jan 2024 13:47:52 +0100 Subject: [PATCH] Added support for Spectronaut wide formatted input --- alphastats/loader/BaseLoader.py | 20 ++++++++ alphastats/loader/SpectronautLoader.py | 66 +++++++++++++++++++------- 2 files changed, 68 insertions(+), 18 deletions(-) diff --git a/alphastats/loader/BaseLoader.py b/alphastats/loader/BaseLoader.py index d80a53e5..5fcc45da 100644 --- a/alphastats/loader/BaseLoader.py +++ b/alphastats/loader/BaseLoader.py @@ -83,3 +83,23 @@ def _add_contamination_column(self): + "The contaminant library was created by Frankenfield et al." + ":https://www.biorxiv.org/content/10.1101/2022.04.27.489766v2.full" ) + def read_uploaded_file_into_df(self, file): + filename = file.name + + if filename.endswith(".xlsx"): + df = pd.read_excel(file) + + elif filename.endswith(".txt") or filename.endswith(".tsv"): + df = pd.read_csv(file, delimiter="\t", low_memory=False) + + elif filename.endswith(".csv"): + df = pd.read_csv(file, low_memory=False) + + else: + df = None + logging.warning( + "WARNING: File could not be read. 
\nFile has to be a .xslx, .tsv, .csv or .txt file" + ) + return + + return df \ No newline at end of file diff --git a/alphastats/loader/SpectronautLoader.py b/alphastats/loader/SpectronautLoader.py index c80fe131..25220f7d 100644 --- a/alphastats/loader/SpectronautLoader.py +++ b/alphastats/loader/SpectronautLoader.py @@ -1,3 +1,4 @@ +import copy from alphastats.loader.BaseLoader import BaseLoader import pandas as pd import numpy as np @@ -5,15 +6,15 @@ class SpectronautLoader(BaseLoader): - """Loader for Spectronaut outputfiles - """ + """Loader for Spectronaut outputfiles""" def __init__( self, file, intensity_column="PG.Quantity", index_column="PG.ProteinGroups", - sample_column="R.FileName", + # sample_column="R.FileName", + sample_column="experiment", # gene_names_column="PG.Genes", filter_qvalue=True, qvalue_cutoff=0.01, @@ -22,13 +23,13 @@ def __init__( """Loads Spectronaut output. Will add contamination column for further analysis. Args: - file (str): path to Spectronaut outputfile or pandas.DataFrame + file (str): path to Spectronaut outputfile or pandas.DataFrame intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "PG.Quantity". index_column (str, optional): column indicating the protein groups. Defaults to "PG.ProteinGroups". sample_column (str, optional): column that contains sample names used for downstream analysis. Defaults to "R.FileName". gene_names_column (str, optional): column with gene names. Defaults to "PG.Genes". filter_qvalue (bool, optional): will filter out the intensities that have greater than qvalue_cutoff in EG.Qvalue column. Those intensities will be replaced with zero and will be considered as censored missing values for imputation purpose.. Defaults to True. - qvalue_cutoff (float, optional): cut off vaéie. Defaults to 0.01. + qvalue_cutoff (float, optional): cut off value. Defaults to 0.01. sep (str, optional): file separation of file. Defaults to "\t". 
""" @@ -42,12 +43,16 @@ def __init__( self._read_spectronaut_file(file=file, sep=sep) - if filter_qvalue: + is_long = self._check_if_long(self.rawinput) + + if filter_qvalue and is_long: self._filter_qvalue(qvalue_cutoff=qvalue_cutoff) - self._reshape_spectronaut( - sample_column=sample_column, gene_names_column=gene_names_column - ) + if is_long: + self._reshape_spectronaut( + sample_column=sample_column, gene_names_column=gene_names_column + ) + self._add_contamination_column() self._read_all_columns_as_string() @@ -59,15 +64,17 @@ def _reshape_spectronaut(self, sample_column, gene_names_column): self.rawinput["sample"] = ( self.rawinput[sample_column] + "_" + self.intensity_column ) - + print("self.rawinput[sample_column]", ( + self.rawinput[sample_column] + "_" + self.intensity_column + )) indexing_columns = [self.index_column] - + print("print(indexing_columns)", indexing_columns) if gene_names_column in self.rawinput.columns.to_list(): self.gene_names = gene_names_column - indexing_columns += [self.gene_names] + indexing_columns.append(self.gene_names) keep_columns = [self.intensity_column, "sample"] + indexing_columns - + print("keep_columns", keep_columns) df = self.rawinput[keep_columns].drop_duplicates() df = df.pivot( columns="sample", index=indexing_columns, values=self.intensity_column @@ -75,10 +82,20 @@ def _reshape_spectronaut(self, sample_column, gene_names_column): df.reset_index(inplace=True) self.rawinput = df + print(self.rawinput.columns.to_list()) + df.to_csv("~/Downloads/wide_test.tsv", sep="\t", index=False) self.intensity_column = "[sample]_" + self.intensity_column + def _check_if_long(self, df): + for colname in df.columns.to_list(): + if colname.startswith('PG.Quantity'): + return True + elif 'PG.Quantity' in colname: + return False + def _filter_qvalue(self, qvalue_cutoff): + print(self.rawinput.columns.to_list()) if "EG.Qvalue" not in self.rawinput.columns.to_list(): raise Warning( "Column EG.Qvalue not found in file. 
File will not be filtered according to q-value." @@ -97,12 +114,25 @@ def _read_spectronaut_file(self, file, sep): # some spectronaut files include european decimal separators if isinstance(file, pd.DataFrame): df = file + for column in df.columns: + try: + if df[column].dtype == np.float64: + continue + df[column] = df[column].str.replace(',', '.').astype(float) + print("converted", column, df[column].dtype) + except (ValueError, AttributeError) as e: + print("failed", column, df[column].dtype) else: - df = pd.read_csv(file, sep=sep, low_memory=False) - - if df[self.intensity_column].dtype != np.float64: - # load european - df = pd.read_csv(file, sep=sep, decimal=",") + df = self.read_uploaded_file_into_df(file) + # convert from european + for column in df.columns: + try: + if df[column].dtype == np.float64: + continue + df[column] = df[column].str.replace(',', '.').astype(float) + print("converted", column, df[column].dtype) + except (ValueError, AttributeError) as e: + print("failed", column, df[column].dtype) self.rawinput = df