Skip to content

Commit

Permalink
Added support for Spectronaut wide formatted input
Browse files Browse the repository at this point in the history
  • Loading branch information
Mikhail Lebedev committed Jan 26, 2024
1 parent 89843be commit 56c63e6
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 18 deletions.
20 changes: 20 additions & 0 deletions alphastats/loader/BaseLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,23 @@ def _add_contamination_column(self):
+ "The contaminant library was created by Frankenfield et al."
+ ":https://www.biorxiv.org/content/10.1101/2022.04.27.489766v2.full"
)
def read_uploaded_file_into_df(self, file):
    """Read an uploaded tabular file into a pandas DataFrame.

    The parser is selected from the file extension, compared
    case-insensitively: ``.xlsx`` is read with ``pandas.read_excel``,
    ``.txt``/``.tsv`` as tab-separated text and ``.csv`` as
    comma-separated text.

    Args:
        file: File-like object exposing a ``name`` attribute (for example
            an uploaded file), or a plain path. When ``file`` has no
            ``name`` attribute, ``str(file)`` is used to pick the parser.

    Returns:
        pandas.DataFrame or None: The parsed table, or ``None`` (after
        logging a warning) when the extension is not supported.
    """
    # Fall back to the object itself so that plain path strings work too;
    # lower-casing makes "DATA.CSV" behave like "data.csv".
    filename = str(getattr(file, "name", file)).lower()

    if filename.endswith(".xlsx"):
        return pd.read_excel(file)

    if filename.endswith((".txt", ".tsv")):
        return pd.read_csv(file, delimiter="\t", low_memory=False)

    if filename.endswith(".csv"):
        return pd.read_csv(file, low_memory=False)

    logging.warning(
        "WARNING: File could not be read. \nFile has to be a .xlsx, .tsv, .csv or .txt file"
    )
    return None
66 changes: 48 additions & 18 deletions alphastats/loader/SpectronautLoader.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
import copy
from alphastats.loader.BaseLoader import BaseLoader
import pandas as pd
import numpy as np
import logging


class SpectronautLoader(BaseLoader):
"""Loader for Spectronaut outputfiles
"""
"""Loader for Spectronaut outputfiles"""

def __init__(
self,
file,
intensity_column="PG.Quantity",
index_column="PG.ProteinGroups",
sample_column="R.FileName",
# sample_column="R.FileName",
sample_column="experiment", #
gene_names_column="PG.Genes",
filter_qvalue=True,
qvalue_cutoff=0.01,
Expand All @@ -22,13 +23,13 @@ def __init__(
"""Loads Spectronaut output. Will add contamination column for further analysis.
Args:
file (str): path to Spectronaut outputfile or pandas.DataFrame
file (str): path to Spectronaut outputfile or pandas.DataFrame
intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "PG.Quantity".
index_column (str, optional): column indicating the protein groups. Defaults to "PG.ProteinGroups".
sample_column (str, optional): column that contains sample names used for downstream analysis. Defaults to "R.FileName".
gene_names_column (str, optional): column with gene names. Defaults to "PG.Genes".
filter_qvalue (bool, optional): will filter out the intensities whose EG.Qvalue is greater than qvalue_cutoff. Those intensities will be replaced with zero and will be considered as censored missing values for imputation purposes. Defaults to True.
qvalue_cutoff (float, optional): cut off vaéie. Defaults to 0.01.
qvalue_cutoff (float, optional): cut off value. Defaults to 0.01.
sep (str, optional): file separation of file. Defaults to "\t".
"""

Expand All @@ -42,12 +43,16 @@ def __init__(

self._read_spectronaut_file(file=file, sep=sep)

if filter_qvalue:
is_long = self._check_if_long(self.rawinput)

if filter_qvalue and is_long:
self._filter_qvalue(qvalue_cutoff=qvalue_cutoff)

self._reshape_spectronaut(
sample_column=sample_column, gene_names_column=gene_names_column
)
if is_long:
self._reshape_spectronaut(
sample_column=sample_column, gene_names_column=gene_names_column
)

self._add_contamination_column()
self._read_all_columns_as_string()

Expand All @@ -59,26 +64,38 @@ def _reshape_spectronaut(self, sample_column, gene_names_column):
self.rawinput["sample"] = (
self.rawinput[sample_column] + "_" + self.intensity_column
)

print("self.rawinput[sample_column]", (
self.rawinput[sample_column] + "_" + self.intensity_column
))
indexing_columns = [self.index_column]

print("print(indexing_columns)", indexing_columns)
if gene_names_column in self.rawinput.columns.to_list():
self.gene_names = gene_names_column
indexing_columns += [self.gene_names]
indexing_columns.append(self.gene_names)

keep_columns = [self.intensity_column, "sample"] + indexing_columns

print("keep_columns", keep_columns)
df = self.rawinput[keep_columns].drop_duplicates()
df = df.pivot(
columns="sample", index=indexing_columns, values=self.intensity_column
)
df.reset_index(inplace=True)

self.rawinput = df
print(self.rawinput.columns.to_list())
df.to_csv("~/Downloads/wide_test.tsv", sep="\t", index=False)

self.intensity_column = "[sample]_" + self.intensity_column

def _check_if_long(self, df):
for colname in df.columns.to_list():
if colname.startswith('PG.Quantity'):
return True
elif 'PG.Quantity' in colname:
return False

def _filter_qvalue(self, qvalue_cutoff):
print(self.rawinput.columns.to_list())
if "EG.Qvalue" not in self.rawinput.columns.to_list():
raise Warning(
"Column EG.Qvalue not found in file. File will not be filtered according to q-value."
Expand All @@ -97,12 +114,25 @@ def _read_spectronaut_file(self, file, sep):
# some spectronaut files include european decimal separators
if isinstance(file, pd.DataFrame):
df = file
for column in df.columns:
try:
if df[column].dtype == np.float64:
continue
df[column] = df[column].str.replace(',', '.').astype(float)
print("converted", column, df[column].dtype)
except (ValueError, AttributeError) as e:
print("failed", column, df[column].dtype)
else:
df = pd.read_csv(file, sep=sep, low_memory=False)

if df[self.intensity_column].dtype != np.float64:
# load european
df = pd.read_csv(file, sep=sep, decimal=",")
df = self.read_uploaded_file_into_df(file)
# convert from European decimal separators
for column in df.columns:
try:
if df[column].dtype == np.float64:
continue
df[column] = df[column].str.replace(',', '.').astype(float)
print("converted", column, df[column].dtype)
except (ValueError, AttributeError) as e:
print("failed", column, df[column].dtype)

self.rawinput = df

Expand Down

0 comments on commit 56c63e6

Please sign in to comment.