Skip to content

Commit

Permalink
simplified calculate_plot_data and generate_intermediate methods
Browse files Browse the repository at this point in the history
also generate_intermediate now Computes CV
(adresses #40)  closes #133 #113
  • Loading branch information
wolski committed Nov 23, 2023
1 parent 579809e commit 3e10488
Show file tree
Hide file tree
Showing 12 changed files with 87 additions and 205 deletions.
21 changes: 5 additions & 16 deletions proteobench/modules/dda_quant/datapoint.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import json
import numpy as np
from dataclasses import asdict, dataclass
from datetime import datetime

import numpy as np


@dataclass
class Datapoint:
"""Data used to stored the"""

# TODO add threshold value used for presence ion/peptidoform
id: str = None
search_engine: str = None
Expand Down Expand Up @@ -39,21 +41,8 @@ def calculate_missing_quan_prec(self, df, nr_missing_0):
return nr_quan_prec_missing

def calculate_plot_data(self, df):
species = ["YEAST", "HUMAN", "ECOLI"]
prop_ratios = []
sum_ratios = 0
nr_missing_0 = 0
for spec in species:
f = len(df[df[spec] == True])
sum_s = np.nan_to_num(df[df[spec] == True]["1|2_expected_ratio_diff"], nan=0, neginf=-1000, posinf=1000).sum()
ratio = sum_s / f
prop_ratio = (f / len(df)) * ratio
prop_ratios.append(prop_ratio)
sum_ratios += prop_ratio
nr_missing_0 += f

# TODO rename/document code
self.weighted_sum = round(sum_ratios, ndigits=3)
# compute mean of epsilon column in df
self.weighted_sum = round(df["epsilon"].mean(), ndigits=3)
self.nr_prec = len(df)

def generate_id(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
[species_expected_ratio]
[species_expected_ratio.YEAST]
"1|2" = 2.0
"A_vs_B" = 2.0

[species_expected_ratio.ECOLI]
"1|2" = 0.25
"A_vs_B" = 0.25

[species_expected_ratio.HUMAN]
"1|2" = 1.0
"A_vs_B" = 1.0

[general]
min_count_multispec = 1
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ decoy = "Reverse"
ms1_int_sum_apex_dn = "Intensity"

[replicate_mapper]
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = 1
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = 1
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = 1
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = 2
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = 2
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = 2
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = "A"
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = "A"
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = "A"
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = "B"
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = "B"
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = "B"

[run_mapper]
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = "Condition_A_Sample_Alpha_01"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ Charge = "Charge"


[replicate_mapper]
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = 1
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = 1
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = 1
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = 2
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = 2
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = 2
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = "A"
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = "A"
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = "A"
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = "B"
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = "B"
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = "B"

[run_mapper]
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = "Condition_A_Sample_Alpha_01"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ Proteins = "Proteins"
Charge = "Charge"

[replicate_mapper]
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = 1
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = 1
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = 1
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = 2
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = 2
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = 2
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = "A"
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = "A"
LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = "A"
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = "B"
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = "B"
LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = "B"

[run_mapper]
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01" = "Condition_A_Sample_Alpha_01"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ Protein = "Proteins"
Charge = "Charge"

[replicate_mapper]
"A_1 Intensity" = 1
"A_2 Intensity" = 1
"A_3 Intensity" = 1
"B_1 Intensity" = 2
"B_2 Intensity" = 2
"B_3 Intensity" = 2
"A_1 Intensity" = "A"
"A_2 Intensity" = "A"
"A_3 Intensity" = "A"
"B_1 Intensity" = "B"
"B_2 Intensity" = "B"
"B_3 Intensity" = "B"

[run_mapper]
"A_1 Intensity" = "Condition_A_Sample_Alpha_01"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ sequence = "Sequence"


[replicate_mapper]
abundance_DDA_Condition_A_Sample_Alpha_01 = 1
abundance_DDA_Condition_A_Sample_Alpha_02 = 1
abundance_DDA_Condition_A_Sample_Alpha_03 = 1
abundance_DDA_Condition_B_Sample_Alpha_01 = 2
abundance_DDA_Condition_B_Sample_Alpha_02 = 2
abundance_DDA_Condition_B_Sample_Alpha_03 = 2
abundance_DDA_Condition_A_Sample_Alpha_01 = "A"
abundance_DDA_Condition_A_Sample_Alpha_02 = "A"
abundance_DDA_Condition_A_Sample_Alpha_03 = "A"
abundance_DDA_Condition_B_Sample_Alpha_01 = "B"
abundance_DDA_Condition_B_Sample_Alpha_02 = "B"
abundance_DDA_Condition_B_Sample_Alpha_03 = "B"

[run_mapper]
"abundance_DDA_Condition_A_Sample_Alpha_01" = "Condition_A_Sample_Alpha_01"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
"charge" = "Charge"

[replicate_mapper]
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML.gz" = 1
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML.gz" = 1
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML.gz" = 1
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML.gz" = 2
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML.gz" = 2
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML.gz" = 2
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML.gz" = "A"
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML.gz" = "A"
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML.gz" = "A"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML.gz" = "B"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML.gz" = "B"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML.gz" = "B"

[run_mapper]
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML" = "Condition_A_Sample_Alpha_01"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@ protein_group = "Proteins"
"modified_peptide" = "Modified sequence"

[replicate_mapper]
abundance_A_1 = 1
abundance_A_2 = 1
abundance_A_3 = 1
abundance_B_1 = 2
abundance_B_2 = 2
abundance_B_3 = 2
abundance_A_1 = "A"
abundance_A_2 = "A"
abundance_A_3 = "A"
abundance_B_1 = "B"
abundance_B_2 = "B"
abundance_B_3 = "B"

[run_mapper]
abundance_A_1 = "Condition_A_Sample_Alpha_01"
Expand Down
118 changes: 11 additions & 107 deletions proteobench/modules/dda_quant/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ def generate_intermediate_V2(
for x in quant_raw_df.columns
]

quant_raw_df["1|2_ratio"] = (
quant_raw_df["log_Intensity_mean_1"] / quant_raw_df["log_Intensity_mean_2"]
quant_raw_df["log2_A_vs_B"] = (
quant_raw_df["log_Intensity_mean_A"] - quant_raw_df["log_Intensity_mean_B"]
)
quant_raw_df = pd.merge(
quant_raw_df, intensities_wide, on="peptidoform", how="inner"
Expand Down Expand Up @@ -139,117 +139,21 @@ def generate_intermediate_V3(filtered_df, quant_df, parse_settings):
withspecies.loc[withspecies[species] == True, "species"] = species
withspecies.loc[
withspecies[species] == True, "expectedRatio"
] = species_expected_ratio[species]["1|2"]
] = species_expected_ratio[species]["A_vs_B"]

withspecies["epsilon"] = (
withspecies["log2_A_vs_B"] - withspecies["expectedRatio"]
)
return withspecies

def generate_intermediate(
self,
filtered_df,
replicate_to_raw: dict,
parse_settings: ParseSettings,
precursor="peptidoform",
self, filtered_df, replicate_to_raw: dict, parse_settings: ParseSettings
) -> pd.DataFrame:
"""Take the generic format of data search output and convert it to get the quantification data (a tuple, the quantification measure and the reliability of it)."""

species_peptidoform = list(parse_settings.species_dict.values())
# species_peptidoform.append("peptidoform")

filtered_df_p1 = filtered_df[["Raw file", precursor, "Intensity"]].copy()

# Summarize values of the same peptide using mean
# TODO should we take the mean or sum of the same peptidoform/peptideions same raw file multiple intensities
quant_raw_df = filtered_df_p1.groupby([precursor, "Raw file"]).Intensity.mean()
quant_df = quant_raw_df.unstack(level=1)

# Count number of values per peptidoform and Raw file
# TODO calculate this on the log2 transformed values
for replicate, replicate_runs in replicate_to_raw.items():
selected_replicate_df = quant_raw_df.index.get_level_values(
"Raw file"
).isin(replicate_runs)
replicate_quant_df = quant_raw_df[selected_replicate_df]
## Add means of replicates
mean_series = replicate_quant_df.groupby([precursor]).mean()
# change indices of mean_series from peptidoform to multiindices containing peptidoform,replicate
quant_df["mean_of_" + str(replicate)] = mean_series

## Add number of missing values per row of replicate
# TODO keep missing values, filter later for calculation of ratios
missing_series = replicate_quant_df.isna().groupby([precursor]).sum()
quant_df["missing_values_" + str(replicate)] = missing_series
return quant_df

@staticmethod
def generate_intermediate_V4(filtered_df, quant_df, parse_settings):
species_peptidoform = list(parse_settings.species_dict.values())
species_peptidoform.append("peptidoform")
# TODO check, do we need to drop_duplicates? When?
peptidoform_to_species = filtered_df[species_peptidoform].drop_duplicates()
# merge dataframes quant_df and species_quant_df and peptidoform_to_species
withspecies = pd.merge(
quant_df, peptidoform_to_species, on="peptidoform", how="inner"
res = Module.generate_intermediate_V2(
filtered_df, replicate_to_raw, parse_settings
)

peptidoform_to_species.index = peptidoform_to_species["peptidoform"]
peptidoform_to_species_dict = peptidoform_to_species.T.to_dict()

species_quant_df = pd.DataFrame(
[peptidoform_to_species_dict[idx] for idx in quant_df.index]
)
species_quant_df.set_index("peptidoform", drop=True, inplace=True)
"""Calculate the quantification ratios and compare them to the expected ratios."""
cv_replicate_quant_species_df = pd.concat([quant_df, species_quant_df], axis=1)

ratio_dict = {}
for species in parse_settings.species_dict.values():
species_df_slice = cv_replicate_quant_species_df[
cv_replicate_quant_species_df[species] == True
]
# TODO add cutoffs for different thresholds presence of peptide ion
# TODO do substraction for log2 transformed
for conditions in itertools.combinations(
set(parse_settings.replicate_mapper.values()), 2
):
condition_comp_id = "|".join(map(str, conditions))

ratio = (
species_df_slice["mean_of_" + str(conditions[0])]
/ species_df_slice["mean_of_" + str(conditions[1])]
)
ratio_diff = (
abs(
ratio
- parse_settings.species_expected_ratio[species][
condition_comp_id
]
)
* 100
)

# There is a loop that adds resulting ratios, if already
# exists than concat to the existing DF, otherwise
# keyexception and make df
try:
ratio_dict[condition_comp_id + "_ratio"] = pd.concat(
[ratio, ratio_dict[condition_comp_id + "_ratio"]]
)
ratio_dict[condition_comp_id + "_expected_ratio_diff"] = pd.concat(
[
ratio_dict[condition_comp_id + "_expected_ratio_diff"],
ratio_diff,
]
)
except KeyError:
ratio_dict[condition_comp_id + "_ratio"] = ratio
ratio_dict[condition_comp_id + "_expected_ratio_diff"] = ratio_diff
ratio_df = pd.DataFrame(ratio_dict)

intermediate = pd.concat([cv_replicate_quant_species_df, ratio_df], axis=1)

intermediate.rename(columns=parse_settings.run_mapper, inplace=True)

return intermediate
res = Module.generate_intermediate_V3(filtered_df, res, parse_settings)
return res

def strip_sequence_wombat(self, seq: str) -> str:
"""Remove parts of the peptide sequence that contain modifications."""
Expand Down
24 changes: 8 additions & 16 deletions proteobench/modules/dda_quant/plot.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from streamlit_plotly_events import plotly_events

Expand All @@ -18,33 +18,26 @@ def plot_bench(self, result_df: pd.DataFrame) -> go.Figure:
)
fig = px.histogram(
result_df,
x=np.log2(result_df["1|2_ratio"]),
x=result_df["log2_A_vs_B"],
color="kind",
marginal="rug",
histnorm="probability density",
barmode="overlay",
opacity=0.7,
nbins=100
nbins=100,
)

fig.update_layout(
width=700,
height=700,
# title="Distplot",
xaxis=dict(
title="1|2_ratio",
color="white",
gridwidth=2))

fig.update_yaxes(title="Density",
color="white",
gridwidth=2)

fig.update_layout(
width=700,
height=700
xaxis=dict(title="log2_A_vs_B", color="white", gridwidth=2),
)

fig.update_yaxes(title="Density", color="white", gridwidth=2)

fig.update_layout(width=700, height=700)

fig.update_xaxes(range=[-4, 4])
fig.update_xaxes(showgrid=True, gridcolor="lightgray", gridwidth=1)
fig.update_yaxes(showgrid=True, gridcolor="lightgray", gridwidth=1)
Expand Down Expand Up @@ -98,7 +91,6 @@ def plot_metric(self, benchmark_metrics_df: pd.DataFrame) -> go.Figure:
for idx, row in benchmark_metrics_df.iterrows()
]


mapping = {"old": 10, "new": 20}

fig = go.Figure(
Expand Down
Loading

0 comments on commit 3e10488

Please sign in to comment.