From 3e10488211e318e499093e70e63c69768a8cac4f Mon Sep 17 00:00:00 2001 From: Witold Wolski Date: Thu, 23 Nov 2023 17:13:02 +0100 Subject: [PATCH] simplified calculate_plot_data and generate_intermediate methods also generate_intermediate now Computes CV (adresses #40) closes #133 #113 --- proteobench/modules/dda_quant/datapoint.py | 21 +--- .../io_parse_settings/module_settings.toml | 6 +- .../parse_settings_alphapept.toml | 12 +- .../parse_settings_custom.toml | 12 +- .../parse_settings_maxquant.toml | 12 +- .../parse_settings_msfragger.toml | 12 +- .../parse_settings_proline.toml | 12 +- .../parse_settings_sage.toml | 12 +- .../parse_settings_wombat.toml | 12 +- proteobench/modules/dda_quant/module.py | 118 ++---------------- proteobench/modules/dda_quant/plot.py | 24 ++-- test/test_module_dda_quant.py | 39 +++--- 12 files changed, 87 insertions(+), 205 deletions(-) diff --git a/proteobench/modules/dda_quant/datapoint.py b/proteobench/modules/dda_quant/datapoint.py index 8459245b..a304eb84 100644 --- a/proteobench/modules/dda_quant/datapoint.py +++ b/proteobench/modules/dda_quant/datapoint.py @@ -1,12 +1,14 @@ import json -import numpy as np from dataclasses import asdict, dataclass from datetime import datetime +import numpy as np + @dataclass class Datapoint: """Data used to stored the""" + # TODO add threshold value used for presence ion/peptidoform id: str = None search_engine: str = None @@ -39,21 +41,8 @@ def calculate_missing_quan_prec(self, df, nr_missing_0): return nr_quan_prec_missing def calculate_plot_data(self, df): - species = ["YEAST", "HUMAN", "ECOLI"] - prop_ratios = [] - sum_ratios = 0 - nr_missing_0 = 0 - for spec in species: - f = len(df[df[spec] == True]) - sum_s = np.nan_to_num(df[df[spec] == True]["1|2_expected_ratio_diff"], nan=0, neginf=-1000, posinf=1000).sum() - ratio = sum_s / f - prop_ratio = (f / len(df)) * ratio - prop_ratios.append(prop_ratio) - sum_ratios += prop_ratio - nr_missing_0 += f - - # TODO rename/document code - self.weighted_sum = round(sum_ratios, ndigits=3) + # compute mean of epsilon column in df + self.weighted_sum = round(df["epsilon"].mean(), ndigits=3) self.nr_prec = len(df) def generate_id(self): diff --git a/proteobench/modules/dda_quant/io_parse_settings/module_settings.toml b/proteobench/modules/dda_quant/io_parse_settings/module_settings.toml index 320fedc3..a45f4924 100644 --- a/proteobench/modules/dda_quant/io_parse_settings/module_settings.toml +++ b/proteobench/modules/dda_quant/io_parse_settings/module_settings.toml @@ -1,12 +1,12 @@ [species_expected_ratio] [species_expected_ratio.YEAST] -"1|2" = 2.0 +"A_vs_B" = 2.0 [species_expected_ratio.ECOLI] -"1|2" = 0.25 +"A_vs_B" = 0.25 [species_expected_ratio.HUMAN] -"1|2" = 1.0 +"A_vs_B" = 1.0 [general] min_count_multispec = 1 \ No newline at end of file diff --git a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_alphapept.toml b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_alphapept.toml index 4c1757e8..74dc9543 100644 --- a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_alphapept.toml +++ b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_alphapept.toml @@ -8,12 +8,12 @@ decoy = "Reverse" ms1_int_sum_apex_dn = "Intensity" [replicate_mapper] -LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = 1 -LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = 1 -LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = 1 -LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = 2 -LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = 2 -LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = 2 +LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = "A" +LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = "A" +LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = "A" +LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = "B" +LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = "B" +LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = "B" [run_mapper] LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = "Condition_A_Sample_Alpha_01" diff --git a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_custom.toml b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_custom.toml index 8264e393..a94a8ea4 100644 --- a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_custom.toml +++ b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_custom.toml @@ -5,12 +5,12 @@ Charge = "Charge" [replicate_mapper] -LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = 1 -LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = 1 -LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = 1 -LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = 2 -LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = 2 -LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = 2 +LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = "A" +LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = "A" +LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = "A" +LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = "B" +LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = "B" +LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = "B" [run_mapper] LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = "Condition_A_Sample_Alpha_01" diff --git a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_maxquant.toml b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_maxquant.toml index 8ccdb521..ab55cac3 100644 --- a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_maxquant.toml +++ b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_maxquant.toml @@ -6,12 +6,12 @@ Proteins = "Proteins" Charge = "Charge" [replicate_mapper] -LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = 1 -LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = 1 -LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = 1 -LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = 2 -LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = 2 -LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = 2 +LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01 = "A" +LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02 = "A" +LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03 = "A" +LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01 = "B" +LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02 = "B" +LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03 = "B" [run_mapper] "LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01" = "Condition_A_Sample_Alpha_01" diff --git a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_msfragger.toml b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_msfragger.toml index 2154a450..0f05780d 100644 --- a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_msfragger.toml +++ b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_msfragger.toml @@ -5,12 +5,12 @@ Protein = "Proteins" Charge = "Charge" [replicate_mapper] -"A_1 Intensity" = 1 -"A_2 Intensity" = 1 -"A_3 Intensity" = 1 -"B_1 Intensity" = 2 -"B_2 Intensity" = 2 -"B_3 Intensity" = 2 +"A_1 Intensity" = "A" +"A_2 Intensity" = "A" +"A_3 Intensity" = "A" +"B_1 Intensity" = "B" +"B_2 Intensity" = "B" +"B_3 Intensity" = "B" [run_mapper] "A_1 Intensity" = "Condition_A_Sample_Alpha_01" diff --git a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_proline.toml b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_proline.toml index 24a82df3..d8c8fedf 100644 --- a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_proline.toml +++ b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_proline.toml @@ -6,12 +6,12 @@ sequence = "Sequence" [replicate_mapper] -abundance_DDA_Condition_A_Sample_Alpha_01 = 1 -abundance_DDA_Condition_A_Sample_Alpha_02 = 1 -abundance_DDA_Condition_A_Sample_Alpha_03 = 1 -abundance_DDA_Condition_B_Sample_Alpha_01 = 2 -abundance_DDA_Condition_B_Sample_Alpha_02 = 2 -abundance_DDA_Condition_B_Sample_Alpha_03 = 2 +abundance_DDA_Condition_A_Sample_Alpha_01 = "A" +abundance_DDA_Condition_A_Sample_Alpha_02 = "A" +abundance_DDA_Condition_A_Sample_Alpha_03 = "A" +abundance_DDA_Condition_B_Sample_Alpha_01 = "B" +abundance_DDA_Condition_B_Sample_Alpha_02 = "B" +abundance_DDA_Condition_B_Sample_Alpha_03 = "B" [run_mapper] "abundance_DDA_Condition_A_Sample_Alpha_01" = "Condition_A_Sample_Alpha_01" diff --git a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_sage.toml b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_sage.toml index d30efd53..fe84decf 100644 --- a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_sage.toml +++ b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_sage.toml @@ -4,12 +4,12 @@ "charge" = "Charge" [replicate_mapper] -"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML.gz" = 1 -"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML.gz" = 1 -"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML.gz" = 1 -"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML.gz" = 2 -"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML.gz" = 2 -"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML.gz" = 2 +"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML.gz" = "A" +"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML.gz" = "A" +"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML.gz" = "A" +"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML.gz" = "B" +"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML.gz" = "B" +"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML.gz" = "B" [run_mapper] "LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML" = "Condition_A_Sample_Alpha_01" diff --git a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_wombat.toml b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_wombat.toml index 9d4e73c9..31909674 100644 --- a/proteobench/modules/dda_quant/io_parse_settings/parse_settings_wombat.toml +++ b/proteobench/modules/dda_quant/io_parse_settings/parse_settings_wombat.toml @@ -3,12 +3,12 @@ protein_group = "Proteins" "modified_peptide" = "Modified sequence" [replicate_mapper] -abundance_A_1 = 1 -abundance_A_2 = 1 -abundance_A_3 = 1 -abundance_B_1 = 2 -abundance_B_2 = 2 -abundance_B_3 = 2 +abundance_A_1 = "A" +abundance_A_2 = "A" +abundance_A_3 = "A" +abundance_B_1 = "B" +abundance_B_2 = "B" +abundance_B_3 = "B" [run_mapper] abundance_A_1 = "Condition_A_Sample_Alpha_01" diff --git a/proteobench/modules/dda_quant/module.py b/proteobench/modules/dda_quant/module.py index 928098cd..8aa2e90e 100644 --- a/proteobench/modules/dda_quant/module.py +++ b/proteobench/modules/dda_quant/module.py @@ -105,8 +105,8 @@ def generate_intermediate_V2( for x in quant_raw_df.columns ] - quant_raw_df["1|2_ratio"] = ( - quant_raw_df["log_Intensity_mean_1"] / quant_raw_df["log_Intensity_mean_2"] + quant_raw_df["log2_A_vs_B"] = ( + quant_raw_df["log_Intensity_mean_A"] - quant_raw_df["log_Intensity_mean_B"] ) quant_raw_df = pd.merge( quant_raw_df, intensities_wide, on="peptidoform", how="inner" @@ -139,117 +139,21 @@ def generate_intermediate_V3(filtered_df, quant_df, parse_settings): withspecies.loc[withspecies[species] == True, "species"] = species withspecies.loc[ withspecies[species] == True, "expectedRatio" - ] = species_expected_ratio[species]["1|2"] + ] = species_expected_ratio[species]["A_vs_B"] + withspecies["epsilon"] = ( + withspecies["log2_A_vs_B"] - withspecies["expectedRatio"] + ) return withspecies def generate_intermediate( - self, - filtered_df, - replicate_to_raw: dict, - parse_settings: ParseSettings, - precursor="peptidoform", + self, filtered_df, replicate_to_raw: dict, parse_settings: ParseSettings ) -> pd.DataFrame: - """Take the generic format of data search output and convert it to get the quantification data (a tuple, the quantification measure and the reliability of it).""" - - species_peptidoform = list(parse_settings.species_dict.values()) - # species_peptidoform.append("peptidoform") - - filtered_df_p1 = filtered_df[["Raw file", precursor, "Intensity"]].copy() - - # Summarize values of the same peptide using mean - # TODO should we take the mean or sum of the same peptidoform/peptideions same raw file multiple intensities - quant_raw_df = filtered_df_p1.groupby([precursor, "Raw file"]).Intensity.mean() - quant_df = quant_raw_df.unstack(level=1) - - # Count number of values per peptidoform and Raw file - # TODO calculate this on the log2 transformed values - for replicate, replicate_runs in replicate_to_raw.items(): - selected_replicate_df = quant_raw_df.index.get_level_values( - "Raw file" - ).isin(replicate_runs) - replicate_quant_df = quant_raw_df[selected_replicate_df] - ## Add means of replicates - mean_series = replicate_quant_df.groupby([precursor]).mean() - # change indices of mean_series from peptidoform to multiindices containing peptidoform,replicate - quant_df["mean_of_" + str(replicate)] = mean_series - - ## Add number of missing values per row of replicate - # TODO keep missing values, filter later for calculation of ratios - missing_series = replicate_quant_df.isna().groupby([precursor]).sum() - quant_df["missing_values_" + str(replicate)] = missing_series - return quant_df - - @staticmethod - def generate_intermediate_V4(filtered_df, quant_df, parse_settings): - species_peptidoform = list(parse_settings.species_dict.values()) - species_peptidoform.append("peptidoform") - # TODO check, do we need to drop_duplicates? When? - peptidoform_to_species = filtered_df[species_peptidoform].drop_duplicates() - # merge dataframes quant_df and species_quant_df and peptidoform_to_species - withspecies = pd.merge( - quant_df, peptidoform_to_species, on="peptidoform", how="inner" + res = Module.generate_intermediate_V2( + filtered_df, replicate_to_raw, parse_settings ) - - peptidoform_to_species.index = peptidoform_to_species["peptidoform"] - peptidoform_to_species_dict = peptidoform_to_species.T.to_dict() - - species_quant_df = pd.DataFrame( - [peptidoform_to_species_dict[idx] for idx in quant_df.index] - ) - species_quant_df.set_index("peptidoform", drop=True, inplace=True) - """Calculate the quantification ratios and compare them to the expected ratios.""" - cv_replicate_quant_species_df = pd.concat([quant_df, species_quant_df], axis=1) - - ratio_dict = {} - for species in parse_settings.species_dict.values(): - species_df_slice = cv_replicate_quant_species_df[ - cv_replicate_quant_species_df[species] == True - ] - # TODO add cutoffs for different thresholds presence of peptide ion - # TODO do substraction for log2 transformed - for conditions in itertools.combinations( - set(parse_settings.replicate_mapper.values()), 2 - ): - condition_comp_id = "|".join(map(str, conditions)) - - ratio = ( - species_df_slice["mean_of_" + str(conditions[0])] - / species_df_slice["mean_of_" + str(conditions[1])] - ) - ratio_diff = ( - abs( - ratio - - parse_settings.species_expected_ratio[species][ - condition_comp_id - ] - ) - * 100 - ) - - # There is a loop that adds resulting ratios, if already - # exists than concat to the existing DF, otherwise - # keyexception and make df - try: - ratio_dict[condition_comp_id + "_ratio"] = pd.concat( - [ratio, ratio_dict[condition_comp_id + "_ratio"]] - ) - ratio_dict[condition_comp_id + "_expected_ratio_diff"] = pd.concat( - [ - ratio_dict[condition_comp_id + "_expected_ratio_diff"], - ratio_diff, - ] - ) - except KeyError: - ratio_dict[condition_comp_id + "_ratio"] = ratio - ratio_dict[condition_comp_id + "_expected_ratio_diff"] = ratio_diff - ratio_df = pd.DataFrame(ratio_dict) - - intermediate = pd.concat([cv_replicate_quant_species_df, ratio_df], axis=1) - - intermediate.rename(columns=parse_settings.run_mapper, inplace=True) - - return intermediate + res = Module.generate_intermediate_V3(filtered_df, res, parse_settings) + return res def strip_sequence_wombat(self, seq: str) -> str: """Remove parts of the peptide sequence that contain modifications.""" diff --git a/proteobench/modules/dda_quant/plot.py b/proteobench/modules/dda_quant/plot.py index 990f2cd3..f5ab86d5 100644 --- a/proteobench/modules/dda_quant/plot.py +++ b/proteobench/modules/dda_quant/plot.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -import plotly.graph_objects as go import plotly.express as px +import plotly.graph_objects as go import streamlit as st from streamlit_plotly_events import plotly_events @@ -18,33 +18,26 @@ def plot_bench(self, result_df: pd.DataFrame) -> go.Figure: ) fig = px.histogram( result_df, - x=np.log2(result_df["1|2_ratio"]), + x=result_df["log2_A_vs_B"], color="kind", marginal="rug", histnorm="probability density", barmode="overlay", opacity=0.7, - nbins=100 + nbins=100, ) fig.update_layout( width=700, height=700, # title="Distplot", - xaxis=dict( - title="1|2_ratio", - color="white", - gridwidth=2)) - - fig.update_yaxes(title="Density", - color="white", - gridwidth=2) - - fig.update_layout( - width=700, - height=700 + xaxis=dict(title="log2_A_vs_B", color="white", gridwidth=2), ) + fig.update_yaxes(title="Density", color="white", gridwidth=2) + + fig.update_layout(width=700, height=700) + fig.update_xaxes(range=[-4, 4]) fig.update_xaxes(showgrid=True, gridcolor="lightgray", gridwidth=1) fig.update_yaxes(showgrid=True, gridcolor="lightgray", gridwidth=1) @@ -98,7 +91,6 @@ def plot_metric(self, benchmark_metrics_df: pd.DataFrame) -> go.Figure: for idx, row in benchmark_metrics_df.iterrows() ] - mapping = {"old": 10, "new": 20} fig = go.Figure( diff --git a/test/test_module_dda_quant.py b/test/test_module_dda_quant.py index 696ff6bc..9ecfb28f 100644 --- a/test/test_module_dda_quant.py +++ b/test/test_module_dda_quant.py @@ -115,30 +115,27 @@ def test_input_file_processing(self): intermediate = Module().generate_intermediate( prepared_df, replicate_to_raw, parse_settings ) - - intermediate = Module.generate_intermediate_V4( - prepared_df, intermediate, parse_settings - ) - self.assertFalse(intermediate.empty) - def test_input_file_processing_V2(self): - """Test the processing of the input files.""" - for format_name in self.supported_formats: - input_df = load_file(format_name) - parse_settings = ParseSettings(format_name) - prepared_df, replicate_to_raw = ParseInputs().convert_to_standard_format( - input_df, parse_settings - ) - - - intermediate = Module.generate_intermediate_V2( - prepared_df, replicate_to_raw, parse_settings - ) + self.assertFalse(intermediate.empty) - # Get quantification data - Module.generate_intermediate_V3(prepared_df, intermediate, parse_settings) + def test_benchmarking(self): + user_input = { + "version": "1.0", + "fdr_psm": 0.01, + "fdr_peptide": 0.05, + "fdr_protein": 0.1, + "mbr": 1, + "precursor_mass_tolerance": 0.02, + "precursor_mass_tolerance_unit": "Da", + "fragment_mass_tolerance": 0.02, + "fragment_mass_tolerance_unit": "Da", + "search_enzyme_name": "Trypsin", + "allowed_missed_cleavage": 1, + "min_peptide_length": 6, + "max_peptide_length": 30, + } + Module().benchmarking(TESTDATA_FILES["MaxQuant"], "MaxQuant", user_input, None) - self.assertFalse(intermediate.empty) class TestWrongFormatting(unittest.TestCase): """Simple tests that should break if the ."""