Merge pull request #252 from Proteobench/proline_fix_prot

Fix proline input
Proteobench · Feb 16, 2024 · 3aa64ee · 3aa64ee
2 parents 265ebb5 + f528012
commit 3aa64ee
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 9 deletions.
diff --git a/docs/modules/3-DDA-Quantification-ion-level.md b/docs/modules/3-DDA-Quantification-ion-level.md
@@ -73,11 +73,10 @@ In the recent versions of MaxQuant, the default settings work perfectly (`Identi
 Some older versions of MaxQuant do not provide the option to change fasta header parsing. These are not compatible with ProteoBench.
 
 ### Proline
-<!-- Use the raw file names as sample names. In the output, it will automatically remove "LFQ_Orbitrap_". -->
-<!-- For this module, use the excel exports. Make sure that the “Quantified peptide ions” tab contains the columns "samesets_accessions". -->
-<!-- The "Quantified peptide ions" tab reports validated PSMs, so precursor ion quantities (retrieved from XICs) are duplicated. This redundancy is removed before metric calculation. -->
-<!-- For public submission, you can upload the same excel export, just make sure to have the tabs "Search settings and infos", "Import and filters", "Quant config". -->
--- work in progress --
+Use the raw file names as sample names. In the output, it will automatically remove "LFQ_Orbitrap_".
+For this module, use the excel exports. Make sure that the “Quantified peptide ions” tab contains the columns "samesets_accessions" and "subsets_accessions". The accessions in these two fields are combined to determine which species a peptide sequence matches.
+The "Quantified peptide ions" tab reports validated PSMs, so precursor ion quantities (retrieved from XICs) are duplicated. This redundancy is removed before metric calculation.
+For public submission, you can upload the same excel export, just make sure to have the tabs "Search settings and infos", "Import and filters", "Quant config".
 
 ### Sage
 
@@ -107,7 +106,7 @@ the table must not contain non-validated ions. If you have any issue, contact us
 Each software tool produces specific output files formats. We made ``.toml`` files that describe where to find the information needed in each type of input. These can be found in `proteobench/modules/dda_quant/io_parse_settings`:
 
 - **[mapper]**
-mapping between the headers in the input file (left-hand side) and the header of the intermediate file generated by ProteoBench. 
+mapping between the headers in the input file (left-hand side) and the header of the intermediate file generated by ProteoBench. If more parsing is required before metrics calculation, this part can instead contain a mapping between intermediate column names (created during parsing) and the names in the intermediate file. This is the case for Proline, where protein accessions are reported in two independent columns that need to be combined. This should be commented in the toml.
 
   - "Raw file" = field that contains the raw file identifiers. **If the field "Raw file" is present, the table is parsed as a long format, otherwise it is parsed as wide format.**
 

diff --git a/proteobench/modules/dda_quant_ion/io_parse_settings/parse_settings_proline.toml b/proteobench/modules/dda_quant_ion/io_parse_settings/parse_settings_proline.toml
@@ -1,6 +1,6 @@
 [mapper]
-"samesets_accessions" = "Proteins"
-sequence = "Sequence"
+"proteins" = "Proteins" # from the intermediate file where "proteins" will contain the concatenation of all accessions from "samesets_accessions" and "subsets_accessions" reported for every ion in Proline input.
+"sequence" = "Sequence"
 "modifications" = "Modifications"
 "master_quant_peptide_ion_charge" = "Charge"
 

diff --git a/proteobench/modules/dda_quant_ion/module.py b/proteobench/modules/dda_quant_ion/module.py
@@ -46,12 +46,32 @@ def load_input_file(self, input_csv: str, input_format: str) -> pd.DataFrame:
                 header=0,
                 index_col=None,
             )
-            # TODO this should be generalized further, maybe even moved to parsing param in toml
             input_data_frame["modifications"].fillna("", inplace=True)
+            input_data_frame["subsets_accessions"].fillna("", inplace=True)
             input_data_frame["proforma"] = input_data_frame.apply(
                 lambda x: aggregate_modification_column(x.sequence, x.modifications),
                 axis=1,
             )
+            input_data_frame["proteins"] = input_data_frame["samesets_accessions"] + input_data_frame[
+                "subsets_accessions"
+            ].apply(lambda x: "; " + x if len(x) > 0 else "")
+
+            input_data_frame["proteins"] = input_data_frame["proteins"].apply(
+                lambda x: "; ".join(sorted(x.split("; ")))
+            )
+            input_data_frame.drop_duplicates(
+                subset=["proforma", "master_quant_peptide_ion_charge", "proteins"], inplace=True
+            )
+
+            group_cols = ["proforma", "master_quant_peptide_ion_charge"]
+            agg_funcs = {col: "first" for col in input_data_frame.columns if col not in group_cols + ["proteins"]}
+
+            input_data_frame = (
+                input_data_frame.groupby(group_cols)
+                .agg({"proteins": lambda x: "; ".join(x), **agg_funcs})
+                .reset_index()
+            )
+
         elif input_format == "i2MassChroQ":
             input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
             input_data_frame["proforma"] = input_data_frame["ProForma"]