From aed9a2051c9db8f0f94c3ab15261249ceeb2ac1d Mon Sep 17 00:00:00 2001
From: RobbinBouwmeester <robbin.bouwmeester@ugent.be>
Date: Fri, 16 Feb 2024 11:43:37 +0100
Subject: [PATCH 1/3] Fix proline input

---
 .../parse_settings_proline.toml               |  4 ++--
 proteobench/modules/dda_quant_ion/module.py   | 22 ++++++++++++++++++-
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/proteobench/modules/dda_quant_ion/io_parse_settings/parse_settings_proline.toml b/proteobench/modules/dda_quant_ion/io_parse_settings/parse_settings_proline.toml
index 3262d05c..4e9efd2e 100644
--- a/proteobench/modules/dda_quant_ion/io_parse_settings/parse_settings_proline.toml
+++ b/proteobench/modules/dda_quant_ion/io_parse_settings/parse_settings_proline.toml
@@ -1,6 +1,6 @@
 [mapper]
-"samesets_accessions" = "Proteins"
-sequence = "Sequence"
+"proteins" = "Proteins"
+"sequence" = "Sequence"
 "modifications" = "Modifications"
 "master_quant_peptide_ion_charge" = "Charge"
 
diff --git a/proteobench/modules/dda_quant_ion/module.py b/proteobench/modules/dda_quant_ion/module.py
index feff3f97..216352db 100644
--- a/proteobench/modules/dda_quant_ion/module.py
+++ b/proteobench/modules/dda_quant_ion/module.py
@@ -46,12 +46,32 @@ def load_input_file(self, input_csv: str, input_format: str) -> pd.DataFrame:
                 header=0,
                 index_col=None,
             )
-            # TODO this should be generalized further, maybe even moved to parsing param in toml
             input_data_frame["modifications"].fillna("", inplace=True)
+            input_data_frame["subsets_accessions"].fillna("", inplace=True)
             input_data_frame["proforma"] = input_data_frame.apply(
                 lambda x: aggregate_modification_column(x.sequence, x.modifications),
                 axis=1,
             )
+            input_data_frame["proteins"] = input_data_frame["samesets_accessions"] + input_data_frame[
+                "subsets_accessions"
+            ].apply(lambda x: "; " + x if len(x) > 0 else "")
+
+            input_data_frame["proteins"] = input_data_frame["proteins"].apply(
+                lambda x: "; ".join(sorted(x.split("; ")))
+            )
+            input_data_frame.drop_duplicates(
+                subset=["proforma", "master_quant_peptide_ion_charge", "proteins"], inplace=True
+            )
+
+            group_cols = ["proforma", "master_quant_peptide_ion_charge"]
+            agg_funcs = {col: "first" for col in input_data_frame.columns if col not in group_cols + ["proteins"]}
+
+            input_data_frame = (
+                input_data_frame.groupby(group_cols)
+                .agg({"proteins": lambda x: "; ".join(x), **agg_funcs})
+                .reset_index()
+            )
+
         elif input_format == "i2MassChroQ":
             input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
             input_data_frame["proforma"] = input_data_frame["ProForma"]

From c289f4ab19f47134b2c4ecd9506d82c87dbbfdb3 Mon Sep 17 00:00:00 2001
From: Marie Locard-Paulet <8386272+mlocardpaulet@users.noreply.github.com>
Date: Fri, 16 Feb 2024 12:55:09 +0100
Subject: [PATCH 2/3] amend documentation

---
 docs/modules/3-DDA-Quantification-ion-level.md | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/docs/modules/3-DDA-Quantification-ion-level.md b/docs/modules/3-DDA-Quantification-ion-level.md
index c5f3e864..aa9060ad 100644
--- a/docs/modules/3-DDA-Quantification-ion-level.md
+++ b/docs/modules/3-DDA-Quantification-ion-level.md
@@ -73,11 +73,10 @@ In the recent versions of MaxQuant, the default settings work perfectly (`Identi
 Some older versions of MaxQuant do not provide the option to change fasta header parsing. These are not compatible with ProteoBench.
 
 ### Proline
-<!-- Use the raw file names as sample names. In the output, it will automatically remove "LFQ_Orbitrap_". -->
-<!-- For this module, use the excel exports. Make sure that the “Quantified peptide ions” tab contains the columns "samesets_accessions". -->
-<!-- The "Quantified peptide ions" tab reports validated PSMs, so precursor ion quantities (retrieved from XICs) are duplicated. This redundancy is removed before metric calculation. -->
-<!-- For public submission, you can upload the same excel export, just make sure to have the tabs "Search settings and infos", "Import and filters", "Quant config". -->
--- work in progress --
+Use the raw file names as sample names. In the output, it will automatically remove "LFQ_Orbitrap_".
+For this module, use the excel exports. Make sure that the “Quantified peptide ions” tab contains the columns "samesets_accessions" and "subsets_accessions". The accessions in these two fields are combined to determine which species a peptide sequence matches.
+The "Quantified peptide ions" tab reports validated PSMs, so precursor ion quantities (retrieved from XICs) are duplicated. This redundancy is removed before metric calculation.
+For public submission, you can upload the same excel export, just make sure to have the tabs "Search settings and infos", "Import and filters", "Quant config".
 
 ### Sage
 

From f528012db8e7cdc6384319eee6cdb3211218efe7 Mon Sep 17 00:00:00 2001
From: Marie Locard-Paulet <8386272+mlocardpaulet@users.noreply.github.com>
Date: Fri, 16 Feb 2024 13:08:28 +0100
Subject: [PATCH 3/3] amend docs and comment toml

---
 docs/modules/3-DDA-Quantification-ion-level.md                  | 2 +-
 .../dda_quant_ion/io_parse_settings/parse_settings_proline.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/modules/3-DDA-Quantification-ion-level.md b/docs/modules/3-DDA-Quantification-ion-level.md
index aa9060ad..7f2522ed 100644
--- a/docs/modules/3-DDA-Quantification-ion-level.md
+++ b/docs/modules/3-DDA-Quantification-ion-level.md
@@ -106,7 +106,7 @@ the table must not contain non-validated ions. If you have any issue, contact us
 Each software tool produces specific output files formats. We made ``.toml`` files that describe where to find the information needed in each type of input. These can be found in `proteobench/modules/dda_quant/io_parse_settings`:
 
 - **[mapper]**
-mapping between the headers in the input file (left-hand side) and the header of the intermediate file generated by ProteoBench. 
+mapping between the headers in the input file (left-hand side) and the header of the intermediate file generated by ProteoBench. If more parsing is required before metrics calculation, this part can contain mapping between intermediate column names and the name in the intermediate file. This is the case for Proline, where protein accessions are reported in two independent columns that need to be combined. This should be commented in the toml.
 
   - "Raw file" = field that contains the raw file identifiers. **If the field "Raw file" is present, the table is parsed is a long format, otherwise it is parsed as wide format.**
  
diff --git a/proteobench/modules/dda_quant_ion/io_parse_settings/parse_settings_proline.toml b/proteobench/modules/dda_quant_ion/io_parse_settings/parse_settings_proline.toml
index 4e9efd2e..eddb9a1a 100644
--- a/proteobench/modules/dda_quant_ion/io_parse_settings/parse_settings_proline.toml
+++ b/proteobench/modules/dda_quant_ion/io_parse_settings/parse_settings_proline.toml
@@ -1,5 +1,5 @@
 [mapper]
-"proteins" = "Proteins"
+"proteins" = "Proteins" # from the intermediate file where "proteins" will contain the concatenation of all accessions from "samesets_accessions" and "subsets_accessions" reported for every ion in Proline input.
 "sequence" = "Sequence"
 "modifications" = "Modifications"
 "master_quant_peptide_ion_charge" = "Charge"