Skip to content

Commit

Permalink
Merge pull request #264 from Proteobench/fragpipe_params_upload
Browse files Browse the repository at this point in the history
Fragpipe params upload
  • Loading branch information
wolski authored Mar 11, 2024
2 parents 2435449 + ff277d9 commit 953dbb9
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 67 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"[python]": {
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": true
"source.organizeImports": "explicit"
},
"editor.rulers": [
88
Expand Down
3 changes: 2 additions & 1 deletion docs/modules/3-DDA-Quantification-ion-level.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ The module is flexible in terms of what workflow the participants can run. Howev

When you have successfully uploaded and visualized a benchmark run, we strongly encourage you to add the result to the online repository. This way, your run will be available to the entire community and can be compared to all other uploaded benchmark runs. By doing so, your workflow outputs, parameters and calculated metrics will be stored and publicly available.

To submit your run for public usage, you need to upload the parameter file associated to your run in the field `Meta data for searches`. Currently, we accept outputs from MaxQuant and Proline (see bellow for more tool-specific details). We are working on adding FragPipe outputs. Please fill the `Comments for submission` if needed, and confirm that the metadata is correct (correspond to the benchmark run) before pressing the button `I really want to upload it`.
To submit your run for public usage, you need to upload the parameter file associated to your run in the field `Meta data for searches`. Currently, we accept outputs from MaxQuant, FragPipe and Proline (see below for more tool-specific details). Please fill the `Comments for submission` if needed, and confirm that the metadata is correct (corresponds to the benchmark run) before pressing the button `I really want to upload it`.

After upload, you will get a link to the pull request associated with your data. Please copy it and save it. With this link, you can get the unique identifier of your run (for example "Proline__20240106_141919"), and follow the advancement of your submission and add comments to communicate with the ProteoBench maintainers. If everything looks good, your submission will be reviewed and accepted (it will take a few working days). Then, your benchmark run will be added to the public runs of this module and plotted alongside all other benchmark runs in the figure.

Expand All @@ -60,6 +60,7 @@ After upload, you will get a link to the pull request associated with your data.
2. Following import of raw files, assign experiments "by File Name" right above the list of raw files.
3. **Make sure contaminants are not added when you add decoys to the database**.
4. Upload "combined_ion/modified_peptides.tsv" in order for Proteobench to calculate the ion ratios. Parameter files are not yet implemented in ProteoBench, but we are working on it.
For public submission, please provide the ".params" and the ".workflow" files that correspond to your search.

### i2MassChroQ
-- available soon --
Expand Down
114 changes: 74 additions & 40 deletions proteobench/io/params/fragger.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
"""Functionality to parse FragPipe fragger.params parameter files.
FragPipe has a text based paramter file format which
separates paramters and their value using an equal sign. Optional comments are
FragPipe has a text-based parameter file format which
separates parameters and their value using an equal sign. Optional comments are
expressed with a hash sign.
"""

from __future__ import annotations

import logging
import re
from collections import namedtuple
from io import BytesIO
from pathlib import Path

import pandas as pd
Expand All @@ -22,52 +24,72 @@
VERSION_NO_PATTERN = r"\d+(\.\d+)*"


def parse_params(l_of_str: list[str], sep: str = " = ") -> list[Parameter]:
    """Parse FragPipe-style parameter lines into a list of records.

    Each non-blank, non-comment line yields one ``Parameter`` record of
    ``(parameter, value, comment)``. *sep* separates the parameter name
    from its value; an optional trailing comment follows a ``#``.

    Parameters
    ----------
    l_of_str : list[str]
        The lines of the parameter file.
    sep : str
        Separator between name and value (``' = '`` for fragger.params,
        ``'='`` for the FragPipe workflow file).

    Returns
    -------
    list[Parameter]
        One record per parsed line; ``value`` and/or ``comment`` are
        ``None`` when absent.
    """
    data = []
    for line in l_of_str:
        line = line.strip()
        logger.debug(line)
        # Whole-line comments and blank lines carry no parameter.
        if line.startswith("#"):
            continue
        if not line:
            continue
        if "#" in line:
            # Split only on the FIRST '#': a comment containing further
            # '#' characters would otherwise make the two-element unpack
            # raise a ValueError.
            param, comment = [x.strip() for x in line.split("#", maxsplit=1)]
        else:
            param = line
            comment = None
        res = param.strip().split(sep, maxsplit=1)
        if len(res) == 1:
            # Parameter without a value (separator not found on the line).
            param = res[0].strip()
            data.append(Parameter(param, None, comment))
            continue
        param, value = [x.strip() for x in res]
        data.append(Parameter(param, value, comment))
    return data


def extract_params(file: str, f_fragpipe_workflow) -> ProteoBenchParameters:
msfragger_params = read_file(file)
def read_msfragger_params(file: BytesIO, sep: str = " = ") -> list[Parameter]:
    """Decode an MSFragger ``fragger.params`` file and parse its records."""
    lines = file.read().decode("utf-8").splitlines()
    return parse_params(lines, sep=sep)


def read_fragpipe_workflow(file: BytesIO, sep: str = "=") -> tuple[str, list[Parameter]]:
    """Parse a FragPipe ``.workflow`` file.

    The first line is the FragPipe version header (its leading ``#`` is
    stripped); all lines are then parsed as parameter records (the header
    line itself starts with ``#`` and is skipped by ``parse_params``).

    Returns
    -------
    tuple[str, list[Parameter]]
        The header string and the parsed parameter records.

    Note: the original annotation claimed ``list[Parameter]`` but the
    function has always returned a ``(header, records)`` tuple — the
    annotation is corrected here; runtime behavior is unchanged.
    """
    l_of_str = file.read().decode("utf-8").splitlines()
    header = l_of_str[0][1:].strip()
    return header, parse_params(l_of_str, sep=sep)


def extract_params(file: BytesIO, file1: BytesIO) -> ProteoBenchParameters:
# ! make it possible to pass files in both orders
msfragger_params, fragpipe_params = None, None
for f_ in [file, file1]:
print("file:", f_)
f_suffix = Path(f_.name).suffix
if f_suffix == ".params":
if msfragger_params is not None:
raise ValueError("MSFragger params file already parsed.")
msfragger_params = read_msfragger_params(f_)
elif f_suffix == ".workflow":
if fragpipe_params is not None:
raise ValueError("FragPipe workflow file already parsed.")
header, fragpipe_params = read_fragpipe_workflow(f_)
else:
raise ValueError("File extension not recognized.")

msfragger_params = pd.DataFrame.from_records(msfragger_params, columns=Parameter._fields).set_index(
Parameter._fields[0]
)
fragpipe_params = read_file(f_fragpipe_workflow, sep="=")
fragpipe_params = pd.DataFrame.from_records(fragpipe_params, columns=Parameter._fields).set_index(
Parameter._fields[0]
)

# FragPipe version in first line
with open(f_fragpipe_workflow) as f:
header = next(iter(f))[1:].strip()

match = re.search(VERSION_NO_PATTERN, header)

if match:
Expand All @@ -94,8 +116,17 @@ def extract_params(file: str, f_fragpipe_workflow) -> ProteoBenchParameters:
params.min_peptide_length = msfragger_params.loc["digest_min_length", "value"]
params.max_peptide_length = msfragger_params.loc["digest_max_length", "value"]

params.precursor_mass_tolerance = msfragger_params.loc["precursor_true_tolerance", "value"]
params.fragment_mass_tolerance = msfragger_params.loc["fragment_mass_tolerance", "value"]
precursor_mass_units = "Da"
if int(msfragger_params.loc["precursor_mass_units", "value"]):
precursor_mass_units = "ppm"
params.precursor_mass_tolerance = (
f'{msfragger_params.loc["precursor_true_tolerance", "value"]} {precursor_mass_units}'
)

fragment_mass_units = "Da"
if int(msfragger_params.loc["fragment_mass_units", "value"]):
fragment_mass_units = "ppm"
params.fragment_mass_tolerance = f'{msfragger_params.loc["fragment_mass_tolerance", "value"]} {fragment_mass_units}'
# ! ionquant is not necessarily fixed?
params.ident_fdr_protein = fragpipe_params.loc["ionquant.proteinfdr", "value"]
params.ident_fdr_peptide = fragpipe_params.loc["ionquant.peptidefdr", "value"]
Expand All @@ -121,16 +152,19 @@ def extract_params(file: str, f_fragpipe_workflow) -> ProteoBenchParameters:
from pprint import pprint

file = pathlib.Path("../../../test/params/fragger.params")
data = read_file(file)
with open(file, "rb") as f:
data = read_msfragger_params(f)
df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(Parameter._fields[0])
df.to_csv(file.with_suffix(".csv"))

file_fragpipe = pathlib.Path("../../../test/params/fragpipe.workflow")
data = read_file(file_fragpipe, sep="=")
with open(file_fragpipe, "rb") as f:
_, data = read_fragpipe_workflow(f)
df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(Parameter._fields[0])
df.to_csv(file_fragpipe.with_suffix(".csv"))

params = extract_params(file, file_fragpipe)
with open(file, "rb") as f, open(file_fragpipe, "rb") as f2:
params = extract_params(f, f2)
pprint(params.__dict__)
series = pd.Series(params.__dict__)
series.to_csv(file.parent / "fragger_extracted_params.csv")
11 changes: 6 additions & 5 deletions proteobench/modules/dda_quant_base/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from proteobench.github.gh import clone_repo, pr_github, read_results_json_repo
from proteobench.io.params import ProteoBenchParameters
from proteobench.io.params.alphapept import extract_params as extract_params_alphapept
from proteobench.io.params.fragger import extract_params as extract_params_fragger
from proteobench.io.params.maxquant import extract_params as extract_params_maxquant
from proteobench.io.params.proline import extract_params as extract_params_proline
from proteobench.io.params.sage import extract_params as extract_params_sage
Expand Down Expand Up @@ -48,6 +49,7 @@ def is_implemented(self) -> bool:
"Proline": extract_params_proline,
"AlphaPept": extract_params_alphapept,
"Sage": extract_params_sage,
"FragPipe": extract_params_fragger,
}

@staticmethod
Expand Down Expand Up @@ -155,8 +157,6 @@ def compute_group_stats(
def compute_epsilon(withspecies, species_expected_ratio):
# for all columns named parse_settings.species_dict.values() compute the sum over the rows and add it to a new column "unique"
withspecies["unique"] = withspecies[species_expected_ratio.keys()].sum(axis=1)
# create a list tabulating how many entries in withspecies["unique"] are 1,2,3,4,5,6
unique_counts = withspecies["unique"].value_counts()

# now remove all rows with withspecies["unique"] > 1
withspecies = withspecies[withspecies["unique"] == 1]
Expand Down Expand Up @@ -430,9 +430,10 @@ def write_intermediate_raw(self, dir, ident, input_df, result_performance, param
input_df.to_csv(os.path.join(path_write, "input_df.csv"))
result_performance.to_csv(os.path.join(path_write, "result_performance.csv"))

def load_params_file(self, input_file: list[str], input_format: str) -> ProteoBenchParameters:
    """Load search parameters from one or more uploaded metadata files.

    Parameters
    ----------
    input_file : list[str]
        Uploaded parameter file(s). Some tools (e.g. FragPipe) require
        two files, so the list is unpacked into the extractor.
    input_format : str
        Software tool name; selects the extractor from
        ``EXTRACT_PARAMS_DICT``.

    Returns
    -------
    ProteoBenchParameters
        Parsed parameters with ``software_name`` set to *input_format*.

    Raises
    ------
    KeyError
        If *input_format* has no registered extractor.
    """
    # Files are forwarded in upload order; the extractor must accept them
    # in any order (the FragPipe extractor dispatches on file suffix) —
    # NOTE(review): confirm this holds for other multi-file tools.
    params = self.EXTRACT_PARAMS_DICT[input_format](*input_file)
    params.software_name = input_format
    return params
4 changes: 2 additions & 2 deletions test/params/fragger_extracted_params.csv
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ ident_fdr_psm,0.01
ident_fdr_peptide,0.01
ident_fdr_protein,0.01
enable_match_between_runs,True
precursor_mass_tolerance,20
fragment_mass_tolerance,20
precursor_mass_tolerance,20 ppm
fragment_mass_tolerance,20 ppm
enzyme,stricttrypsin
allowed_miscleavages,2
min_peptide_length,7
Expand Down
34 changes: 21 additions & 13 deletions test/test_parse_params_fragger.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,44 @@
from pathlib import Path

import pandas as pd
import pytest

import proteobench.io.params.fragger as fragger_params

TESTDATA_DIR = Path(__file__).parent / "params"

# ! currently fragpipe with msfragger has two parameter/configuration files per run
fnames = ["fragger.params", "fragpipe.workflow"]
fnames = [TESTDATA_DIR / fname for fname in fnames]

fnames = [(fname, fname.with_suffix(".json")) for fname in fnames]


@pytest.mark.parametrize("file,csv_expected", fnames)
def test_read_file(file, csv_expected):
def test_read_msfragger_params():
    """Parsed fragger.params must round-trip to the expected CSV fixture."""
    params_path = TESTDATA_DIR / "fragger.params"
    expected = pd.read_csv(TESTDATA_DIR / "fragger.csv", index_col=0)
    with open(params_path, "rb") as handle:
        records = fragger_params.read_msfragger_params(handle)
    fields = fragger_params.Parameter._fields
    actual = pd.DataFrame.from_records(records, columns=fields).set_index(fields[0])
    # Round-trip through CSV so dtypes match the fixture exactly.
    actual = pd.read_csv(io.StringIO(actual.to_csv()), index_col=0).squeeze("columns")
    assert actual.equals(expected)


def test_read_fragpipe_workflow():
    """Parsed fragpipe.workflow must round-trip to the expected CSV fixture."""
    workflow_path = TESTDATA_DIR / "fragpipe.workflow"
    expected = pd.read_csv(TESTDATA_DIR / "fragpipe.csv", index_col=0)
    with open(workflow_path, "rb") as handle:
        _, records = fragger_params.read_fragpipe_workflow(handle)
    fields = fragger_params.Parameter._fields
    actual = pd.DataFrame.from_records(records, columns=fields).set_index(fields[0])
    # Round-trip through CSV so dtypes match the fixture exactly.
    actual = pd.read_csv(io.StringIO(actual.to_csv()), index_col=0).squeeze("columns")
    assert actual.equals(expected)


def test_extract_params():
    """extract_params on both FragPipe files must match the stored fixture."""
    fixture = TESTDATA_DIR / "fragger_extracted_params.csv"
    expected = pd.read_csv(fixture, index_col=0).squeeze("columns")
    params_path = TESTDATA_DIR / "fragger.params"
    workflow_path = TESTDATA_DIR / "fragpipe.workflow"
    with open(params_path, "rb") as f1, open(workflow_path, "rb") as f2:
        extracted = fragger_params.extract_params(f1, f2)
    actual = pd.Series(extracted.__dict__)
    # Round-trip through CSV so dtypes match the fixture exactly.
    actual = pd.read_csv(io.StringIO(actual.to_csv()), index_col=0).squeeze("columns")
    assert expected.equals(actual)
1 change: 0 additions & 1 deletion webinterface/Home.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""psm_utils Streamlit-based web server."""

import streamlit as st
from _base import StreamlitPage


Expand Down
11 changes: 7 additions & 4 deletions webinterface/pages/DDA_Quant_ion.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import uuid
from datetime import datetime
from pprint import pformat

import plotly.express as px
import plotly.graph_objects as go
Expand Down Expand Up @@ -179,14 +180,14 @@ def _main_page(self):
search. This is important when using MaxQuant and FragPipe, among other tools.
"""
)
self.user_input["input_csv"] = st.file_uploader(
"Software tool result file", help=self.texts.Help.input_file
)

self.user_input["input_format"] = st.selectbox(
"Software tool", INPUT_FORMATS, help=self.texts.Help.input_format
)

self.user_input["input_csv"] = st.file_uploader(
"Software tool result file", help=self.texts.Help.input_file
)

# self.user_input["pull_req"] = st.text_input(
# "Open pull request to make results available to everyone (type \"YES\" to enable)",
# "NO"
Expand Down Expand Up @@ -590,6 +591,7 @@ def generate_results(
"Meta data for searches",
help=self.texts.Help.meta_data_file,
key=meta_file_uploader_uuid,
accept_multiple_files=True,
)

self.user_input["comments_for_submission"] = st.text_area(
Expand All @@ -612,6 +614,7 @@ def generate_results(
try:
print(self.user_input["input_format"])
params = IonModule().load_params_file(self.user_input[META_DATA], self.user_input["input_format"])
st.text(f"Parsed and selected parameters:\n{pformat(params.__dict__)}")
except KeyError as e:
st.error("Parsing of meta parameters file for this software is not supported yet.")
# except Exception as err:
Expand Down

0 comments on commit 953dbb9

Please sign in to comment.