From 88ff9c845d768bca071f811ffee1608a58f191ef Mon Sep 17 00:00:00 2001 From: Henry Date: Fri, 1 Mar 2024 11:56:09 +0100 Subject: [PATCH 1/8] :art: allow more than one parameter file --- proteobench/modules/dda_quant_base/module.py | 7 ++++--- webinterface/pages/DDA_Quant_ion.py | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/proteobench/modules/dda_quant_base/module.py b/proteobench/modules/dda_quant_base/module.py index d281271a..426a366b 100644 --- a/proteobench/modules/dda_quant_base/module.py +++ b/proteobench/modules/dda_quant_base/module.py @@ -430,9 +430,10 @@ def write_intermediate_raw(self, dir, ident, input_df, result_performance, param input_df.to_csv(os.path.join(path_write, "input_df.csv")) result_performance.to_csv(os.path.join(path_write, "result_performance.csv")) - def load_params_file(self, input_file: str, input_format: str) -> ProteoBenchParameters: + def load_params_file(self, input_file: list[str], input_format: str) -> ProteoBenchParameters: """Method loads parameters from a metadata file depending on its format.""" - print(self.EXTRACT_PARAMS_DICT) - params = self.EXTRACT_PARAMS_DICT[input_format](input_file) + # ! adapted to be able to parse more than one file. + # ! how to ensure orrect order? 
+ params = self.EXTRACT_PARAMS_DICT[input_format](*input_file) params.software_name = input_format return params diff --git a/webinterface/pages/DDA_Quant_ion.py b/webinterface/pages/DDA_Quant_ion.py index ae663fff..6190a877 100644 --- a/webinterface/pages/DDA_Quant_ion.py +++ b/webinterface/pages/DDA_Quant_ion.py @@ -590,6 +590,7 @@ def generate_results( "Meta data for searches", help=self.texts.Help.meta_data_file, key=meta_file_uploader_uuid, + accept_multiple_files=True, ) self.user_input["comments_for_submission"] = st.text_area( From 85d2cf2d82019de3cabdcc2b698b9dec64923e6e Mon Sep 17 00:00:00 2001 From: Henry Date: Mon, 4 Mar 2024 11:32:01 +0100 Subject: [PATCH 2/8] :sparkles: Adapt to reading BufferIO obj, fix tests - one test was not working -> now explicit testing only --- proteobench/io/params/fragger.py | 99 +++++++++++++------- proteobench/modules/dda_quant_base/module.py | 2 + test/test_parse_params_fragger.py | 34 ++++--- 3 files changed, 86 insertions(+), 49 deletions(-) diff --git a/proteobench/io/params/fragger.py b/proteobench/io/params/fragger.py index 104a31ff..01789cda 100644 --- a/proteobench/io/params/fragger.py +++ b/proteobench/io/params/fragger.py @@ -4,11 +4,13 @@ separates paramters and their value using an equal sign. Optional comments are expressed with a hash sign. """ + from __future__ import annotations import logging import re from collections import namedtuple +from io import BytesIO from pathlib import Path import pandas as pd @@ -22,52 +24,74 @@ VERSION_NO_PATTERN = r"\d+(\.\d+)*" -def read_file(file: str, sep: str = " = ") -> list[Parameter]: +def parse_params(l_of_str: list[str], sep: str = " = ") -> list[Parameter]: """Read FragPipe parameter file as list of records.""" - with open(file) as f: - data = [] - for line in f: - line = line.strip() - logger.debug(line) - # ! 
logic below also allows to keep the comments as comments - if line.startswith("#"): - continue - if not line: - continue - if "#" in line: - res = line.split("#") - if len(res) == 1: - comment = res[0] - data.append(Parameter(None, None, comment.strip())) - continue - param, comment = [x.strip() for x in res] - else: - param = line - comment = None - res = param.strip().split(sep, maxsplit=1) + data = [] + for line in l_of_str: + line = line.strip() + logger.debug(line) + # ! logic below also allows to keep the comments as comments + if line.startswith("#"): + continue + if not line: + continue + if "#" in line: + res = line.split("#") if len(res) == 1: - param = res[0].strip() - data.append(Parameter(param, None, comment)) + comment = res[0] + data.append(Parameter(None, None, comment.strip())) continue - param, value = [x.strip() for x in res] - data.append(Parameter(param, value, comment)) + param, comment = [x.strip() for x in res] + else: + param = line + comment = None + res = param.strip().split(sep, maxsplit=1) + if len(res) == 1: + param = res[0].strip() + data.append(Parameter(param, None, comment)) + continue + param, value = [x.strip() for x in res] + data.append(Parameter(param, value, comment)) return data -def extract_params(file: str, f_fragpipe_workflow) -> ProteoBenchParameters: - msfragger_params = read_file(file) +def read_msfragger_params(file: BytesIO, sep: str = " = ") -> list[Parameter]: + l_of_str = file.read().decode("utf-8").splitlines() + return parse_params(l_of_str, sep=sep) + + +def read_fragpipe_workflow(file: BytesIO, sep: str = "=") -> list[Parameter]: + l_of_str = file.read().decode("utf-8").splitlines() + header = l_of_str[0][1:].strip() + return header, parse_params(l_of_str, sep=sep) + + +def extract_params(file: str, file1) -> ProteoBenchParameters: + # ! 
make it possible to pass files in both orders + msfragger_params, fragpipe_params = None, None + for f_ in [file, file1]: + print("file:", f_) + f_suffix = Path(f_.name).suffix + if f_suffix == ".params": + if msfragger_params is not None: + raise ValueError("MSFragger params file already parsed.") + msfragger_params = read_msfragger_params(f_) + elif f_suffix == ".workflow": + if fragpipe_params is not None: + raise ValueError("FragPipe workflow file already parsed.") + header, fragpipe_params = read_fragpipe_workflow(f_) + else: + raise ValueError("File extension not recognized.") + + print("MSFragger params:\n", repr(msfragger_params)) msfragger_params = pd.DataFrame.from_records(msfragger_params, columns=Parameter._fields).set_index( Parameter._fields[0] ) - fragpipe_params = read_file(f_fragpipe_workflow, sep="=") + print("fragpipe params:\n", repr(fragpipe_params)) fragpipe_params = pd.DataFrame.from_records(fragpipe_params, columns=Parameter._fields).set_index( Parameter._fields[0] ) - # FragPipe version in first line - with open(f_fragpipe_workflow) as f: - header = next(iter(f))[1:].strip() - match = re.search(VERSION_NO_PATTERN, header) if match: @@ -121,16 +145,19 @@ def extract_params(file: str, f_fragpipe_workflow) -> ProteoBenchParameters: from pprint import pprint file = pathlib.Path("../../../test/params/fragger.params") - data = read_file(file) + with open(file, "rb") as f: + data = read_msfragger_params(f) df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(Parameter._fields[0]) df.to_csv(file.with_suffix(".csv")) file_fragpipe = pathlib.Path("../../../test/params/fragpipe.workflow") - data = read_file(file_fragpipe, sep="=") + with open(file_fragpipe, "rb") as f: + _, data = read_fragpipe_workflow(f) df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(Parameter._fields[0]) df.to_csv(file_fragpipe.with_suffix(".csv")) - params = extract_params(file, file_fragpipe) + with open(file, "rb") as f, 
open(file_fragpipe, "rb") as f2: + params = extract_params(f, f2) pprint(params.__dict__) series = pd.Series(params.__dict__) series.to_csv(file.parent / "fragger_extracted_params.csv") diff --git a/proteobench/modules/dda_quant_base/module.py b/proteobench/modules/dda_quant_base/module.py index 426a366b..ad7a877f 100644 --- a/proteobench/modules/dda_quant_base/module.py +++ b/proteobench/modules/dda_quant_base/module.py @@ -16,6 +16,7 @@ from proteobench.github.gh import clone_repo, pr_github, read_results_json_repo from proteobench.io.params import ProteoBenchParameters from proteobench.io.params.alphapept import extract_params as extract_params_alphapept +from proteobench.io.params.fragger import extract_params as extract_params_fragger from proteobench.io.params.maxquant import extract_params as extract_params_maxquant from proteobench.io.params.proline import extract_params as extract_params_proline from proteobench.io.params.sage import extract_params as extract_params_sage @@ -48,6 +49,7 @@ def is_implemented(self) -> bool: "Proline": extract_params_proline, "AlphaPept": extract_params_alphapept, "Sage": extract_params_sage, + "FragPipe": extract_params_fragger, } @staticmethod diff --git a/test/test_parse_params_fragger.py b/test/test_parse_params_fragger.py index e708eef4..053a4e41 100644 --- a/test/test_parse_params_fragger.py +++ b/test/test_parse_params_fragger.py @@ -2,36 +2,44 @@ from pathlib import Path import pandas as pd -import pytest import proteobench.io.params.fragger as fragger_params TESTDATA_DIR = Path(__file__).parent / "params" -# ! 
currently fragpipe with msfragger has two parameter/configuration files per run -fnames = ["fragger.params", "fragpipe.workflow"] -fnames = [TESTDATA_DIR / fname for fname in fnames] -fnames = [(fname, fname.with_suffix(".json")) for fname in fnames] - - -@pytest.mark.parametrize("file,csv_expected", fnames) -def test_read_file(file, csv_expected): +def test_read_msfragger_params(): file = TESTDATA_DIR / "fragger.params" csv_expected = TESTDATA_DIR / "fragger.csv" - expected = pd.read_csv(csv_expected) - data = fragger_params.read_file(file) + expected = pd.read_csv(csv_expected, index_col=0) + with open(file, "rb") as f: + data = fragger_params.read_msfragger_params(f) actual = pd.DataFrame.from_records(data, columns=(fragger_params.Parameter._fields)).set_index( fragger_params.Parameter._fields[0] ) - actual.equals(expected) + actual = pd.read_csv(io.StringIO(actual.to_csv()), index_col=0).squeeze("columns") + assert actual.equals(expected) + + +def test_read_fragpipe_workflow(): + file = TESTDATA_DIR / "fragpipe.workflow" + csv_expected = TESTDATA_DIR / "fragpipe.csv" + expected = pd.read_csv(csv_expected, index_col=0) + with open(file, "rb") as f: + _, data = fragger_params.read_fragpipe_workflow(f) + actual = pd.DataFrame.from_records(data, columns=(fragger_params.Parameter._fields)).set_index( + fragger_params.Parameter._fields[0] + ) + actual = pd.read_csv(io.StringIO(actual.to_csv()), index_col=0).squeeze("columns") + assert actual.equals(expected) def test_extract_params(): file = TESTDATA_DIR / "fragger.params" f_fragpipe_workflow = TESTDATA_DIR / "fragpipe.workflow" expected = pd.read_csv(TESTDATA_DIR / "fragger_extracted_params.csv", index_col=0).squeeze("columns") - actual = fragger_params.extract_params(file, f_fragpipe_workflow) + with open(file, "rb") as f1, open(f_fragpipe_workflow, "rb") as f2: + actual = fragger_params.extract_params(f1, f2) actual = pd.Series(actual.__dict__) actual = pd.read_csv(io.StringIO(actual.to_csv()), 
index_col=0).squeeze("columns") assert expected.equals(actual) From 172b38e5bb986d6eb41c3dd7a644d66e0d86228b Mon Sep 17 00:00:00 2001 From: Henry Date: Mon, 4 Mar 2024 11:48:02 +0100 Subject: [PATCH 3/8] :bug: add units to tolerances --- proteobench/io/params/fragger.py | 19 +++++++++++++------ test/params/fragger_extracted_params.csv | 4 ++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/proteobench/io/params/fragger.py b/proteobench/io/params/fragger.py index 01789cda..3da58748 100644 --- a/proteobench/io/params/fragger.py +++ b/proteobench/io/params/fragger.py @@ -1,7 +1,7 @@ """Functionality to parse FragPipe fragger.params parameter files. -FragPipe has a text based paramter file format which -separates paramters and their value using an equal sign. Optional comments are +FragPipe has a text based paramter file format which +separates paramters and their value using an equal sign. Optional comments are expressed with a hash sign. """ @@ -83,11 +83,9 @@ def extract_params(file: str, file1) -> ProteoBenchParameters: else: raise ValueError("File extension not recognized.") - print("MSFragger params:\n", repr(msfragger_params)) msfragger_params = pd.DataFrame.from_records(msfragger_params, columns=Parameter._fields).set_index( Parameter._fields[0] ) - print("fragpipe params:\n", repr(fragpipe_params)) fragpipe_params = pd.DataFrame.from_records(fragpipe_params, columns=Parameter._fields).set_index( Parameter._fields[0] ) @@ -118,8 +116,17 @@ def extract_params(file: str, file1) -> ProteoBenchParameters: params.min_peptide_length = msfragger_params.loc["digest_min_length", "value"] params.max_peptide_length = msfragger_params.loc["digest_max_length", "value"] - params.precursor_mass_tolerance = msfragger_params.loc["precursor_true_tolerance", "value"] - params.fragment_mass_tolerance = msfragger_params.loc["fragment_mass_tolerance", "value"] + precursor_mass_units = "Da" + if int(msfragger_params.loc["precursor_mass_units", "value"]): + 
precursor_mass_units = "ppm" + params.precursor_mass_tolerance = ( + f'{msfragger_params.loc["precursor_true_tolerance", "value"]} {precursor_mass_units}' + ) + + fragment_mass_units = "Da" + if int(msfragger_params.loc["fragment_mass_units", "value"]): + fragment_mass_units = "ppm" + params.fragment_mass_tolerance = f'{msfragger_params.loc["fragment_mass_tolerance", "value"]} {fragment_mass_units}' # ! ionquant is not necessarily fixed? params.ident_fdr_protein = fragpipe_params.loc["ionquant.proteinfdr", "value"] params.ident_fdr_peptide = fragpipe_params.loc["ionquant.peptidefdr", "value"] diff --git a/test/params/fragger_extracted_params.csv b/test/params/fragger_extracted_params.csv index 97b606ef..b5c4822e 100644 --- a/test/params/fragger_extracted_params.csv +++ b/test/params/fragger_extracted_params.csv @@ -7,8 +7,8 @@ ident_fdr_psm,0.01 ident_fdr_peptide,0.01 ident_fdr_protein,0.01 enable_match_between_runs,True -precursor_mass_tolerance,20 -fragment_mass_tolerance,20 +precursor_mass_tolerance,20 ppm +fragment_mass_tolerance,20 ppm enzyme,stricttrypsin allowed_miscleavages,2 min_peptide_length,7 From 05ce5f4e0ecb870cf64eb933b9c69f8ebc37a384 Mon Sep 17 00:00:00 2001 From: Henry Date: Tue, 5 Mar 2024 10:22:18 +0100 Subject: [PATCH 4/8] :art::fire: type hints and remove unused code --- proteobench/io/params/fragger.py | 2 +- proteobench/modules/dda_quant_base/module.py | 2 -- webinterface/Home.py | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/proteobench/io/params/fragger.py b/proteobench/io/params/fragger.py index 3da58748..cbf82390 100644 --- a/proteobench/io/params/fragger.py +++ b/proteobench/io/params/fragger.py @@ -66,7 +66,7 @@ def read_fragpipe_workflow(file: BytesIO, sep: str = "=") -> list[Parameter]: return header, parse_params(l_of_str, sep=sep) -def extract_params(file: str, file1) -> ProteoBenchParameters: +def extract_params(file: BytesIO, file1: BytesIO) -> ProteoBenchParameters: # ! 
make it possible to pass files in both orders msfragger_params, fragpipe_params = None, None for f_ in [file, file1]: diff --git a/proteobench/modules/dda_quant_base/module.py b/proteobench/modules/dda_quant_base/module.py index ad7a877f..0c2b0ee2 100644 --- a/proteobench/modules/dda_quant_base/module.py +++ b/proteobench/modules/dda_quant_base/module.py @@ -157,8 +157,6 @@ def compute_group_stats( def compute_epsilon(withspecies, species_expected_ratio): # for all columns named parse_settings.species_dict.values() compute the sum over the rows and add it to a new column "unique" withspecies["unique"] = withspecies[species_expected_ratio.keys()].sum(axis=1) - # create a list tabulating how many entries in withspecies["unique"] are 1,2,3,4,5,6 - unique_counts = withspecies["unique"].value_counts() # now remove all rows with withspecies["unique"] > 1 withspecies = withspecies[withspecies["unique"] == 1] diff --git a/webinterface/Home.py b/webinterface/Home.py index 5603ae93..1c1f8fc4 100644 --- a/webinterface/Home.py +++ b/webinterface/Home.py @@ -1,6 +1,5 @@ """psm_utils Streamlit-based web server.""" -import streamlit as st from _base import StreamlitPage From 1048817662a461628fb1cee59517eeb94f63a48b Mon Sep 17 00:00:00 2001 From: Henry Date: Tue, 5 Mar 2024 16:45:09 +0100 Subject: [PATCH 5/8] :art: print dictionary of parsed and selected parameters --- webinterface/pages/DDA_Quant_ion.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/webinterface/pages/DDA_Quant_ion.py b/webinterface/pages/DDA_Quant_ion.py index 6190a877..1633f11c 100644 --- a/webinterface/pages/DDA_Quant_ion.py +++ b/webinterface/pages/DDA_Quant_ion.py @@ -4,6 +4,7 @@ import logging import uuid from datetime import datetime +from pprint import pformat import plotly.express as px import plotly.graph_objects as go @@ -613,6 +614,7 @@ def generate_results( try: print(self.user_input["input_format"]) params = IonModule().load_params_file(self.user_input[META_DATA], 
self.user_input["input_format"]) + st.text(f"Parsed and selected parameters:\n{pformat(params.__dict__)}") except KeyError as e: st.error("Parsing of meta parameters file for this software is not supported yet.") # except Exception as err: From de09c74d77150fb3aa86719077de763f5ce50b35 Mon Sep 17 00:00:00 2001 From: Henry Date: Tue, 5 Mar 2024 16:46:06 +0100 Subject: [PATCH 6/8] :art: swap inputs: First tool, then results --- webinterface/pages/DDA_Quant_ion.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/webinterface/pages/DDA_Quant_ion.py b/webinterface/pages/DDA_Quant_ion.py index 1633f11c..44221129 100644 --- a/webinterface/pages/DDA_Quant_ion.py +++ b/webinterface/pages/DDA_Quant_ion.py @@ -180,14 +180,14 @@ def _main_page(self): search. This is important when using MaxQuant and FragPipe, among other tools. """ ) - self.user_input["input_csv"] = st.file_uploader( - "Software tool result file", help=self.texts.Help.input_file - ) - self.user_input["input_format"] = st.selectbox( "Software tool", INPUT_FORMATS, help=self.texts.Help.input_format ) + self.user_input["input_csv"] = st.file_uploader( + "Software tool result file", help=self.texts.Help.input_file + ) + # self.user_input["pull_req"] = st.text_input( # "Open pull request to make results available to everyone (type \"YES\" to enable)", # "NO" From 6e8a2af949297b8376eef4677fa2a8f00a71cad2 Mon Sep 17 00:00:00 2001 From: Henry Date: Tue, 5 Mar 2024 16:47:56 +0100 Subject: [PATCH 7/8] :wrench: update VSCode config to latest version --- .vscode/settings.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index e6f95556..31bb8c84 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -2,7 +2,7 @@ "[python]": { "editor.formatOnSave": true, "editor.codeActionsOnSave": { - "source.organizeImports": true + "source.organizeImports": "explicit" }, "editor.rulers": [ 88 From 
ff277d923d87db83377789b2afaa660c5e4e9f4d Mon Sep 17 00:00:00 2001 From: Marie Locard-Paulet <8386272+mlocardpaulet@users.noreply.github.com> Date: Tue, 5 Mar 2024 17:44:52 +0100 Subject: [PATCH 8/8] amend documentation for FragPipe public submission --- docs/modules/3-DDA-Quantification-ion-level.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/modules/3-DDA-Quantification-ion-level.md b/docs/modules/3-DDA-Quantification-ion-level.md index 7f2522ed..01f60f3e 100644 --- a/docs/modules/3-DDA-Quantification-ion-level.md +++ b/docs/modules/3-DDA-Quantification-ion-level.md @@ -43,7 +43,7 @@ The module is flexible in terms of what workflow the participants can run. Howev When you have successfully uploaded and visualized a benchmark run, we strongly encourage you to add the result to the online repository. This way, your run will be available to the entire community and can be compared to all other uploaded benchmark runs. By doing so, your workflow outputs, parameters and calculated metrics will be stored and publicly available. -To submit your run for public usage, you need to upload the parameter file associated to your run in the field `Meta data for searches`. Currently, we accept outputs from MaxQuant and Proline (see bellow for more tool-specific details). We are working on adding FragPipe outputs. Please fill the `Comments for submission` if needed, and confirm that the metadata is correct (correspond to the benchmark run) before pressing the button `I really want to upload it`. +To submit your run for public usage, you need to upload the parameter file associated to your run in the field `Meta data for searches`. Currently, we accept outputs from MaxQuant, FragPipe and Proline (see below for more tool-specific details). Please fill in the `Comments for submission` if needed, and confirm that the metadata is correct (corresponds to the benchmark run) before pressing the button `I really want to upload it`.
After upload, you will get a link to the pull request associated with your data. Please copy it and save it. With this link, you can get the unique identifier of your run (for example "Proline__20240106_141919"), and follow the advancement of your submission and add comments to communicate with the ProteoBench maintainers. If everything looks good, your submission will be reviewed and accepted (it will take a few working days). Then, your benchmark run will be added to the public runs of this module and plotted alongside all other benchmark runs in the figure. @@ -60,6 +60,7 @@ After upload, you will get a link to the pull request associated with your data. 2. Following import of raw files, assign experiments "by File Name" right above the list of raw files. 3. **Make sure contaminants are not added when you add decoys to the database**. 4. Upload "combined_ion/modified_peptides.tsv" in order for Proteobench to calculate the ion ratios. Parameter files are not yet implemented in ProteoBench, but we are working on it. +For public submission, please provide the ".params" and the ".workflow" files that correspond to your search. ### i2MassChroQ -- available soon --