Skip to content

Commit

Permalink
Merge pull request #264 from Proteobench/fragpipe_params_upload
Browse files Browse the repository at this point in the history
Fragpipe params upload
  • Loading branch information
wolski authored Mar 11, 2024
2 parents 2435449 + ff277d9 commit 953dbb9
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 67 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"[python]": {
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": true
"source.organizeImports": "explicit"
},
"editor.rulers": [
88
Expand Down
3 changes: 2 additions & 1 deletion docs/modules/3-DDA-Quantification-ion-level.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ The module is flexible in terms of what workflow the participants can run. Howev

When you have successfully uploaded and visualized a benchmark run, we strongly encourage you to add the result to the online repository. This way, your run will be available to the entire community and can be compared to all other uploaded benchmark runs. By doing so, your workflow outputs, parameters and calculated metrics will be stored and publicly available.

To submit your run for public usage, you need to upload the parameter file associated to your run in the field `Meta data for searches`. Currently, we accept outputs from MaxQuant and Proline (see bellow for more tool-specific details). We are working on adding FragPipe outputs. Please fill the `Comments for submission` if needed, and confirm that the metadata is correct (correspond to the benchmark run) before pressing the button `I really want to upload it`.
To submit your run for public usage, you need to upload the parameter file associated to your run in the field `Meta data for searches`. Currently, we accept outputs from MaxQuant, FragPipe and Proline (see below for more tool-specific details). Please fill the `Comments for submission` if needed, and confirm that the metadata is correct (corresponds to the benchmark run) before pressing the button `I really want to upload it`.

After upload, you will get a link to the pull request associated with your data. Please copy it and save it. With this link, you can get the unique identifier of your run (for example "Proline__20240106_141919"), and follow the advancement of your submission and add comments to communicate with the ProteoBench maintainers. If everything looks good, your submission will be reviewed and accepted (it will take a few working days). Then, your benchmark run will be added to the public runs of this module and plotted alongside all other benchmark runs in the figure.

Expand All @@ -60,6 +60,7 @@ After upload, you will get a link to the pull request associated with your data.
2. Following import of raw files, assign experiments "by File Name" right above the list of raw files.
3. **Make sure contaminants are not added when you add decoys to the database**.
4. Upload "combined_ion/modified_peptides.tsv" in order for Proteobench to calculate the ion ratios. Parameter files are not yet implemented in ProteoBench, but we are working on it.
For public submission, please provide the ".params" and the ".workflow" files that correspond to your search.

### i2MassChroQ
-- available soon --
Expand Down
114 changes: 74 additions & 40 deletions proteobench/io/params/fragger.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
"""Functionality to parse FragPipe fragger.params parameter files.
FragPipe has a text based paramter file format which
separates paramters and their value using an equal sign. Optional comments are
FragPipe has a text-based parameter file format which
separates parameters and their value using an equal sign. Optional comments are
expressed with a hash sign.
"""

from __future__ import annotations

import logging
import re
from collections import namedtuple
from io import BytesIO
from pathlib import Path

import pandas as pd
Expand All @@ -22,52 +24,72 @@
VERSION_NO_PATTERN = r"\d+(\.\d+)*"


def parse_params(l_of_str: list[str], sep: str = " = ") -> list[Parameter]:
    """Parse FragPipe-style parameter lines into a list of records.

    Each non-blank, non-comment line yields one ``Parameter`` record of
    ``(parameter, value, comment)``. *sep* separates the parameter name
    from its value; an optional trailing comment follows a ``#``.

    Parameters
    ----------
    l_of_str : list[str]
        The lines of the parameter file.
    sep : str
        Separator between name and value (``' = '`` for fragger.params,
        ``'='`` for the FragPipe workflow file).

    Returns
    -------
    list[Parameter]
        One record per parsed line; ``value`` and/or ``comment`` are
        ``None`` when absent.
    """
    data = []
    for line in l_of_str:
        line = line.strip()
        logger.debug(line)
        # Whole-line comments and blank lines carry no parameter.
        if line.startswith("#"):
            continue
        if not line:
            continue
        if "#" in line:
            # Split only on the FIRST '#': a comment containing further
            # '#' characters would otherwise make the two-element unpack
            # raise a ValueError.
            param, comment = [x.strip() for x in line.split("#", maxsplit=1)]
        else:
            param = line
            comment = None
        res = param.strip().split(sep, maxsplit=1)
        if len(res) == 1:
            # Parameter without a value (separator not found on the line).
            param = res[0].strip()
            data.append(Parameter(param, None, comment))
            continue
        param, value = [x.strip() for x in res]
        data.append(Parameter(param, value, comment))
    return data


def extract_params(file: str, f_fragpipe_workflow) -> ProteoBenchParameters:
msfragger_params = read_file(file)
def read_msfragger_params(file: BytesIO, sep: str = " = ") -> list[Parameter]:
    """Decode an MSFragger ``fragger.params`` file and parse its records."""
    lines = file.read().decode("utf-8").splitlines()
    return parse_params(lines, sep=sep)


def read_fragpipe_workflow(file: BytesIO, sep: str = "=") -> tuple[str, list[Parameter]]:
    """Parse a FragPipe ``.workflow`` file.

    The first line is the FragPipe version header (its leading ``#`` is
    stripped); all lines are then parsed as parameter records (the header
    line itself starts with ``#`` and is skipped by ``parse_params``).

    Returns
    -------
    tuple[str, list[Parameter]]
        The header string and the parsed parameter records.

    Note: the original annotation claimed ``list[Parameter]`` but the
    function has always returned a ``(header, records)`` tuple — the
    annotation is corrected here; runtime behavior is unchanged.
    """
    l_of_str = file.read().decode("utf-8").splitlines()
    header = l_of_str[0][1:].strip()
    return header, parse_params(l_of_str, sep=sep)


def extract_params(file: BytesIO, file1: BytesIO) -> ProteoBenchParameters:
# ! make it possible to pass files in both orders
msfragger_params, fragpipe_params = None, None
for f_ in [file, file1]:
print("file:", f_)
f_suffix = Path(f_.name).suffix
if f_suffix == ".params":
if msfragger_params is not None:
raise ValueError("MSFragger params file already parsed.")
msfragger_params = read_msfragger_params(f_)
elif f_suffix == ".workflow":
if fragpipe_params is not None:
raise ValueError("FragPipe workflow file already parsed.")
header, fragpipe_params = read_fragpipe_workflow(f_)
else:
raise ValueError("File extension not recognized.")

msfragger_params = pd.DataFrame.from_records(msfragger_params, columns=Parameter._fields).set_index(
Parameter._fields[0]
)
fragpipe_params = read_file(f_fragpipe_workflow, sep="=")
fragpipe_params = pd.DataFrame.from_records(fragpipe_params, columns=Parameter._fields).set_index(
Parameter._fields[0]
)

# FragPipe version in first line
with open(f_fragpipe_workflow) as f:
header = next(iter(f))[1:].strip()

match = re.search(VERSION_NO_PATTERN, header)

if match:
Expand All @@ -94,8 +116,17 @@ def extract_params(file: str, f_fragpipe_workflow) -> ProteoBenchParameters:
params.min_peptide_length = msfragger_params.loc["digest_min_length", "value"]
params.max_peptide_length = msfragger_params.loc["digest_max_length", "value"]

params.precursor_mass_tolerance = msfragger_params.loc["precursor_true_tolerance", "value"]
params.fragment_mass_tolerance = msfragger_params.loc["fragment_mass_tolerance", "value"]
precursor_mass_units = "Da"
if int(msfragger_params.loc["precursor_mass_units", "value"]):
precursor_mass_units = "ppm"
params.precursor_mass_tolerance = (
f'{msfragger_params.loc["precursor_true_tolerance", "value"]} {precursor_mass_units}'
)

fragment_mass_units = "Da"
if int(msfragger_params.loc["fragment_mass_units", "value"]):
fragment_mass_units = "ppm"
params.fragment_mass_tolerance = f'{msfragger_params.loc["fragment_mass_tolerance", "value"]} {fragment_mass_units}'
# ! ionquant is not necessarily fixed?
params.ident_fdr_protein = fragpipe_params.loc["ionquant.proteinfdr", "value"]
params.ident_fdr_peptide = fragpipe_params.loc["ionquant.peptidefdr", "value"]
Expand All @@ -121,16 +152,19 @@ def extract_params(file: str, f_fragpipe_workflow) -> ProteoBenchParameters:
from pprint import pprint

file = pathlib.Path("../../../test/params/fragger.params")
data = read_file(file)
with open(file, "rb") as f:
data = read_msfragger_params(f)
df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(Parameter._fields[0])
df.to_csv(file.with_suffix(".csv"))

file_fragpipe = pathlib.Path("../../../test/params/fragpipe.workflow")
data = read_file(file_fragpipe, sep="=")
with open(file_fragpipe, "rb") as f:
_, data = read_fragpipe_workflow(f)
df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(Parameter._fields[0])
df.to_csv(file_fragpipe.with_suffix(".csv"))

params = extract_params(file, file_fragpipe)
with open(file, "rb") as f, open(file_fragpipe, "rb") as f2:
params = extract_params(f, f2)
pprint(params.__dict__)
series = pd.Series(params.__dict__)
series.to_csv(file.parent / "fragger_extracted_params.csv")
11 changes: 6 additions & 5 deletions proteobench/modules/dda_quant_base/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from proteobench.github.gh import clone_repo, pr_github, read_results_json_repo
from proteobench.io.params import ProteoBenchParameters
from proteobench.io.params.alphapept import extract_params as extract_params_alphapept
from proteobench.io.params.fragger import extract_params as extract_params_fragger
from proteobench.io.params.maxquant import extract_params as extract_params_maxquant
from proteobench.io.params.proline import extract_params as extract_params_proline
from proteobench.io.params.sage import extract_params as extract_params_sage
Expand Down Expand Up @@ -48,6 +49,7 @@ def is_implemented(self) -> bool:
"Proline": extract_params_proline,
"AlphaPept": extract_params_alphapept,
"Sage": extract_params_sage,
"FragPipe": extract_params_fragger,
}

@staticmethod
Expand Down Expand Up @@ -155,8 +157,6 @@ def compute_group_stats(
def compute_epsilon(withspecies, species_expected_ratio):
# for all columns named parse_settings.species_dict.values() compute the sum over the rows and add it to a new column "unique"
withspecies["unique"] = withspecies[species_expected_ratio.keys()].sum(axis=1)
# create a list tabulating how many entries in withspecies["unique"] are 1,2,3,4,5,6
unique_counts = withspecies["unique"].value_counts()

# now remove all rows with withspecies["unique"] > 1
withspecies = withspecies[withspecies["unique"] == 1]
Expand Down Expand Up @@ -430,9 +430,10 @@ def write_intermediate_raw(self, dir, ident, input_df, result_performance, param
input_df.to_csv(os.path.join(path_write, "input_df.csv"))
result_performance.to_csv(os.path.join(path_write, "result_performance.csv"))

def load_params_file(self, input_file: list[str], input_format: str) -> ProteoBenchParameters:
    """Load search parameters from one or more uploaded metadata files.

    Parameters
    ----------
    input_file : list[str]
        Uploaded parameter file(s). Some tools (e.g. FragPipe) require
        two files, so the list is unpacked into the extractor.
    input_format : str
        Software tool name; selects the extractor from
        ``EXTRACT_PARAMS_DICT``.

    Returns
    -------
    ProteoBenchParameters
        Parsed parameters with ``software_name`` set to *input_format*.

    Raises
    ------
    KeyError
        If *input_format* has no registered extractor.
    """
    # Files are forwarded in upload order; the extractor must accept them
    # in any order (the FragPipe extractor dispatches on file suffix) —
    # NOTE(review): confirm this holds for other multi-file tools.
    params = self.EXTRACT_PARAMS_DICT[input_format](*input_file)
    params.software_name = input_format
    return params
4 changes: 2 additions & 2 deletions test/params/fragger_extracted_params.csv
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ ident_fdr_psm,0.01
ident_fdr_peptide,0.01
ident_fdr_protein,0.01
enable_match_between_runs,True
precursor_mass_tolerance,20
fragment_mass_tolerance,20
precursor_mass_tolerance,20 ppm
fragment_mass_tolerance,20 ppm
enzyme,stricttrypsin
allowed_miscleavages,2
min_peptide_length,7
Expand Down
34 changes: 21 additions & 13 deletions test/test_parse_params_fragger.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,44 @@
from pathlib import Path

import pandas as pd
import pytest

import proteobench.io.params.fragger as fragger_params

TESTDATA_DIR = Path(__file__).parent / "params"

# ! currently fragpipe with msfragger has two parameter/configuration files per run
fnames = ["fragger.params", "fragpipe.workflow"]
fnames = [TESTDATA_DIR / fname for fname in fnames]

fnames = [(fname, fname.with_suffix(".json")) for fname in fnames]


@pytest.mark.parametrize("file,csv_expected", fnames)
def test_read_file(file, csv_expected):
def test_read_msfragger_params():
    """Parsed fragger.params must round-trip to the expected CSV fixture."""
    params_path = TESTDATA_DIR / "fragger.params"
    expected = pd.read_csv(TESTDATA_DIR / "fragger.csv", index_col=0)
    with open(params_path, "rb") as handle:
        records = fragger_params.read_msfragger_params(handle)
    fields = fragger_params.Parameter._fields
    actual = pd.DataFrame.from_records(records, columns=fields).set_index(fields[0])
    # Round-trip through CSV so dtypes match the fixture exactly.
    actual = pd.read_csv(io.StringIO(actual.to_csv()), index_col=0).squeeze("columns")
    assert actual.equals(expected)


def test_read_fragpipe_workflow():
    """Parsed fragpipe.workflow must round-trip to the expected CSV fixture."""
    workflow_path = TESTDATA_DIR / "fragpipe.workflow"
    expected = pd.read_csv(TESTDATA_DIR / "fragpipe.csv", index_col=0)
    with open(workflow_path, "rb") as handle:
        _, records = fragger_params.read_fragpipe_workflow(handle)
    fields = fragger_params.Parameter._fields
    actual = pd.DataFrame.from_records(records, columns=fields).set_index(fields[0])
    # Round-trip through CSV so dtypes match the fixture exactly.
    actual = pd.read_csv(io.StringIO(actual.to_csv()), index_col=0).squeeze("columns")
    assert actual.equals(expected)


def test_extract_params():
    """extract_params on both FragPipe files must match the stored fixture."""
    fixture = TESTDATA_DIR / "fragger_extracted_params.csv"
    expected = pd.read_csv(fixture, index_col=0).squeeze("columns")
    params_path = TESTDATA_DIR / "fragger.params"
    workflow_path = TESTDATA_DIR / "fragpipe.workflow"
    with open(params_path, "rb") as f1, open(workflow_path, "rb") as f2:
        extracted = fragger_params.extract_params(f1, f2)
    actual = pd.Series(extracted.__dict__)
    # Round-trip through CSV so dtypes match the fixture exactly.
    actual = pd.read_csv(io.StringIO(actual.to_csv()), index_col=0).squeeze("columns")
    assert expected.equals(actual)
1 change: 0 additions & 1 deletion webinterface/Home.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""psm_utils Streamlit-based web server."""

import streamlit as st
from _base import StreamlitPage


Expand Down
11 changes: 7 additions & 4 deletions webinterface/pages/DDA_Quant_ion.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import uuid
from datetime import datetime
from pprint import pformat

import plotly.express as px
import plotly.graph_objects as go
Expand Down Expand Up @@ -179,14 +180,14 @@ def _main_page(self):
search. This is important when using MaxQuant and FragPipe, among other tools.
"""
)
self.user_input["input_csv"] = st.file_uploader(
"Software tool result file", help=self.texts.Help.input_file
)

self.user_input["input_format"] = st.selectbox(
"Software tool", INPUT_FORMATS, help=self.texts.Help.input_format
)

self.user_input["input_csv"] = st.file_uploader(
"Software tool result file", help=self.texts.Help.input_file
)

# self.user_input["pull_req"] = st.text_input(
# "Open pull request to make results available to everyone (type \"YES\" to enable)",
# "NO"
Expand Down Expand Up @@ -590,6 +591,7 @@ def generate_results(
"Meta data for searches",
help=self.texts.Help.meta_data_file,
key=meta_file_uploader_uuid,
accept_multiple_files=True,
)

self.user_input["comments_for_submission"] = st.text_area(
Expand All @@ -612,6 +614,7 @@ def generate_results(
try:
print(self.user_input["input_format"])
params = IonModule().load_params_file(self.user_input[META_DATA], self.user_input["input_format"])
st.text(f"Parsed and selected parameters:\n{pformat(params.__dict__)}")
except KeyError as e:
st.error("Parsing of meta parameters file for this software is not supported yet.")
# except Exception as err:
Expand Down

0 comments on commit 953dbb9

Please sign in to comment.