Skip to content

Commit

Permalink
Merge pull request #198 from Proteobench/fragpipe_msfragger_params
Browse files Browse the repository at this point in the history
FragPipe+MsFragger argument parsing (+fixing Proline Parsing Errors)
  • Loading branch information
mlocardpaulet authored Jan 19, 2024
2 parents 1efc3b6 + eda8402 commit b4cc7bf
Show file tree
Hide file tree
Showing 12 changed files with 824 additions and 36 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"-p",
"test_*.py"
],
"python.testing.pytestEnabled": false,
"python.testing.pytestEnabled": true,
"python.testing.unittestEnabled": true,
"flake8.args": [
"--max-line-length=120",
Expand Down
88 changes: 83 additions & 5 deletions proteobench/io/params/fragger.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,22 @@
from __future__ import annotations

import logging
import re
from collections import namedtuple
from pathlib import Path

import pandas as pd

from proteobench.io.params import ProteoBenchParameters

logger = logging.getLogger(__name__)

Parameter = namedtuple("Parameter", ["name", "value", "comment"])

VERSION_NO_PATTERN = r"\d+(\.\d+)*"


def read_file(file: str) -> list[Parameter]:
def read_file(file: str, sep: str = " = ") -> list[Parameter]:
"""Read FragPipe parameter file as list of records."""
with open(file) as f:
data = []
Expand All @@ -36,7 +44,7 @@ def read_file(file: str) -> list[Parameter]:
else:
param = line
comment = None
res = param.strip().split(" = ")
res = param.strip().split(sep, maxsplit=1)
if len(res) == 1:
param = res[0].strip()
data.append(Parameter(param, None, comment))
Expand All @@ -46,13 +54,83 @@ def read_file(file: str) -> list[Parameter]:
return data


def _records_to_frame(records: list[Parameter]) -> pd.DataFrame:
    """Build a DataFrame from parsed ``Parameter`` records, indexed by parameter name."""
    return pd.DataFrame.from_records(records, columns=Parameter._fields).set_index(Parameter._fields[0])


def _version_or_text(text: str) -> str:
    """Return the first version number found in *text*, or *text* itself if none is found."""
    match = re.search(VERSION_NO_PATTERN, text)
    return match.group() if match else text


def _parse_flag(value) -> bool:
    """Interpret a textual on/off setting as a boolean.

    ``bool("0")`` and ``bool("false")`` are both ``True`` in Python, so string
    values have to be compared explicitly. Empty, ``"0"`` and ``"false"``
    (case-insensitive) are treated as off; every other string is treated as on.
    Non-string values fall back to their ordinary truthiness.
    """
    if isinstance(value, str):
        return value.strip().lower() not in {"", "0", "false"}
    return bool(value)


def extract_params(file: str, f_fragpipe_workflow) -> ProteoBenchParameters:
    """Collect ProteoBench parameters from MSFragger and FragPipe settings files.

    Parameters
    ----------
    file
        Path to the MSFragger parameter file (``name = value  # comment`` lines).
    f_fragpipe_workflow
        Path to the FragPipe workflow file (``name=value`` lines). Its first
        line is expected to carry the FragPipe version.

    Returns
    -------
    ProteoBenchParameters
        Parameter object with software, search-engine, digestion, modification,
        tolerance, FDR, precursor-charge and match-between-runs fields filled in.

    Raises
    ------
    KeyError
        If one of the looked-up parameter names is missing from its file.
    ValueError
        If ``precursor_charge`` does not consist of exactly two integers.
    """
    # Local import: only this function needs it, and the module imports stay untouched.
    from pathlib import PureWindowsPath

    msfragger_params = _records_to_frame(read_file(file))
    fragpipe_params = _records_to_frame(read_file(f_fragpipe_workflow, sep="="))

    # FragPipe version in first line; the leading character of that line is dropped.
    # readline() yields "" for an empty file instead of raising StopIteration.
    with open(f_fragpipe_workflow) as f:
        header = f.readline()[1:].strip()

    params = ProteoBenchParameters()
    params.software_name = "FragPipe"
    params.software_version = _version_or_text(header)
    params.search_engine = "MSFragger"

    msfragger_executable = fragpipe_params.loc["fragpipe-config.bin-msfragger", "value"]
    # PureWindowsPath splits on both "/" and "\", so the executable name is
    # isolated no matter which OS wrote the workflow file or runs this code.
    # Otherwise the version regex could match a number in a parent directory.
    msfragger_executable = PureWindowsPath(msfragger_executable).name
    params.search_engine_version = _version_or_text(msfragger_executable)

    params.enzyme = msfragger_params.loc["search_enzyme_name_1", "value"]
    params.allowed_miscleavages = msfragger_params.loc["allowed_missed_cleavage_1", "value"]
    params.fixed_mods = fragpipe_params.loc["msfragger.table.fix-mods", "value"]
    params.variable_mods = fragpipe_params.loc["msfragger.table.var-mods", "value"]
    params.max_mods = msfragger_params.loc["max_variable_mods_per_peptide", "value"]
    params.min_peptide_length = msfragger_params.loc["digest_min_length", "value"]
    params.max_peptide_length = msfragger_params.loc["digest_max_length", "value"]

    params.precursor_mass_tolerance = msfragger_params.loc["precursor_true_tolerance", "value"]
    params.fragment_mass_tolerance = msfragger_params.loc["fragment_mass_tolerance", "value"]
    # ! ionquant is not necessarily fixed?
    params.ident_fdr_protein = fragpipe_params.loc["ionquant.proteinfdr", "value"]
    params.ident_fdr_peptide = fragpipe_params.loc["ionquant.peptidefdr", "value"]
    params.ident_fdr_psm = fragpipe_params.loc["ionquant.ionfdr", "value"]

    # Integer values are read as percentages and rescaled to fractions;
    # anything that is not an integer literal is kept exactly as read.
    for key in ["ident_fdr_protein", "ident_fdr_peptide", "ident_fdr_psm"]:
        value = getattr(params, key)
        try:
            fraction = int(value) / 100
        except (TypeError, ValueError):
            # TypeError covers a missing value (None), ValueError a non-integer string.
            logger.warning("Could not convert %s to int.", value)
        else:
            setattr(params, key, fraction)

    # split() without an argument tolerates repeated spaces and tabs.
    min_precursor_charge, max_precursor_charge = msfragger_params.loc["precursor_charge", "value"].split()
    params.min_precursor_charge = int(min_precursor_charge)
    params.max_precursor_charge = int(max_precursor_charge)
    params.enable_match_between_runs = _parse_flag(fragpipe_params.loc["ionquant.mbr", "value"])
    return params


if __name__ == "__main__":
import pathlib

import pandas as pd
from pprint import pprint

file = pathlib.Path("../../../test/params/fragger.params")
data = read_file(file)
df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(Parameter._fields[0])
df
df.to_csv(file.with_suffix(".csv"))

file_fragpipe = pathlib.Path("../../../test/params/fragpipe.workflow")
data = read_file(file_fragpipe, sep="=")
df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(Parameter._fields[0])
df.to_csv(file_fragpipe.with_suffix(".csv"))

params = extract_params(file, file_fragpipe)
pprint(params.__dict__)
series = pd.Series(params.__dict__)
series.to_csv(file.parent / "fragger_extracted_params.csv")
13 changes: 7 additions & 6 deletions proteobench/io/params/proline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
- "Import and filters"
- "Quant config"
"""
import pathlib
import re

import pandas as pd
Expand Down Expand Up @@ -56,8 +55,8 @@ def extract_params(fname) -> ProteoBenchParameters:
sheet = sheet[cols].drop_duplicates().reset_index(drop=True)
# Extract
params.software_name = "Proline"
params.software_version = sheet.loc[0, "software_version"]
params.search_engine = sheet.loc[0, "software_name"]
params.search_engine_version = sheet.loc[0, "software_version"]
params.enzyme = sheet.loc[0, "enzymes"]
params.allowed_miscleavages = sheet.loc[0, "max_missed_cleavages"]
params.fixed_mods = sheet.loc[0, "fixed_ptms"]
Expand All @@ -74,24 +73,26 @@ def extract_params(fname) -> ProteoBenchParameters:
assert all(stats.loc["unique", cols] == 1), "Not all columns are unique"
sheet = sheet[cols].drop_duplicates().reset_index(drop=True)
# Extract
params.ident_fdr_psm = sheet.loc[0, "psm_filter_expected_fdr"] # ! 1 stands for 1% FDR
params.ident_fdr_psm = int(sheet.loc[0, "psm_filter_expected_fdr"]) / 100
params.min_peptide_length = find_min_pep_length(sheet.loc[0, "psm_filter_2"])

# ! Third sheet only contains match between runs (MBR) information indirectly
sheet_name = "Quant config"
sheet = excel.parse(sheet_name, dtype="object", index_col=0)
enable_match_between_runs = sheet.index.str.contains("cross assignment").any()
params.enable_match_between_runs = enable_match_between_runs
return params


if __name__ == "__main__":
file = pathlib.Path("../../../test/params/Proline_example_w_Mascot_wo_proteinSets.xlsx")
from pathlib import Path

file = Path("../../../test/params/Proline_example_w_Mascot_wo_proteinSets.xlsx")
params = extract_params(file)
data_dict = params.__dict__
series = pd.Series(data_dict)
series.to_csv(file.with_suffix(".csv"))

file = pathlib.Path("../../../test/params/Proline_example_2.xlsx")
file = Path("../../../test/params/Proline_example_2.xlsx")
params = extract_params(file)
data_dict = params.__dict__
series = pd.Series(data_dict)
Expand Down
8 changes: 4 additions & 4 deletions test/params/Proline_example_2.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
,0
software_name,Proline
software_version,X! Tandem Vengeance (2015.12.15.2)
software_version,
search_engine,XTandem
search_engine_version,
ident_fdr_psm,1
search_engine_version,X! Tandem Vengeance (2015.12.15.2)
ident_fdr_psm,0.01
ident_fdr_peptide,
ident_fdr_protein,
enable_match_between_runs,False
enable_match_between_runs,True
precursor_mass_tolerance,10.0 ppm
fragment_mass_tolerance,0.02 Da
enzyme,Trypsin
Expand Down
8 changes: 4 additions & 4 deletions test/params/Proline_example_w_Mascot_wo_proteinSets.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
,0
software_name,Proline
software_version,2.8.0.1
software_version,
search_engine,Mascot
search_engine_version,
ident_fdr_psm,1
search_engine_version,2.8.0.1
ident_fdr_psm,0.01
ident_fdr_peptide,
ident_fdr_protein,
enable_match_between_runs,False
enable_match_between_runs,True
precursor_mass_tolerance,10.0 ppm
fragment_mass_tolerance,0.02 Da
enzyme,Trypsin/P
Expand Down
19 changes: 12 additions & 7 deletions test/params/fragger.csv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
name,value,comment
database_name,Q:\MISC_PERSONAL\Bart\Benchmark_experiment_EuBIC\Shared\2023-01-30-decoys-BenchmarkFASTAModule1_DDA.fasta.fas,Path to the protein database file in FASTA format.
num_threads,47,Number of CPU threads to use.
precursor_mass_lower,-10,Lower bound of the precursor mass window.
precursor_mass_upper,10,Upper bound of the precursor mass window.
database_name,/mnt/PUMA/ProjectSQ/ASchmidt/KlemensErwinFrohlich_568/20230131-143622_Analysis/ProteoBench/proteobench/Module_2_DDA_quantification/FASTA/2023-12-16-decoys-BenchmarkFASTAModule2_DDA.fasta.fas,Path to the protein database file in FASTA format.
num_threads,50,Number of CPU threads to use.
precursor_mass_lower,-20,Lower bound of the precursor mass window.
precursor_mass_upper,20,Upper bound of the precursor mass window.
precursor_mass_units,1,"Precursor mass tolerance units (0 for Da, 1 for ppm)."
data_type,0,"Data type (0 for DDA, 1 for DIA, 2 for gas-phase fractionation DIA)."
data_type,0,"Data type (0 for DDA, 1 for DIA, 2 for gas-phase fractionation DIA, 3 for wide-window acquisition DDA)."
precursor_true_tolerance,20,True precursor mass tolerance (window is +/- this value).
precursor_true_units,1,"True precursor mass tolerance units (0 for Da, 1 for ppm)."
fragment_mass_tolerance,20,Fragment mass tolerance (window is +/- this value).
Expand All @@ -16,13 +16,19 @@ deisotope,1,"Perform deisotoping or not (0=no, 1=yes and assume singleton peaks
deneutralloss,1,"Perform deneutrallossing or not (0=no, 1=yes)."
isotope_error,0/1/2,Also search for MS/MS events triggered on specified isotopic peaks.
mass_offsets,0.0,Creates multiple precursor tolerance windows with specified mass offsets.
mass_offsets_detailed =,,Optional detailed mass offset list. Overrides mass_offsets if use_detailed_offsets = 1.
use_detailed_offsets,0,Whether to use the regular (0) or detailed (1) mass offset list.
precursor_mass_mode,selected,One of isolated/selected/corrected.
remove_precursor_peak,1,Remove precursor peaks from tandem mass spectra. 0 = not remove; 1 = remove the peak with precursor charge; 2 = remove the peaks with all charge states (only for DDA mode).
remove_precursor_range,"-1.500000,1.500000",m/z range in removing precursor peaks. Only for DDA mode. Unit: Th.
intensity_transform,0,Transform peaks intensities with sqrt root. 0 = not transform; 1 = transform using sqrt root.
activation_types,all,"Filter to only search scans of provided activation type(s). Allowed: All, HCD, CID, ETD, ECD."
group_variable,0,Specify the variable used to decide the PSM group in the group FDR estimation. 0 = no group FDR; 1 = num_enzyme_termini; 2 = PE from protein header.
require_precursor,1,"If required, PSMs with no precursor peaks will be discarded. For DIA data type only. 0 = no, 1 = yes."
reuse_dia_fragment_peaks,0,"Allow the same peak matches to multiple peptides. For DIA data type only. 0 = no, 1 = yes."
write_calibrated_mzml,0,"Write calibrated MS2 scan to a mzML file (0 for No, 1 for Yes)."
write_uncalibrated_mgf,0,"Write uncalibrated MS2 scan to a MGF file (0 for No, 1 for Yes). Only for .raw and .d formats."
write_mzbin_all,0,
mass_diff_to_variable_mod,0,Put mass diff as a variable modification. 0 for no; 1 for yes and remove delta mass; 2 for yes and keep delta mass.
localize_delta_mass,0,"Include fragment ions mass-shifted by unknown modifications (recommended for open and mass offset searches) (0 for OFF, 1 for ON)."
delta_mass_exclude_ranges,"(-1.5,3.5)",Exclude mass range for shifted ions searching.
Expand Down Expand Up @@ -51,7 +57,7 @@ variable_mod_02,42.0106 [^ 1,
allow_multiple_variable_mods_on_residue,0,
max_variable_mods_per_peptide,3,Maximum total number of variable modifications per peptide.
max_variable_mods_combinations,5000,Maximum number of modified forms allowed for each peptide (up to 65534).
output_format,tsv_pepXML_pin,"File format of output files (tsv, pin, pepxml, tsv_pin, tsv_pepxml, pepxml_pin, or tsv_pepxml_pin)."
output_format,pepXML_pin,"File format of output files (tsv, pin, pepxml, tsv_pin, tsv_pepxml, pepxml_pin, or tsv_pepxml_pin)."
output_report_topN,1,Reports top N PSMs per input spectrum.
output_max_expect,50,Suppresses reporting of PSM if top hit has expectation value greater than this threshold.
report_alternative_proteins,1,"Report alternative proteins for peptides that are found in multiple proteins (0 for no, 1 for yes)."
Expand All @@ -64,7 +70,6 @@ max_fragment_charge,2,Maximum charge state for theoretical fragments to match (1
track_zero_topN,0,Track top N unmodified peptide results separately from main results internally for boosting features.
zero_bin_accept_expect,0,Ranks a zero-bin hit above all non-zero-bin hit if it has expectation less than this value.
zero_bin_mult_expect,1,Multiplies expect value of PSMs in the zero-bin during results ordering (set to less than 1 for boosting).
add_topN_complementary,0,Inserts complementary ions corresponding to the top N most intense fragments in each experimental spectra.
check_spectral_files,1,Checking spectral files before searching.
minimum_peaks,15,Minimum number of peaks in experimental spectrum for matching.
use_topN_peaks,150,Pre-process experimental spectrum to only use top N peaks.
Expand Down
19 changes: 12 additions & 7 deletions test/params/fragger.params
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
database_name = Q:\MISC_PERSONAL\Bart\Benchmark_experiment_EuBIC\Shared\2023-01-30-decoys-BenchmarkFASTAModule1_DDA.fasta.fas # Path to the protein database file in FASTA format.
num_threads = 47 # Number of CPU threads to use.
database_name = /mnt/PUMA/ProjectSQ/ASchmidt/KlemensErwinFrohlich_568/20230131-143622_Analysis/ProteoBench/proteobench/Module_2_DDA_quantification/FASTA/2023-12-16-decoys-BenchmarkFASTAModule2_DDA.fasta.fas # Path to the protein database file in FASTA format.
num_threads = 50 # Number of CPU threads to use.

precursor_mass_lower = -10 # Lower bound of the precursor mass window.
precursor_mass_upper = 10 # Upper bound of the precursor mass window.
precursor_mass_lower = -20 # Lower bound of the precursor mass window.
precursor_mass_upper = 20 # Upper bound of the precursor mass window.
precursor_mass_units = 1 # Precursor mass tolerance units (0 for Da, 1 for ppm).
data_type = 0 # Data type (0 for DDA, 1 for DIA, 2 for gas-phase fractionation DIA).
data_type = 0 # Data type (0 for DDA, 1 for DIA, 2 for gas-phase fractionation DIA, 3 for wide-window acquisition DDA).
precursor_true_tolerance = 20 # True precursor mass tolerance (window is +/- this value).
precursor_true_units = 1 # True precursor mass tolerance units (0 for Da, 1 for ppm).
fragment_mass_tolerance = 20 # Fragment mass tolerance (window is +/- this value).
Expand All @@ -17,15 +17,21 @@ deisotope = 1 # Perform deisotoping or not (0=no, 1=yes and assume singleton p
deneutralloss = 1 # Perform deneutrallossing or not (0=no, 1=yes).
isotope_error = 0/1/2 # Also search for MS/MS events triggered on specified isotopic peaks.
mass_offsets = 0.0 # Creates multiple precursor tolerance windows with specified mass offsets.
mass_offsets_detailed = # Optional detailed mass offset list. Overrides mass_offsets if use_detailed_offsets = 1.
use_detailed_offsets = 0 # Whether to use the regular (0) or detailed (1) mass offset list.
precursor_mass_mode = selected # One of isolated/selected/corrected.

remove_precursor_peak = 1 # Remove precursor peaks from tandem mass spectra. 0 = not remove; 1 = remove the peak with precursor charge; 2 = remove the peaks with all charge states (only for DDA mode).
remove_precursor_range = -1.500000,1.500000 # m/z range in removing precursor peaks. Only for DDA mode. Unit: Th.
intensity_transform = 0 # Transform peaks intensities with sqrt root. 0 = not transform; 1 = transform using sqrt root.
activation_types = all # Filter to only search scans of provided activation type(s). Allowed: All, HCD, CID, ETD, ECD.
group_variable = 0 # Specify the variable used to decide the PSM group in the group FDR estimation. 0 = no group FDR; 1 = num_enzyme_termini; 2 = PE from protein header.
require_precursor = 1 # If required, PSMs with no precursor peaks will be discarded. For DIA data type only. 0 = no, 1 = yes.
reuse_dia_fragment_peaks = 0 # Allow the same peak matches to multiple peptides. For DIA data type only. 0 = no, 1 = yes.

write_calibrated_mzml = 0 # Write calibrated MS2 scan to a mzML file (0 for No, 1 for Yes).
write_uncalibrated_mgf = 0 # Write uncalibrated MS2 scan to a MGF file (0 for No, 1 for Yes). Only for .raw and .d formats.
write_mzbin_all = 0
mass_diff_to_variable_mod = 0 # Put mass diff as a variable modification. 0 for no; 1 for yes and remove delta mass; 2 for yes and keep delta mass.

localize_delta_mass = 0 # Include fragment ions mass-shifted by unknown modifications (recommended for open and mass offset searches) (0 for OFF, 1 for ON).
Expand Down Expand Up @@ -78,7 +84,7 @@ allow_multiple_variable_mods_on_residue = 0
max_variable_mods_per_peptide = 3 # Maximum total number of variable modifications per peptide.
max_variable_mods_combinations = 5000 # Maximum number of modified forms allowed for each peptide (up to 65534).

output_format = tsv_pepXML_pin # File format of output files (tsv, pin, pepxml, tsv_pin, tsv_pepxml, pepxml_pin, or tsv_pepxml_pin).
output_format = pepXML_pin # File format of output files (tsv, pin, pepxml, tsv_pin, tsv_pepxml, pepxml_pin, or tsv_pepxml_pin).
output_report_topN = 1 # Reports top N PSMs per input spectrum.
output_max_expect = 50 # Suppresses reporting of PSM if top hit has expectation value greater than this threshold.
report_alternative_proteins = 1 # Report alternative proteins for peptides that are found in multiple proteins (0 for no, 1 for yes).
Expand All @@ -94,7 +100,6 @@ max_fragment_charge = 2 # Maximum charge state for theoretical fragments to ma
track_zero_topN = 0 # Track top N unmodified peptide results separately from main results internally for boosting features.
zero_bin_accept_expect = 0 # Ranks a zero-bin hit above all non-zero-bin hit if it has expectation less than this value.
zero_bin_mult_expect = 1 # Multiplies expect value of PSMs in the zero-bin during results ordering (set to less than 1 for boosting).
add_topN_complementary = 0 # Inserts complementary ions corresponding to the top N most intense fragments in each experimental spectra.

check_spectral_files = 1 # Checking spectral files before searching.
minimum_peaks = 15 # Minimum number of peaks in experimental spectrum for matching.
Expand Down
20 changes: 20 additions & 0 deletions test/params/fragger_extracted_params.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
,0
software_name,FragPipe
software_version,21.0
search_engine,MSFragger
search_engine_version,4.0
ident_fdr_psm,0.01
ident_fdr_peptide,0.01
ident_fdr_protein,0.01
enable_match_between_runs,True
precursor_mass_tolerance,20
fragment_mass_tolerance,20
enzyme,stricttrypsin
allowed_miscleavages,2
min_peptide_length,7
max_peptide_length,50
fixed_mods,"0.0,C-Term Peptide,true,-1; 0.0,N-Term Peptide,true,-1; 0.0,C-Term Protein,true,-1; 0.0,N-Term Protein,true,-1; 0.0,G (glycine),true,-1; 0.0,A (alanine),true,-1; 0.0,S (serine),true,-1; 0.0,P (proline),true,-1; 0.0,V (valine),true,-1; 0.0,T (threonine),true,-1; 57.02146,C (cysteine),true,-1; 0.0,L (leucine),true,-1; 0.0,I (isoleucine),true,-1; 0.0,N (asparagine),true,-1; 0.0,D (aspartic acid),true,-1; 0.0,Q (glutamine),true,-1; 0.0,K (lysine),true,-1; 0.0,E (glutamic acid),true,-1; 0.0,M (methionine),true,-1; 0.0,H (histidine),true,-1; 0.0,F (phenylalanine),true,-1; 0.0,R (arginine),true,-1; 0.0,Y (tyrosine),true,-1; 0.0,W (tryptophan),true,-1; 0.0,B ,true,-1; 0.0,J,true,-1; 0.0,O,true,-1; 0.0,U,true,-1; 0.0,X,true,-1; 0.0,Z,true,-1"
variable_mods,"15.9949,M,true,3; 42.0106,[^,true,1; 79.96633,STY,false,3; -17.0265,nQnC,false,1; -18.0106,nE,false,1; 4.025107,K,false,2; 6.020129,R,false,2; 8.014199,K,false,2; 10.008269,R,false,2; 0.0,site_10,false,1; 0.0,site_11,false,1; 0.0,site_12,false,1; 0.0,site_13,false,1; 0.0,site_14,false,1; 0.0,site_15,false,1; 0.0,site_16,false,1"
max_mods,3
min_precursor_charge,1
max_precursor_charge,4
Loading

0 comments on commit b4cc7bf

Please sign in to comment.