Skip to content

Commit

Permalink
✨ Parse MSFragger parameter files
Browse files Browse the repository at this point in the history
  • Loading branch information
Henry committed Oct 21, 2023
1 parent 27079ef commit fcce49f
Show file tree
Hide file tree
Showing 5 changed files with 322 additions and 0 deletions.
60 changes: 60 additions & 0 deletions proteobench/io/params/fragger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Functionality to parse MSFragger fragger.params parameter files.
MSFragger has a text-based parameter file format which
separates parameters and their values using an equal sign. Optional comments are
expressed with a hash sign.
"""
from __future__ import annotations

import logging
from collections import namedtuple

logger = logging.getLogger(__name__)

# One parsed line of a parameter file. ``value`` and ``comment`` are ``None``
# when the line does not provide them.
Parameter = namedtuple("Parameter", ["name", "value", "comment"])


def read_file(file: str) -> list[Parameter]:
    """Read an MSFragger parameter file as a list of records.

    Every non-empty line that does not start with ``#`` is expected to look
    like ``name = value  # comment``, where the value and the comment are both
    optional. Blank lines and lines starting with ``#`` (free-standing
    comments, commented-out parameters) are skipped.

    Parameters
    ----------
    file
        Path of the parameter file; passed on to :func:`open`.

    Returns
    -------
    list[Parameter]
        One record per parameter line, in file order. ``value`` is ``None``
        when nothing follows the equal sign (or the line has no equal sign);
        ``comment`` is ``None`` when the line contains no ``#``.
    """
    data = []
    with open(file) as f:
        for line in f:
            line = line.strip()
            logger.debug(line)
            if not line or line.startswith("#"):
                continue
            # Split on the first "#" only: the comment text itself may
            # contain further hash signs.
            setting, hash_sign, comment = line.partition("#")
            comment = comment.strip() if hash_sign else None
            # Split on the first "=" only and independent of the surrounding
            # whitespace, so that ``name =`` (empty value) does not leak the
            # equal sign into the name and values may contain "=".
            name, _, value = setting.partition("=")
            value = value.strip() or None
            data.append(Parameter(name.strip(), value, comment))
    return data


if __name__ == "__main__":
    # Parse the example parameter file and write the records as a CSV file
    # next to it (same name, ``.csv`` suffix).
    import pathlib

    import pandas as pd

    # Anchor the path at this module (proteobench/io/params/) rather than at
    # the current working directory, so the script can be started from
    # anywhere inside or outside the repository.
    repo_root = pathlib.Path(__file__).resolve().parents[3]
    file = repo_root / "test" / "params" / "fragger.params"
    data = read_file(file)
    df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(
        Parameter._fields[0]
    )
    df.to_csv(file.with_suffix(".csv"))
105 changes: 105 additions & 0 deletions test/params/fragger.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
name,value,comment
database_name,Q:\MISC_PERSONAL\Bart\Benchmark_experiment_EuBIC\Shared\2023-01-30-decoys-BenchmarkFASTAModule1_DDA.fasta.fas,Path to the protein database file in FASTA format.
num_threads,47,Number of CPU threads to use.
precursor_mass_lower,-10,Lower bound of the precursor mass window.
precursor_mass_upper,10,Upper bound of the precursor mass window.
precursor_mass_units,1,"Precursor mass tolerance units (0 for Da, 1 for ppm)."
data_type,0,"Data type (0 for DDA, 1 for DIA, 2 for gas-phase fractionation DIA)."
precursor_true_tolerance,20,True precursor mass tolerance (window is +/- this value).
precursor_true_units,1,"True precursor mass tolerance units (0 for Da, 1 for ppm)."
fragment_mass_tolerance,20,Fragment mass tolerance (window is +/- this value).
fragment_mass_units,1,"Fragment mass tolerance units (0 for Da, 1 for ppm)."
calibrate_mass,2,"Perform mass calibration (0 for OFF, 1 for ON, 2 for ON and find optimal parameters)."
use_all_mods_in_first_search,0,"Use all variable modifications in first search (0 for No, 1 for Yes)."
decoy_prefix,rev_,Prefix of the decoy protein entries. Used for parameter optimization only.
deisotope,1,"Perform deisotoping or not (0=no, 1=yes and assume singleton peaks single charged, 2=yes and assume singleton peaks single or double charged)."
deneutralloss,1,"Perform deneutrallossing or not (0=no, 1=yes)."
isotope_error,0/1/2,Also search for MS/MS events triggered on specified isotopic peaks.
mass_offsets,0.0,Creates multiple precursor tolerance windows with specified mass offsets.
precursor_mass_mode,selected,One of isolated/selected/corrected.
remove_precursor_peak,1,Remove precursor peaks from tandem mass spectra. 0 = not remove; 1 = remove the peak with precursor charge; 2 = remove the peaks with all charge states (only for DDA mode).
remove_precursor_range,"-1.500000,1.500000",m/z range in removing precursor peaks. Only for DDA mode. Unit: Th.
intensity_transform,0,Transform peaks intensities with sqrt root. 0 = not transform; 1 = transform using sqrt root.
activation_types,all,"Filter to only search scans of provided activation type(s). Allowed: All, HCD, CID, ETD, ECD."
write_calibrated_mzml,0,"Write calibrated MS2 scan to a mzML file (0 for No, 1 for Yes)."
write_uncalibrated_mgf,0,"Write uncalibrated MS2 scan to a MGF file (0 for No, 1 for Yes). Only for .raw and .d formats."
mass_diff_to_variable_mod,0,Put mass diff as a variable modification. 0 for no; 1 for yes and remove delta mass; 2 for yes and keep delta mass.
localize_delta_mass,0,"Include fragment ions mass-shifted by unknown modifications (recommended for open and mass offset searches) (0 for OFF, 1 for ON)."
delta_mass_exclude_ranges,"(-1.5,3.5)",Exclude mass range for shifted ions searching.
fragment_ion_series,"b,y","Ion series used in search, specify any of a,b,c,x,y,z,Y,b-18,y-18 (comma separated)."
ion_series_definitions =,,"User defined ion series. Example: ""b* N -17.026548;b0 N -18.010565""."
labile_search_mode,off,"type of search (nglycan, labile, or off). Off means non-labile/typical search."
restrict_deltamass_to,all,"Specify amino acids on which delta masses (mass offsets or search modifications) can occur. Allowed values are single letter codes (e.g. ACD) and '-', must be capitalized. Use 'all' to allow any amino acid."
diagnostic_intensity_filter,0,[nglycan/labile search_mode only]. Minimum relative intensity for SUM of all detected oxonium ions to achieve for spectrum to contain diagnostic fragment evidence. Calculated relative to spectrum base peak. 0 <= value.
Y_type_masses =,,[nglycan/labile search_mode only]. Specify fragments of labile mods that are commonly retained on intact peptides (e.g. Y ions for glycans). Only used if 'Y' is included in fragment_ion_series.
diagnostic_fragments =,,[nglycan/labile search_mode only]. Specify diagnostic fragments of labile mods that appear in the low m/z region. Only used if diagnostic_intensity_filter > 0.
remainder_fragment_masses =,,[labile search_mode only] List of possible remainder fragment ions to consider. Remainder masses are partial modification masses left on b/y ions after fragmentation.
search_enzyme_name_1,stricttrypsin,Name of the first enzyme.
search_enzyme_cut_1,KR,First enzyme's cutting amino acid.
search_enzyme_nocut_1 =,,First enzyme's protecting amino acid.
search_enzyme_sense_1,C,First enzyme's cutting terminal.
allowed_missed_cleavage_1,2,First enzyme's allowed number of missed cleavages per peptide. Maximum value is 5.
search_enzyme_name_2,null,Name of the second enzyme.
search_enzyme_cut_2 =,,Second enzyme's cutting amino acid.
search_enzyme_nocut_2 =,,Second enzyme's protecting amino acid.
search_enzyme_sense_2,C,Second enzyme's cutting terminal.
allowed_missed_cleavage_2,2,Second enzyme's allowed number of missed cleavages per peptide. Maximum value is 5.
num_enzyme_termini,2,"0 for non-enzymatic, 1 for semi-enzymatic, and 2 for fully-enzymatic."
clip_nTerm_M,1,Specifies the trimming of a protein N-terminal methionine as a variable modification (0 or 1).
variable_mod_01,15.9949 M 3,
variable_mod_02,42.0106 [^ 1,
allow_multiple_variable_mods_on_residue,0,
max_variable_mods_per_peptide,3,Maximum total number of variable modifications per peptide.
max_variable_mods_combinations,5000,Maximum number of modified forms allowed for each peptide (up to 65534).
output_format,tsv_pepXML_pin,"File format of output files (tsv, pin, pepxml, tsv_pin, tsv_pepxml, pepxml_pin, or tsv_pepxml_pin)."
output_report_topN,1,Reports top N PSMs per input spectrum.
output_max_expect,50,Suppresses reporting of PSM if top hit has expectation value greater than this threshold.
report_alternative_proteins,1,"Report alternative proteins for peptides that are found in multiple proteins (0 for no, 1 for yes)."
precursor_charge,1 4,Assumed range of potential precursor charge states. Only relevant when override_charge is set to 1.
override_charge,0,Ignores precursor charge and uses charge state specified in precursor_charge range (0 or 1).
digest_min_length,7,Minimum length of peptides to be generated during in-silico digestion.
digest_max_length,50,Maximum length of peptides to be generated during in-silico digestion.
digest_mass_range,500.0 5000.0,Mass range of peptides to be generated during in-silico digestion in Daltons.
max_fragment_charge,2,Maximum charge state for theoretical fragments to match (1-4).
track_zero_topN,0,Track top N unmodified peptide results separately from main results internally for boosting features.
zero_bin_accept_expect,0,Ranks a zero-bin hit above all non-zero-bin hit if it has expectation less than this value.
zero_bin_mult_expect,1,Multiplies expect value of PSMs in the zero-bin during results ordering (set to less than 1 for boosting).
add_topN_complementary,0,Inserts complementary ions corresponding to the top N most intense fragments in each experimental spectra.
check_spectral_files,1,Checking spectral files before searching.
minimum_peaks,15,Minimum number of peaks in experimental spectrum for matching.
use_topN_peaks,150,Pre-process experimental spectrum to only use top N peaks.
min_fragments_modelling,2,Minimum number of matched peaks in PSM for inclusion in statistical modeling.
min_matched_fragments,4,Minimum number of matched peaks for PSM to be reported.
min_sequence_matches,2,[nglycan/labile search_mode only] Minimum number of sequence-specific (not Y) ions to record a match.
minimum_ratio,0.01,Filters out all peaks in experimental spectrum less intense than this multiple of the base peak intensity.
clear_mz_range,0.0 0.0,Removes peaks in this m/z range prior to matching.
add_Cterm_peptide,0.0,
add_Nterm_peptide,0.0,
add_Cterm_protein,0.0,
add_Nterm_protein,0.0,
add_G_glycine,0.0,
add_A_alanine,0.0,
add_S_serine,0.0,
add_P_proline,0.0,
add_V_valine,0.0,
add_T_threonine,0.0,
add_C_cysteine,57.02146,
add_L_leucine,0.0,
add_I_isoleucine,0.0,
add_N_asparagine,0.0,
add_D_aspartic_acid,0.0,
add_Q_glutamine,0.0,
add_K_lysine,0.0,
add_E_glutamic_acid,0.0,
add_M_methionine,0.0,
add_H_histidine,0.0,
add_F_phenylalanine,0.0,
add_R_arginine,0.0,
add_Y_tyrosine,0.0,
add_W_tryptophan,0.0,
add_B_user_amino_acid,0.0,
add_J_user_amino_acid,0.0,
add_O_user_amino_acid,0.0,
add_U_user_amino_acid,0.0,
add_X_user_amino_acid,0.0,
add_Z_user_amino_acid,0.0,
139 changes: 139 additions & 0 deletions test/params/fragger.params
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
database_name = Q:\MISC_PERSONAL\Bart\Benchmark_experiment_EuBIC\Shared\2023-01-30-decoys-BenchmarkFASTAModule1_DDA.fasta.fas # Path to the protein database file in FASTA format.
num_threads = 47 # Number of CPU threads to use.

precursor_mass_lower = -10 # Lower bound of the precursor mass window.
precursor_mass_upper = 10 # Upper bound of the precursor mass window.
precursor_mass_units = 1 # Precursor mass tolerance units (0 for Da, 1 for ppm).
data_type = 0 # Data type (0 for DDA, 1 for DIA, 2 for gas-phase fractionation DIA).
precursor_true_tolerance = 20 # True precursor mass tolerance (window is +/- this value).
precursor_true_units = 1 # True precursor mass tolerance units (0 for Da, 1 for ppm).
fragment_mass_tolerance = 20 # Fragment mass tolerance (window is +/- this value).
fragment_mass_units = 1 # Fragment mass tolerance units (0 for Da, 1 for ppm).
calibrate_mass = 2 # Perform mass calibration (0 for OFF, 1 for ON, 2 for ON and find optimal parameters).
use_all_mods_in_first_search = 0 # Use all variable modifications in first search (0 for No, 1 for Yes).
decoy_prefix = rev_ # Prefix of the decoy protein entries. Used for parameter optimization only.

deisotope = 1 # Perform deisotoping or not (0=no, 1=yes and assume singleton peaks single charged, 2=yes and assume singleton peaks single or double charged).
deneutralloss = 1 # Perform deneutrallossing or not (0=no, 1=yes).
isotope_error = 0/1/2 # Also search for MS/MS events triggered on specified isotopic peaks.
mass_offsets = 0.0 # Creates multiple precursor tolerance windows with specified mass offsets.
precursor_mass_mode = selected # One of isolated/selected/corrected.

remove_precursor_peak = 1 # Remove precursor peaks from tandem mass spectra. 0 = not remove; 1 = remove the peak with precursor charge; 2 = remove the peaks with all charge states (only for DDA mode).
remove_precursor_range = -1.500000,1.500000 # m/z range in removing precursor peaks. Only for DDA mode. Unit: Th.
intensity_transform = 0 # Transform peaks intensities with sqrt root. 0 = not transform; 1 = transform using sqrt root.
activation_types = all # Filter to only search scans of provided activation type(s). Allowed: All, HCD, CID, ETD, ECD.

write_calibrated_mzml = 0 # Write calibrated MS2 scan to a mzML file (0 for No, 1 for Yes).
write_uncalibrated_mgf = 0 # Write uncalibrated MS2 scan to a MGF file (0 for No, 1 for Yes). Only for .raw and .d formats.
mass_diff_to_variable_mod = 0 # Put mass diff as a variable modification. 0 for no; 1 for yes and remove delta mass; 2 for yes and keep delta mass.

localize_delta_mass = 0 # Include fragment ions mass-shifted by unknown modifications (recommended for open and mass offset searches) (0 for OFF, 1 for ON).
delta_mass_exclude_ranges = (-1.5,3.5) # Exclude mass range for shifted ions searching.
fragment_ion_series = b,y # Ion series used in search, specify any of a,b,c,x,y,z,Y,b-18,y-18 (comma separated).
ion_series_definitions = # User defined ion series. Example: "b* N -17.026548;b0 N -18.010565".

labile_search_mode = off # type of search (nglycan, labile, or off). Off means non-labile/typical search.
restrict_deltamass_to = all # Specify amino acids on which delta masses (mass offsets or search modifications) can occur. Allowed values are single letter codes (e.g. ACD) and '-', must be capitalized. Use 'all' to allow any amino acid.
diagnostic_intensity_filter = 0 # [nglycan/labile search_mode only]. Minimum relative intensity for SUM of all detected oxonium ions to achieve for spectrum to contain diagnostic fragment evidence. Calculated relative to spectrum base peak. 0 <= value.
Y_type_masses = # [nglycan/labile search_mode only]. Specify fragments of labile mods that are commonly retained on intact peptides (e.g. Y ions for glycans). Only used if 'Y' is included in fragment_ion_series.
diagnostic_fragments = # [nglycan/labile search_mode only]. Specify diagnostic fragments of labile mods that appear in the low m/z region. Only used if diagnostic_intensity_filter > 0.
remainder_fragment_masses = # [labile search_mode only] List of possible remainder fragment ions to consider. Remainder masses are partial modification masses left on b/y ions after fragmentation.

search_enzyme_name_1 = stricttrypsin # Name of the first enzyme.
search_enzyme_cut_1 = KR # First enzyme's cutting amino acid.
search_enzyme_nocut_1 = # First enzyme's protecting amino acid.
search_enzyme_sense_1 = C # First enzyme's cutting terminal.
allowed_missed_cleavage_1 = 2 # First enzyme's allowed number of missed cleavages per peptide. Maximum value is 5.

search_enzyme_name_2 = null # Name of the second enzyme.
search_enzyme_cut_2 = # Second enzyme's cutting amino acid.
search_enzyme_nocut_2 = # Second enzyme's protecting amino acid.
search_enzyme_sense_2 = C # Second enzyme's cutting terminal.
allowed_missed_cleavage_2 = 2 # Second enzyme's allowed number of missed cleavages per peptide. Maximum value is 5.

num_enzyme_termini = 2 # 0 for non-enzymatic, 1 for semi-enzymatic, and 2 for fully-enzymatic.

clip_nTerm_M = 1 # Specifies the trimming of a protein N-terminal methionine as a variable modification (0 or 1).

# maximum of 16 mods - amino acid codes, * for any amino acid, [ and ] specifies protein termini, n and c specifies peptide termini
variable_mod_01 = 15.9949 M 3
variable_mod_02 = 42.0106 [^ 1
# variable_mod_03 = 79.96633 STY 3
# variable_mod_04 = -17.0265 nQnC 1
# variable_mod_05 = -18.0106 nE 1
# variable_mod_06 = 4.025107 K 2
# variable_mod_07 = 6.020129 R 2
# variable_mod_08 = 8.014199 K 2
# variable_mod_09 = 10.008269 R 2
# variable_mod_10 = 0.0 site_10 1
# variable_mod_11 = 0.0 site_11 1
# variable_mod_12 = 0.0 site_12 1
# variable_mod_13 = 0.0 site_13 1
# variable_mod_14 = 0.0 site_14 1
# variable_mod_15 = 0.0 site_15 1
# variable_mod_16 = 0.0 site_16 1

allow_multiple_variable_mods_on_residue = 0
max_variable_mods_per_peptide = 3 # Maximum total number of variable modifications per peptide.
max_variable_mods_combinations = 5000 # Maximum number of modified forms allowed for each peptide (up to 65534).

output_format = tsv_pepXML_pin # File format of output files (tsv, pin, pepxml, tsv_pin, tsv_pepxml, pepxml_pin, or tsv_pepxml_pin).
output_report_topN = 1 # Reports top N PSMs per input spectrum.
output_max_expect = 50 # Suppresses reporting of PSM if top hit has expectation value greater than this threshold.
report_alternative_proteins = 1 # Report alternative proteins for peptides that are found in multiple proteins (0 for no, 1 for yes).

precursor_charge = 1 4 # Assumed range of potential precursor charge states. Only relevant when override_charge is set to 1.
override_charge = 0 # Ignores precursor charge and uses charge state specified in precursor_charge range (0 or 1).

digest_min_length = 7 # Minimum length of peptides to be generated during in-silico digestion.
digest_max_length = 50 # Maximum length of peptides to be generated during in-silico digestion.
digest_mass_range = 500.0 5000.0 # Mass range of peptides to be generated during in-silico digestion in Daltons.
max_fragment_charge = 2 # Maximum charge state for theoretical fragments to match (1-4).

track_zero_topN = 0 # Track top N unmodified peptide results separately from main results internally for boosting features.
zero_bin_accept_expect = 0 # Ranks a zero-bin hit above all non-zero-bin hit if it has expectation less than this value.
zero_bin_mult_expect = 1 # Multiplies expect value of PSMs in the zero-bin during results ordering (set to less than 1 for boosting).
add_topN_complementary = 0 # Inserts complementary ions corresponding to the top N most intense fragments in each experimental spectra.

check_spectral_files = 1 # Checking spectral files before searching.
minimum_peaks = 15 # Minimum number of peaks in experimental spectrum for matching.
use_topN_peaks = 150 # Pre-process experimental spectrum to only use top N peaks.
min_fragments_modelling = 2 # Minimum number of matched peaks in PSM for inclusion in statistical modeling.
min_matched_fragments = 4 # Minimum number of matched peaks for PSM to be reported.
min_sequence_matches = 2 # [nglycan/labile search_mode only] Minimum number of sequence-specific (not Y) ions to record a match.
minimum_ratio = 0.01 # Filters out all peaks in experimental spectrum less intense than this multiple of the base peak intensity.
clear_mz_range = 0.0 0.0 # Removes peaks in this m/z range prior to matching.

add_Cterm_peptide = 0.0
add_Nterm_peptide = 0.0
add_Cterm_protein = 0.0
add_Nterm_protein = 0.0

add_G_glycine = 0.0
add_A_alanine = 0.0
add_S_serine = 0.0
add_P_proline = 0.0
add_V_valine = 0.0
add_T_threonine = 0.0
add_C_cysteine = 57.02146
add_L_leucine = 0.0
add_I_isoleucine = 0.0
add_N_asparagine = 0.0
add_D_aspartic_acid = 0.0
add_Q_glutamine = 0.0
add_K_lysine = 0.0
add_E_glutamic_acid = 0.0
add_M_methionine = 0.0
add_H_histidine = 0.0
add_F_phenylalanine = 0.0
add_R_arginine = 0.0
add_Y_tyrosine = 0.0
add_W_tryptophan = 0.0
add_B_user_amino_acid = 0.0
add_J_user_amino_acid = 0.0
add_O_user_amino_acid = 0.0
add_U_user_amino_acid = 0.0
add_X_user_amino_acid = 0.0
add_Z_user_amino_acid = 0.0

18 changes: 18 additions & 0 deletions test/test_parse_params_fragger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from pathlib import Path

import pandas as pd

import proteobench.io.params.fragger as fragger_params

TESTDATA_DIR = Path(__file__).parent / "params"


def test_read_file():
    """Parsing ``fragger.params`` must reproduce the records in ``fragger.csv``."""
    import io  # local import: only needed for the CSV round-trip below

    file = TESTDATA_DIR / "fragger.params"
    csv_expected = TESTDATA_DIR / "fragger.csv"
    name_col = fragger_params.Parameter._fields[0]
    # Read every cell as text and treat only empty cells as missing, so that
    # literal values such as "null" or "0.0" are compared as written.
    read_kwargs = dict(dtype=str, keep_default_na=False, na_values=[""])
    expected = pd.read_csv(csv_expected, **read_kwargs)
    data = fragger_params.read_file(file)
    actual = pd.DataFrame.from_records(
        data, columns=fragger_params.Parameter._fields
    )
    # Round-trip through CSV so both frames share dtypes and missing-value
    # markers (``None`` in the records becomes an empty cell).
    actual = pd.read_csv(io.StringIO(actual.to_csv(index=False)), **read_kwargs)
    # The stored fixture writes parameters without a value as "<name> =";
    # compare parameter names without that trailing equal sign.
    for frame in (expected, actual):
        frame[name_col] = frame[name_col].str.replace(r"\s*=$", "", regex=True)
    # The original call discarded the result of ``DataFrame.equals`` and so
    # could never fail; assert the comparison instead.
    pd.testing.assert_frame_equal(actual, expected)
File renamed without changes.

0 comments on commit fcce49f

Please sign in to comment.