Merge pull request #198 from Proteobench/fragpipe_msfragger_params

FragPipe+MsFragger argument parsing (+fixing Proline Parsing Errors)
Proteobench · Jan 19, 2024 · b4cc7bf · b4cc7bf
2 parents 1efc3b6 + eda8402
commit b4cc7bf
Show file tree

Hide file tree

Showing 12 changed files with 824 additions and 36 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -17,7 +17,7 @@
         "-p",
         "test_*.py"
     ],
-    "python.testing.pytestEnabled": false,
+    "python.testing.pytestEnabled": true,
     "python.testing.unittestEnabled": true,
     "flake8.args": [
         "--max-line-length=120",

diff --git a/proteobench/io/params/fragger.py b/proteobench/io/params/fragger.py
@@ -7,14 +7,22 @@
 from __future__ import annotations
 
 import logging
+import re
 from collections import namedtuple
+from pathlib import Path
+
+import pandas as pd
+
+from proteobench.io.params import ProteoBenchParameters
 
 logger = logging.getLogger(__name__)
 
 Parameter = namedtuple("Parameter", ["name", "value", "comment"])
 
+VERSION_NO_PATTERN = r"\d+(\.\d+)*"
+
 
-def read_file(file: str) -> list[Parameter]:
+def read_file(file: str, sep: str = " = ") -> list[Parameter]:
     """Read FragPipe parameter file as list of records."""
     with open(file) as f:
         data = []
@@ -36,7 +44,7 @@ def read_file(file: str) -> list[Parameter]:
             else:
                 param = line
                 comment = None
-            res = param.strip().split(" = ")
+            res = param.strip().split(sep, maxsplit=1)
             if len(res) == 1:
                 param = res[0].strip()
                 data.append(Parameter(param, None, comment))
@@ -46,13 +54,83 @@ def read_file(file: str) -> list[Parameter]:
     return data
 
 
+def extract_params(file: str | Path, f_fragpipe_workflow: str | Path) -> ProteoBenchParameters:
+    """Collect ProteoBench parameters from an MSFragger and a FragPipe file.
+
+    Parameters
+    ----------
+    file
+        MSFragger parameter file, read with the default ``" = "`` separator.
+    f_fragpipe_workflow
+        FragPipe workflow file, read with ``"="`` as separator. Its first line
+        is searched for the FragPipe version number.
+
+    Returns
+    -------
+    ProteoBenchParameters
+        Object with the software, search engine, digestion, modification,
+        tolerance, FDR, precursor charge and match-between-runs fields set.
+
+    Raises
+    ------
+    KeyError
+        If one of the parameter names looked up below is missing from its file.
+    """
+    # Both files are turned into a table indexed by parameter name, so values
+    # can be looked up with ``.loc[name, "value"]``.
+    msfragger_params = pd.DataFrame.from_records(read_file(file), columns=Parameter._fields).set_index(
+        Parameter._fields[0]
+    )
+    fragpipe_params = pd.DataFrame.from_records(
+        read_file(f_fragpipe_workflow, sep="="), columns=Parameter._fields
+    ).set_index(Parameter._fields[0])
+
+    # FragPipe version in first line. The first character of that line is
+    # dropped; ``readline`` returns "" for an empty file instead of raising.
+    with open(f_fragpipe_workflow) as f:
+        header = f.readline()[1:].strip()
+
+    # Keep only the version number if one is found, otherwise the whole header.
+    match = re.search(VERSION_NO_PATTERN, header)
+    if match:
+        header = match.group()
+
+    params = ProteoBenchParameters()
+    params.software_name = "FragPipe"
+    params.software_version = header
+    params.search_engine = "MSFragger"
+
+    # The MSFragger version is searched for in the file name of the configured
+    # executable; the bare file name is the fallback when no number is found.
+    msfragger_executable = fragpipe_params.loc["fragpipe-config.bin-msfragger", "value"]
+    msfragger_executable = Path(msfragger_executable).name
+    match = re.search(VERSION_NO_PATTERN, msfragger_executable)
+    if match:
+        msfragger_executable = match.group()
+
+    params.search_engine_version = msfragger_executable
+    params.enzyme = msfragger_params.loc["search_enzyme_name_1", "value"]
+    params.allowed_miscleavages = msfragger_params.loc["allowed_missed_cleavage_1", "value"]
+    params.fixed_mods = fragpipe_params.loc["msfragger.table.fix-mods", "value"]
+    params.variable_mods = fragpipe_params.loc["msfragger.table.var-mods", "value"]
+    params.max_mods = msfragger_params.loc["max_variable_mods_per_peptide", "value"]
+    params.min_peptide_length = msfragger_params.loc["digest_min_length", "value"]
+    params.max_peptide_length = msfragger_params.loc["digest_max_length", "value"]
+
+    params.precursor_mass_tolerance = msfragger_params.loc["precursor_true_tolerance", "value"]
+    params.fragment_mass_tolerance = msfragger_params.loc["fragment_mass_tolerance", "value"]
+    # ! ionquant is not necessarily fixed?
+    params.ident_fdr_protein = fragpipe_params.loc["ionquant.proteinfdr", "value"]
+    params.ident_fdr_peptide = fragpipe_params.loc["ionquant.peptidefdr", "value"]
+    params.ident_fdr_psm = fragpipe_params.loc["ionquant.ionfdr", "value"]
+
+    # Integer values are read as percentages and rescaled to fractions. Anything
+    # int() cannot parse (e.g. a value that is not a whole number, or None for a
+    # key without a value) is deliberately left exactly as read.
+    for key in ["ident_fdr_protein", "ident_fdr_peptide", "ident_fdr_psm"]:
+        value = getattr(params, key)
+        try:
+            value = int(value) / 100
+        except (ValueError, TypeError):
+            logger.warning(f"Could not convert {value} to int.")
+        else:
+            setattr(params, key, value)
+
+    # Split on any run of whitespace so extra spaces between the two charges
+    # do not break the unpacking.
+    min_precursor_charge, max_precursor_charge = msfragger_params.loc["precursor_charge", "value"].split()
+    params.min_precursor_charge = int(min_precursor_charge)
+    params.max_precursor_charge = int(max_precursor_charge)
+
+    # bool() of any non-empty string is True, so "0" or "false" would wrongly
+    # enable match between runs. Only explicit "off" spellings count as False;
+    # every other value stays True as before.
+    mbr = str(fragpipe_params.loc["ionquant.mbr", "value"]).strip().lower()
+    params.enable_match_between_runs = mbr not in ("", "0", "false", "no", "none")
+    return params
+
+
 if __name__ == "__main__":
     import pathlib
-
-    import pandas as pd
+    from pprint import pprint
 
     file = pathlib.Path("../../../test/params/fragger.params")
     data = read_file(file)
     df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(Parameter._fields[0])
-    df
     df.to_csv(file.with_suffix(".csv"))
+
+    file_fragpipe = pathlib.Path("../../../test/params/fragpipe.workflow")
+    data = read_file(file_fragpipe, sep="=")
+    df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(Parameter._fields[0])
+    df.to_csv(file_fragpipe.with_suffix(".csv"))
+
+    params = extract_params(file, file_fragpipe)
+    pprint(params.__dict__)
+    series = pd.Series(params.__dict__)
+    series.to_csv(file.parent / "fragger_extracted_params.csv")
diff --git a/proteobench/io/params/proline.py b/proteobench/io/params/proline.py
@@ -7,7 +7,6 @@
 - "Import and filters"
 - "Quant config"
 """
-import pathlib
 import re
 
 import pandas as pd
@@ -56,8 +55,8 @@ def extract_params(fname) -> ProteoBenchParameters:
     sheet = sheet[cols].drop_duplicates().reset_index(drop=True)
     # Extract
     params.software_name = "Proline"
-    params.software_version = sheet.loc[0, "software_version"]
     params.search_engine = sheet.loc[0, "software_name"]
+    params.search_engine_version = sheet.loc[0, "software_version"]
     params.enzyme = sheet.loc[0, "enzymes"]
     params.allowed_miscleavages = sheet.loc[0, "max_missed_cleavages"]
     params.fixed_mods = sheet.loc[0, "fixed_ptms"]
@@ -74,24 +73,26 @@ def extract_params(fname) -> ProteoBenchParameters:
     assert all(stats.loc["unique", cols] == 1), "Not all columns are unique"
     sheet = sheet[cols].drop_duplicates().reset_index(drop=True)
     # Extract
-    params.ident_fdr_psm = sheet.loc[0, "psm_filter_expected_fdr"]  # ! 1 stands for 1% FDR
+    params.ident_fdr_psm = int(sheet.loc[0, "psm_filter_expected_fdr"]) / 100
     params.min_peptide_length = find_min_pep_length(sheet.loc[0, "psm_filter_2"])
 
     # ! Third sheet only contains match between runs (MBR) information indirectly
+    sheet_name = "Quant config"
     sheet = excel.parse(sheet_name, dtype="object", index_col=0)
     enable_match_between_runs = sheet.index.str.contains("cross assignment").any()
     params.enable_match_between_runs = enable_match_between_runs
     return params
 
 
 if __name__ == "__main__":
-    file = pathlib.Path("../../../test/params/Proline_example_w_Mascot_wo_proteinSets.xlsx")
+    from pathlib import Path
+
+    file = Path("../../../test/params/Proline_example_w_Mascot_wo_proteinSets.xlsx")
     params = extract_params(file)
     data_dict = params.__dict__
     series = pd.Series(data_dict)
     series.to_csv(file.with_suffix(".csv"))
-
-    file = pathlib.Path("../../../test/params/Proline_example_2.xlsx")
+    file = Path("../../../test/params/Proline_example_2.xlsx")
     params = extract_params(file)
     data_dict = params.__dict__
     series = pd.Series(data_dict)

diff --git a/test/params/Proline_example_2.csv b/test/params/Proline_example_2.csv
@@ -1,12 +1,12 @@
 ,0
 software_name,Proline
-software_version,X! Tandem Vengeance (2015.12.15.2)
+software_version,
 search_engine,XTandem
-search_engine_version,
-ident_fdr_psm,1
+search_engine_version,X! Tandem Vengeance (2015.12.15.2)
+ident_fdr_psm,0.01
 ident_fdr_peptide,
 ident_fdr_protein,
-enable_match_between_runs,False
+enable_match_between_runs,True
 precursor_mass_tolerance,10.0 ppm
 fragment_mass_tolerance,0.02 Da
 enzyme,Trypsin

diff --git a/test/params/Proline_example_w_Mascot_wo_proteinSets.csv b/test/params/Proline_example_w_Mascot_wo_proteinSets.csv
@@ -1,12 +1,12 @@
 ,0
 software_name,Proline
-software_version,2.8.0.1
+software_version,
 search_engine,Mascot
-search_engine_version,
-ident_fdr_psm,1
+search_engine_version,2.8.0.1
+ident_fdr_psm,0.01
 ident_fdr_peptide,
 ident_fdr_protein,
-enable_match_between_runs,False
+enable_match_between_runs,True
 precursor_mass_tolerance,10.0 ppm
 fragment_mass_tolerance,0.02 Da
 enzyme,Trypsin/P

diff --git a/test/params/fragger.csv b/test/params/fragger.csv
@@ -1,10 +1,10 @@
 name,value,comment
-database_name,Q:\MISC_PERSONAL\Bart\Benchmark_experiment_EuBIC\Shared\2023-01-30-decoys-BenchmarkFASTAModule1_DDA.fasta.fas,Path to the protein database file in FASTA format.
-num_threads,47,Number of CPU threads to use.
-precursor_mass_lower,-10,Lower bound of the precursor mass window.
-precursor_mass_upper,10,Upper bound of the precursor mass window.
+database_name,/mnt/PUMA/ProjectSQ/ASchmidt/KlemensErwinFrohlich_568/20230131-143622_Analysis/ProteoBench/proteobench/Module_2_DDA_quantification/FASTA/2023-12-16-decoys-BenchmarkFASTAModule2_DDA.fasta.fas,Path to the protein database file in FASTA format.
+num_threads,50,Number of CPU threads to use.
+precursor_mass_lower,-20,Lower bound of the precursor mass window.
+precursor_mass_upper,20,Upper bound of the precursor mass window.
 precursor_mass_units,1,"Precursor mass tolerance units (0 for Da, 1 for ppm)."
-data_type,0,"Data type (0 for DDA, 1 for DIA, 2 for gas-phase fractionation DIA)."
+data_type,0,"Data type (0 for DDA, 1 for DIA, 2 for gas-phase fractionation DIA, 3 for wide-window acquisition DDA)."
 precursor_true_tolerance,20,True precursor mass tolerance (window is +/- this value).
 precursor_true_units,1,"True precursor mass tolerance units (0 for Da, 1 for ppm)."
 fragment_mass_tolerance,20,Fragment mass tolerance (window is +/- this value).
@@ -16,13 +16,19 @@ deisotope,1,"Perform deisotoping or not (0=no, 1=yes and assume singleton peaks
 deneutralloss,1,"Perform deneutrallossing or not (0=no, 1=yes)."
 isotope_error,0/1/2,Also search for MS/MS events triggered on specified isotopic peaks.
 mass_offsets,0.0,Creates multiple precursor tolerance windows with specified mass offsets.
+mass_offsets_detailed =,,Optional detailed mass offset list. Overrides mass_offsets if use_detailed_offsets = 1.
+use_detailed_offsets,0,Whether to use the regular (0) or detailed (1) mass offset list.
 precursor_mass_mode,selected,One of isolated/selected/corrected.
 remove_precursor_peak,1,Remove precursor peaks from tandem mass spectra. 0 = not remove; 1 = remove the peak with precursor charge; 2 = remove the peaks with all charge states (only for DDA mode).
 remove_precursor_range,"-1.500000,1.500000",m/z range in removing precursor peaks. Only for DDA mode. Unit: Th.
 intensity_transform,0,Transform peaks intensities with sqrt root. 0 = not transform; 1 = transform using sqrt root.
 activation_types,all,"Filter to only search scans of provided activation type(s). Allowed: All, HCD, CID, ETD, ECD."
+group_variable,0,Specify the variable used to decide the PSM group in the group FDR estimation. 0 = no group FDR; 1 = num_enzyme_termini; 2 = PE from protein header.
+require_precursor,1,"If required, PSMs with no precursor peaks will be discarded. For DIA data type only. 0 = no, 1 = yes."
+reuse_dia_fragment_peaks,0,"Allow the same peak matches to multiple peptides. For DIA data type only. 0 = no, 1 = yes."
 write_calibrated_mzml,0,"Write calibrated MS2 scan to a mzML file (0 for No, 1 for Yes)."
 write_uncalibrated_mgf,0,"Write uncalibrated MS2 scan to a MGF file (0 for No, 1 for Yes). Only for .raw and .d formats."
+write_mzbin_all,0,
 mass_diff_to_variable_mod,0,Put mass diff as a variable modification. 0 for no; 1 for yes and remove delta mass; 2 for yes and keep delta mass.
 localize_delta_mass,0,"Include fragment ions mass-shifted by unknown modifications (recommended for open and mass offset searches) (0 for OFF, 1 for ON)."
 delta_mass_exclude_ranges,"(-1.5,3.5)",Exclude mass range for shifted ions searching.
@@ -51,7 +57,7 @@ variable_mod_02,42.0106 [^ 1,
 allow_multiple_variable_mods_on_residue,0,
 max_variable_mods_per_peptide,3,Maximum total number of variable modifications per peptide.
 max_variable_mods_combinations,5000,Maximum number of modified forms allowed for each peptide (up to 65534).
-output_format,tsv_pepXML_pin,"File format of output files (tsv, pin, pepxml, tsv_pin, tsv_pepxml, pepxml_pin, or tsv_pepxml_pin)."
+output_format,pepXML_pin,"File format of output files (tsv, pin, pepxml, tsv_pin, tsv_pepxml, pepxml_pin, or tsv_pepxml_pin)."
 output_report_topN,1,Reports top N PSMs per input spectrum.
 output_max_expect,50,Suppresses reporting of PSM if top hit has expectation value greater than this threshold.
 report_alternative_proteins,1,"Report alternative proteins for peptides that are found in multiple proteins (0 for no, 1 for yes)."
@@ -64,7 +70,6 @@ max_fragment_charge,2,Maximum charge state for theoretical fragments to match (1
 track_zero_topN,0,Track top N unmodified peptide results separately from main results internally for boosting features.
 zero_bin_accept_expect,0,Ranks a zero-bin hit above all non-zero-bin hit if it has expectation less than this value.
 zero_bin_mult_expect,1,Multiplies expect value of PSMs in the zero-bin during  results ordering (set to less than 1 for boosting).
-add_topN_complementary,0,Inserts complementary ions corresponding to the top N most intense fragments in each experimental spectra.
 check_spectral_files,1,Checking spectral files before searching.
 minimum_peaks,15,Minimum number of peaks in experimental spectrum for matching.
 use_topN_peaks,150,Pre-process experimental spectrum to only use top N peaks.

diff --git a/test/params/fragger.params b/test/params/fragger.params
@@ -1,10 +1,10 @@
-database_name = Q:\MISC_PERSONAL\Bart\Benchmark_experiment_EuBIC\Shared\2023-01-30-decoys-BenchmarkFASTAModule1_DDA.fasta.fas			# Path to the protein database file in FASTA format.
-num_threads = 47			# Number of CPU threads to use.
+database_name = /mnt/PUMA/ProjectSQ/ASchmidt/KlemensErwinFrohlich_568/20230131-143622_Analysis/ProteoBench/proteobench/Module_2_DDA_quantification/FASTA/2023-12-16-decoys-BenchmarkFASTAModule2_DDA.fasta.fas			# Path to the protein database file in FASTA format.
+num_threads = 50			# Number of CPU threads to use.
 
-precursor_mass_lower = -10			# Lower bound of the precursor mass window.
-precursor_mass_upper = 10			# Upper bound of the precursor mass window.
+precursor_mass_lower = -20			# Lower bound of the precursor mass window.
+precursor_mass_upper = 20			# Upper bound of the precursor mass window.
 precursor_mass_units = 1			# Precursor mass tolerance units (0 for Da, 1 for ppm).
-data_type = 0			# Data type (0 for DDA, 1 for DIA, 2 for gas-phase fractionation DIA).
+data_type = 0			# Data type (0 for DDA, 1 for DIA, 2 for gas-phase fractionation DIA, 3 for wide-window acquisition DDA).
 precursor_true_tolerance = 20			# True precursor mass tolerance (window is +/- this value).
 precursor_true_units = 1			# True precursor mass tolerance units (0 for Da, 1 for ppm).
 fragment_mass_tolerance = 20			# Fragment mass tolerance (window is +/- this value).
@@ -17,15 +17,21 @@ deisotope = 1			# Perform deisotoping or not (0=no, 1=yes and assume singleton p
 deneutralloss = 1			# Perform deneutrallossing or not (0=no, 1=yes).
 isotope_error = 0/1/2			# Also search for MS/MS events triggered on specified isotopic peaks.
 mass_offsets = 0.0			# Creates multiple precursor tolerance windows with specified mass offsets.
+mass_offsets_detailed = 			# Optional detailed mass offset list. Overrides mass_offsets if use_detailed_offsets = 1.
+use_detailed_offsets = 0			# Whether to use the regular (0) or detailed (1) mass offset list.
 precursor_mass_mode = selected			# One of isolated/selected/corrected.
 
 remove_precursor_peak = 1			#  Remove precursor peaks from tandem mass spectra. 0 = not remove; 1 = remove the peak with precursor charge; 2 = remove the peaks with all charge states (only for DDA mode).
 remove_precursor_range = -1.500000,1.500000			# m/z range in removing precursor peaks. Only for DDA mode. Unit: Th.
 intensity_transform = 0			# Transform peaks intensities with sqrt root. 0 = not transform; 1 = transform using sqrt root.
 activation_types = all			# Filter to only search scans of provided activation type(s). Allowed: All, HCD, CID, ETD, ECD.
+group_variable = 0			# Specify the variable used to decide the PSM group in the group FDR estimation. 0 = no group FDR; 1 = num_enzyme_termini; 2 = PE from protein header.
+require_precursor = 1			# If required, PSMs with no precursor peaks will be discarded. For DIA data type only. 0 = no, 1 = yes.
+reuse_dia_fragment_peaks = 0			# Allow the same peak matches to multiple peptides. For DIA data type only. 0 = no, 1 = yes.
 
 write_calibrated_mzml = 0			# Write calibrated MS2 scan to a mzML file (0 for No, 1 for Yes).
 write_uncalibrated_mgf = 0			# Write uncalibrated MS2 scan to a MGF file (0 for No, 1 for Yes). Only for .raw and .d formats.
+write_mzbin_all = 0
 mass_diff_to_variable_mod = 0			# Put mass diff as a variable modification. 0 for no; 1 for yes and remove delta mass; 2 for yes and keep delta mass.
 
 localize_delta_mass = 0			# Include fragment ions mass-shifted by unknown modifications (recommended for open and mass offset searches) (0 for OFF, 1 for ON).
@@ -78,7 +84,7 @@ allow_multiple_variable_mods_on_residue = 0
 max_variable_mods_per_peptide = 3			# Maximum total number of variable modifications per peptide.
 max_variable_mods_combinations = 5000			# Maximum number of modified forms allowed for each peptide (up to 65534).
 
-output_format = tsv_pepXML_pin			# File format of output files (tsv, pin, pepxml, tsv_pin, tsv_pepxml, pepxml_pin, or tsv_pepxml_pin).
+output_format = pepXML_pin			# File format of output files (tsv, pin, pepxml, tsv_pin, tsv_pepxml, pepxml_pin, or tsv_pepxml_pin).
 output_report_topN = 1			# Reports top N PSMs per input spectrum.
 output_max_expect = 50			# Suppresses reporting of PSM if top hit has expectation value greater than this threshold.
 report_alternative_proteins = 1			# Report alternative proteins for peptides that are found in multiple proteins (0 for no, 1 for yes).
@@ -94,7 +100,6 @@ max_fragment_charge = 2			# Maximum charge state for theoretical fragments to ma
 track_zero_topN = 0			# Track top N unmodified peptide results separately from main results internally for boosting features.
 zero_bin_accept_expect = 0			# Ranks a zero-bin hit above all non-zero-bin hit if it has expectation less than this value.
 zero_bin_mult_expect = 1			# Multiplies expect value of PSMs in the zero-bin during  results ordering (set to less than 1 for boosting).
-add_topN_complementary = 0			# Inserts complementary ions corresponding to the top N most intense fragments in each experimental spectra.
 
 check_spectral_files = 1			# Checking spectral files before searching.
 minimum_peaks = 15			# Minimum number of peaks in experimental spectrum for matching.

diff --git a/test/params/fragger_extracted_params.csv b/test/params/fragger_extracted_params.csv
@@ -0,0 +1,20 @@
+,0
+software_name,FragPipe
+software_version,21.0
+search_engine,MSFragger
+search_engine_version,4.0
+ident_fdr_psm,0.01
+ident_fdr_peptide,0.01
+ident_fdr_protein,0.01
+enable_match_between_runs,True
+precursor_mass_tolerance,20
+fragment_mass_tolerance,20
+enzyme,stricttrypsin
+allowed_miscleavages,2
+min_peptide_length,7
+max_peptide_length,50
+fixed_mods,"0.0,C-Term Peptide,true,-1; 0.0,N-Term Peptide,true,-1; 0.0,C-Term Protein,true,-1; 0.0,N-Term Protein,true,-1; 0.0,G (glycine),true,-1; 0.0,A (alanine),true,-1; 0.0,S (serine),true,-1; 0.0,P (proline),true,-1; 0.0,V (valine),true,-1; 0.0,T (threonine),true,-1; 57.02146,C (cysteine),true,-1; 0.0,L (leucine),true,-1; 0.0,I (isoleucine),true,-1; 0.0,N (asparagine),true,-1; 0.0,D (aspartic acid),true,-1; 0.0,Q (glutamine),true,-1; 0.0,K (lysine),true,-1; 0.0,E (glutamic acid),true,-1; 0.0,M (methionine),true,-1; 0.0,H (histidine),true,-1; 0.0,F (phenylalanine),true,-1; 0.0,R (arginine),true,-1; 0.0,Y (tyrosine),true,-1; 0.0,W (tryptophan),true,-1; 0.0,B ,true,-1; 0.0,J,true,-1; 0.0,O,true,-1; 0.0,U,true,-1; 0.0,X,true,-1; 0.0,Z,true,-1"
+variable_mods,"15.9949,M,true,3; 42.0106,[^,true,1; 79.96633,STY,false,3; -17.0265,nQnC,false,1; -18.0106,nE,false,1; 4.025107,K,false,2; 6.020129,R,false,2; 8.014199,K,false,2; 10.008269,R,false,2; 0.0,site_10,false,1; 0.0,site_11,false,1; 0.0,site_12,false,1; 0.0,site_13,false,1; 0.0,site_14,false,1; 0.0,site_15,false,1; 0.0,site_16,false,1"
+max_mods,3
+min_precursor_charge,1
+max_precursor_charge,4