Merge branch 'main' into automatic_submission_notebook

Proteobench · Dec 19, 2024 · fa27c6a · fa27c6a
2 parents ee2e6f7 + 7777e2b
commit fa27c6a
Show file tree

Hide file tree

Showing 30 changed files with 8,277 additions and 421 deletions.
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -13,4 +13,5 @@ python:
         - docs
 
 sphinx:
+  configuration: docs/conf.py
   builder: dirhtml
diff --git a/proteobench/io/params/i2masschroq.py b/proteobench/io/params/i2masschroq.py
@@ -1,25 +1,12 @@
 import pathlib
-from typing import Optional
 
 import pandas as pd
 
 from proteobench.io.params import ProteoBenchParameters
 
 
-def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
-    """
-    Extract parameters from an i2MassChroQ parameter file and return a `ProteoBenchParameters` object.
-
-    Args:
-        fname (pathlib.Path): The file path to the i2MassChroQ parameter file.
-
-    Returns:
-        ProteoBenchParameters: The extracted parameters encapsulated in a `ProteoBenchParameters` object.
-    """
-    # Read parameters from the file
-    params = pd.read_csv(fname, sep="\t", header=None, index_col=0).squeeze()
-
-    # Construct tolerance strings for fragment and parent mass errors
+def _extract_xtandem_params(params: pd.Series) -> ProteoBenchParameters:
+    """Parse i2MassChroQ parameters when X!Tandem is used."""
     _tol_frag = "{} {}".format(
         params.loc["spectrum, fragment monoisotopic mass error"],
         params.loc["spectrum, fragment monoisotopic mass error units"].replace("Daltons", "Da"),
@@ -52,11 +39,10 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
     var_mods_list = list(params.loc[params.index.str.contains("residue, potential modification mass")].dropna())
 
     # Add "hidden" modifications when using X!Tandem:
-    if params.loc["AnalysisSoftware_name"] == "X!Tandem" or params.loc["AnalysisSoftware_name"] == "X! Tandem":
-        if params.loc["protein, quick acetyl"] == "yes":
-            var_mods_list.append("Acetyl(N-term)")
-        if params.loc["protein, quick pyrolidone"] == "yes":
-            var_mods_list.append("Pyrolidone(N-term)")
+    if params.loc["protein, quick acetyl"] == "yes":
+        var_mods_list.append("Acetyl(N-term)")
+    if params.loc["protein, quick pyrolidone"] == "yes":
+        var_mods_list.append("Pyrolidone(N-term)")
 
     # Create and return a ProteoBenchParameters object with the extracted values
     params = ProteoBenchParameters(
@@ -73,27 +59,101 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
         fragment_mass_tolerance="[-" + _tol_frag + ", " + _tol_frag + "]",
         enzyme=_enzyme,
         allowed_miscleavages=max_cleavage,
-        min_peptide_length=None,  # "spectrum, minimum fragment mz"
-        max_peptide_length=None,  # Not mentioned, up to 38 AA in peptides
+        min_peptide_length=None,  # xtandem: "spectrum, minimum fragment mz"
+        max_peptide_length=None,
         fixed_mods=";".join(fixed_mods_list),
         variable_mods=";".join(var_mods_list),
         max_mods=None,
-        min_precursor_charge=1,  # Fixed in software
+        min_precursor_charge=1,
         max_precursor_charge=int(params.loc["spectrum, maximum parent charge"]),
     )
+    return params
+
+
+def _extract_sage_params(params: pd.Series) -> ProteoBenchParameters:
+    """Parse i2MassChroQ parameters when Sage is used."""
+    # Fragment mass tolerance, used exactly as given in the parameter file
+    fragment_mass_tolerance = params.loc["sage_fragment_tol"]  # e.g '-0.02 0.02 da'
+
+    # Construct tolerance strings for parent mass error
+    precursor_mass_tolerance = params.loc["sage_precursor_tol"]  # e.g. "-10 10 ppm"
+
+    # Max missed cleavage sites from the Sage enzyme settings
+    max_cleavage = int(params.loc["sage_database_enzyme_missed_cleavages"])  # e.g. "2"
+
+    _enzyme = "{},{},{}".format(
+        params.loc["sage_database_enzyme_cleave_at"],
+        params.loc["sage_database_enzyme_restrict"],
+        params.loc["sage_database_enzyme_c_terminal"],
+    )  # e.g. cleave_at "KR", restrict "P", c_terminal "true" give "KR,P,true"
+    # Replace the enzyme pattern with the enzyme name used in ProteoBench
+    # if _enzyme == "[RK]|{P}":
+    #     _enzyme = "Trypsin"
+    # elif _enzyme == "[RK]":
+    #     _enzyme = "Trypsin/P"
+
+    fixed_mods_list = params.loc["sage_database_static_mods"]  # 	C:57.021465
+    var_mods_list = params.loc["sage_database_variable_mods"]  # "M:15.994915 ^E:-18.010565 ^Q:-17.026548"
 
+    min_precursor_charge, max_precursor_charge = params.loc["sage_precursor_charge"].split()
+
+    # Create and return a ProteoBenchParameters object with the extracted values
+    params = ProteoBenchParameters(
+        software_name="i2MassChroQ",
+        software_version=params.loc["i2MassChroQ_VERSION"],
+        search_engine=params.loc["AnalysisSoftware_name"],
+        search_engine_version=str(params.loc["AnalysisSoftware_version"] or ""),
+        ident_fdr_psm=float(params.loc["psm_fdr"]),
+        ident_fdr_peptide=float(params.loc["peptide_fdr"]),
+        ident_fdr_protein=float(params.loc["protein_fdr"]),
+        # set match between runs to True if it is enabled
+        enable_match_between_runs=True if params.loc["mcq_mbr"] == "T" else False,
+        precursor_mass_tolerance=precursor_mass_tolerance,
+        fragment_mass_tolerance=fragment_mass_tolerance,
+        enzyme=_enzyme,
+        allowed_miscleavages=max_cleavage,
+        min_peptide_length=int(params.loc["sage_database_enzyme_min_len"]),  # 5
+        max_peptide_length=int(params.loc["sage_database_enzyme_max_len"]),  # 50
+        fixed_mods=fixed_mods_list,
+        variable_mods=var_mods_list,
+        max_mods=int(params.loc["sage_database_max_variable_mods"]),  # 2
+        min_precursor_charge=int(min_precursor_charge),
+        max_precursor_charge=int(max_precursor_charge),
+    )
     return params
 
 
-if __name__ == "__main__":
+def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
     """
-    Reads i2MassChroQ parameter files, extracts parameters, and writes them to CSV files.
+    Extract parameters from an i2MassChroQ parameter file and return a `ProteoBenchParameters` object.
+
+    Args:
+        fname (pathlib.Path): The file path to the i2MassChroQ parameter file.
+
+    Returns:
+        ProteoBenchParameters: The extracted parameters encapsulated in a `ProteoBenchParameters` object.
     """
+    # Read parameters from the file
+    params = pd.read_csv(fname, sep="\t", header=None, index_col=0).squeeze()
+
+    if params.loc["AnalysisSoftware_name"] in ["X!Tandem", "X! Tandem"]:
+        return _extract_xtandem_params(params)
+    elif params.loc["AnalysisSoftware_name"] == "Sage":
+        return _extract_sage_params(params)
+    else:
+        raise ValueError(f"Unsupported search engine: {params.loc['AnalysisSoftware_name']}")
+
+
+if __name__ == "__main__":
+    # Reads i2MassChroQ parameter files, extracts parameters, and writes them to CSV files.
     # List of parameter file paths
+    base_dir = pathlib.Path("../../../test/params/")
     for fname in [
-        "../../../test/params/i2mproteobench_2pep_fdr01psm_fdr01prot.tsv",
+        "i2mproteobench_2pep_fdr01psm_fdr01prot_xtandem.tsv",
+        "i2mq_result_parameters.tsv",
+        "i2mproteobench_params_sage.tsv",
     ]:
-        file = pathlib.Path(fname)
+        file = base_dir / fname
 
         # Read the parameter file to extract parameters
         params = pd.read_csv(file, sep="\t", header=None, index_col=0).squeeze()

diff --git a/...arsing/io_parse_settings/Quant/lfq/peptidoform/DDA/parse_settings_proteomediscoverer.toml b/...arsing/io_parse_settings/Quant/lfq/peptidoform/DDA/parse_settings_proteomediscoverer.toml
@@ -0,0 +1,29 @@
+[mapper]
+"Protein Accessions" = "Proteins"
+"Sequence" = "sequence"
+"Modifications" = "modifications"
+
+[condition_mapper]
+"Abundances (Normalized): F1: Sample, ConditionA" = "A"
+"Abundances (Normalized): F2: Sample, ConditionA" = "A"
+"Abundances (Normalized): F3: Sample, ConditionA" = "A"
+"Abundances (Normalized): F4: Sample, ConditionB" = "B"
+"Abundances (Normalized): F5: Sample, ConditionB" = "B"
+"Abundances (Normalized): F6: Sample, ConditionB" = "B"
+
+[run_mapper]
+"Abundances (Normalized): F1: Sample, ConditionA" = "Condition_A_Sample_Alpha_01"
+"Abundances (Normalized): F2: Sample, ConditionA" = "Condition_A_Sample_Alpha_02"
+"Abundances (Normalized): F3: Sample, ConditionA" = "Condition_A_Sample_Alpha_03"
+"Abundances (Normalized): F4: Sample, ConditionB" = "Condition_B_Sample_Alpha_01"
+"Abundances (Normalized): F5: Sample, ConditionB" = "Condition_B_Sample_Alpha_02"
+"Abundances (Normalized): F6: Sample, ConditionB" = "Condition_B_Sample_Alpha_03"
+
+[species_mapper]
+"_YEAST" = "YEAST"
+"_ECOLI" = "ECOLI"
+"_HUMAN" = "HUMAN"
+
+[general]
+"contaminant_flag" = "Cont_"
+"decoy_flag" = true
diff --git a/proteobench/io/parsing/io_parse_settings/parse_settings_files.toml b/proteobench/io/parsing/io_parse_settings/parse_settings_files.toml
@@ -10,6 +10,7 @@
 
 [quant_lfq_peptidoform_DDA]
 "WOMBAT" = "parse_settings_wombat.toml"
+"Proteome Discoverer" = "parse_settings_proteomediscoverer.toml"
 "Custom" = "parse_settings_custom.toml"
 
 [quant_lfq_ion_DIA_AIF]

diff --git a/proteobench/io/parsing/parse_peptidoform.py b/proteobench/io/parsing/parse_peptidoform.py
@@ -16,6 +16,13 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
         pd.DataFrame: The loaded dataframe with the required columns added (like "proforma").
     """
     input_data_frame: pd.DataFrame
+    if input_format == "Proteome Discoverer":
+        input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
+        input_data_frame["Modifications"].fillna("", inplace=True)
+        input_data_frame["proforma"] = input_data_frame.apply(
+            lambda x: aggregate_modification_column(x["Sequence"], x["Modifications"]),
+            axis=1,
+        )
     if input_format == "WOMBAT":
         input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")
         input_data_frame["proforma"] = input_data_frame["modified_peptide"]
@@ -34,11 +41,17 @@ def aggregate_modification_column(
         "Any C-term": -1,
         "Protein N-term": 0,
         "Protein C-term": -1,
+        "N-Term": 0,  # Added to handle "N-Term"
+        "C-Term": -1,  # If you also expect "C-Term"
     },
 ) -> str:
     """
     Aggregate modifications into a string representing the modified sequence.
 
+    This version handles both:
+    - Original format (e.g. "Methylation (C11)" or "Carbamidomethyl (Any N-term)")
+    - New format (e.g. "1xCarbamidomethyl [C11]", "1xOxidation [M4]", "1xAcetyl [N-Term]")
+
     Args:
         input_string_seq (str): The input sequence string.
         input_string_modifications (str): The modifications applied to the sequence.
@@ -47,25 +60,81 @@ def aggregate_modification_column(
     Returns:
         str: The modified sequence string with aggregated modifications.
     """
+
+    # If no modifications, return the original sequence unchanged
+    if not input_string_modifications.strip():
+        return input_string_seq
+
+    # Split modifications by ';' to handle multiple modifications
+    raw_mods = [x.strip() for x in input_string_modifications.split(";") if x.strip()]
+
     all_mods = []
-    for m in input_string_modifications.split("; "):
-        if len(m) == 0:
-            continue
-        m_stripped = m.split(" (")[1].rstrip(")")
-        m_name = m.split(" (")[0]
-
-        if m_stripped in special_locations.keys():
-            if special_locations[m_stripped] == -1:
-                all_mods.append((m_name, len(input_string_seq)))
-            else:
-                all_mods.append((m_name, special_locations[m_stripped]))
-            continue
 
-        all_mods.append((m_name, int(m_stripped[1:])))
+    for m in raw_mods:
+        # Detect format by checking for '(' or '['
+        if "(" in m and "[" not in m:
+            # Original format (e.g. "Carbamidomethyl (C11)" or "Methylation (Any N-term)")
+            parts = m.split(" (")
+            if len(parts) < 2:
+                continue
+            m_name = parts[0].strip()
+            m_stripped = parts[1].rstrip(")")
+
+            # Check if this is a special location
+            if m_stripped in special_locations:
+                loc = special_locations[m_stripped]
+                if loc == -1:
+                    loc = len(input_string_seq)  # C-term
+                all_mods.append((m_name, loc))
+            else:
+                # Assume format like C11 means position 11
+                loc = int(m_stripped[1:])
+                all_mods.append((m_name, loc))
 
+        else:
+            # New format, e.g. "1xCarbamidomethyl [C11]", "1xAcetyl [N-Term]"
+            # Remove any count prefix like "1x"
+            entry = re.sub(r"\d+x", "", m).strip()
+
+            # Extract modification name and bracketed portion
+            mod_name_match = re.match(r"([A-Za-z]+)\s*\[(.+)\]", entry)
+            if not mod_name_match:
+                continue
+
+            mod_name = mod_name_match.group(1)
+            positions_str = mod_name_match.group(2).strip()
+
+            # Positions could be multiple (e.g. "C10; C13")
+            pos_parts = [p.strip() for p in positions_str.split(";") if p.strip()]
+            if not pos_parts:
+                # If there's nothing after the brackets, skip
+                continue
+
+            for pos_part in pos_parts:
+                # Check if pos_part is a known special location (e.g. "N-Term")
+                if pos_part in special_locations:
+                    loc = special_locations[pos_part]
+                    if loc == -1:
+                        loc = len(input_string_seq)
+                    all_mods.append((mod_name, loc))
+                else:
+                    # Otherwise, assume format like C11 or M4
+                    if len(pos_part) > 1:
+                        loc = int(pos_part[1:])
+                        all_mods.append((mod_name, loc))
+
+    # Sort modifications by descending position so we insert from the end
     all_mods.sort(key=lambda x: x[1], reverse=True)
 
     for name, loc in all_mods:
+        # Insert the modification tag into the sequence at offset 'loc'.
+        # For a residue position, 'loc' is 1-based, so slicing at [:loc]
+        # places the tag directly after that residue.
+        # Terminal modifications were resolved above via special_locations:
+        #   loc == 0             -> tag is inserted at the start (N-term)
+        #   loc == len(sequence) -> tag is inserted at the end (C-term)
+        # all_mods is sorted by descending position, so an insertion never
+        # shifts the offsets of the modifications still to be inserted.
         input_string_seq = input_string_seq[:loc] + f"[{name}]" + input_string_seq[loc:]
 
     return input_string_seq

diff --git a/proteobench/modules/quant/lfq/peptidoform/DDA/quant_lfq_peptidoform_DDA.py b/proteobench/modules/quant/lfq/peptidoform/DDA/quant_lfq_peptidoform_DDA.py
@@ -37,6 +37,7 @@ def __init__(
                 "..",
                 "..",
                 "..",
+                "..",
                 "io",
                 "parsing",
                 "io_parse_settings",

diff --git a/proteobench/modules/quant/quant_base/quant_base_module.py b/proteobench/modules/quant/quant_base/quant_base_module.py
@@ -71,6 +71,10 @@ class QuantModule:
         "FragPipe (DIA-NN quant)": extract_params_fragger,
         "MSAID": extract_params_msaid,
         "Spectronaut": extract_params_spectronaut,
+        # TODO: needs to be replaced with a dedicated parameter extraction function
+        "WOMBAT": extract_params_spectronaut,
+        # TODO: needs to be replaced with a dedicated parameter extraction function
+        "Proteome Discoverer": extract_params_spectronaut,
     }
 
     def __init__(

diff --git a/proteobench/plotting/plot_quant.py b/proteobench/plotting/plot_quant.py
@@ -87,6 +87,7 @@ def plot_metric(
             "Spectronaut": "#bcbd22",
             "FragPipe (DIA-NN quant)": "#ff7f00",
             "MSAID": "#afff57",
+            "Proteome Discoverer": "#8c564b",
         },
         mapping: Dict[str, int] = {"old": 10, "new": 20},
         highlight_color: str = "#d30067",
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,4 +13,5 @@ python: @@
             - docs
     sphinx:
+      configuration: docs/conf.py
       builder: dirhtml