Skip to content

Commit

Permalink
Merge branch 'main' into automatic_submission_notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
RobbinBouwmeester authored Dec 19, 2024
2 parents ee2e6f7 + 7777e2b commit fa27c6a
Show file tree
Hide file tree
Showing 30 changed files with 8,277 additions and 421 deletions.
1 change: 1 addition & 0 deletions .readthedocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ python:
- docs

sphinx:
configuration: docs/conf.py
builder: dirhtml
114 changes: 87 additions & 27 deletions proteobench/io/params/i2masschroq.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,12 @@
import pathlib
from typing import Optional

import pandas as pd

from proteobench.io.params import ProteoBenchParameters


def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
"""
Extract parameters from an i2MassChroQ parameter file and return a `ProteoBenchParameters` object.
Args:
fname (pathlib.Path): The file path to the i2MassChroQ parameter file.
Returns:
ProteoBenchParameters: The extracted parameters encapsulated in a `ProteoBenchParameters` object.
"""
# Read parameters from the file
params = pd.read_csv(fname, sep="\t", header=None, index_col=0).squeeze()

# Construct tolerance strings for fragment and parent mass errors
def _extract_xtandem_params(params: pd.Series) -> ProteoBenchParameters:
"""Parse i2MassChroQ parameters when with X!Tandem is used."""
_tol_frag = "{} {}".format(
params.loc["spectrum, fragment monoisotopic mass error"],
params.loc["spectrum, fragment monoisotopic mass error units"].replace("Daltons", "Da"),
Expand Down Expand Up @@ -52,11 +39,10 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
var_mods_list = list(params.loc[params.index.str.contains("residue, potential modification mass")].dropna())

# Add "hidden" modifications when using X!Tandem:
if params.loc["AnalysisSoftware_name"] == "X!Tandem" or params.loc["AnalysisSoftware_name"] == "X! Tandem":
if params.loc["protein, quick acetyl"] == "yes":
var_mods_list.append("Acetyl(N-term)")
if params.loc["protein, quick pyrolidone"] == "yes":
var_mods_list.append("Pyrolidone(N-term)")
if params.loc["protein, quick acetyl"] == "yes":
var_mods_list.append("Acetyl(N-term)")
if params.loc["protein, quick pyrolidone"] == "yes":
var_mods_list.append("Pyrolidone(N-term)")

# Create and return a ProteoBenchParameters object with the extracted values
params = ProteoBenchParameters(
Expand All @@ -73,27 +59,101 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
fragment_mass_tolerance="[-" + _tol_frag + ", " + _tol_frag + "]",
enzyme=_enzyme,
allowed_miscleavages=max_cleavage,
min_peptide_length=None, # "spectrum, minimum fragment mz"
max_peptide_length=None, # Not mentioned, up to 38 AA in peptides
min_peptide_length=None, # xtandem: "spectrum, minimum fragment mz"
max_peptide_length=None,
fixed_mods=";".join(fixed_mods_list),
variable_mods=";".join(var_mods_list),
max_mods=None,
min_precursor_charge=1, # Fixed in software
min_precursor_charge=1,
max_precursor_charge=int(params.loc["spectrum, maximum parent charge"]),
)
return params


def _extract_sage_params(params: pd.Series) -> ProteoBenchParameters:
    """
    Parse i2MassChroQ parameters when Sage is used as the search engine.

    Args:
        params (pd.Series): Parameter values indexed by parameter name, as read
            from the i2MassChroQ parameter file.

    Returns:
        ProteoBenchParameters: The extracted parameters encapsulated in a `ProteoBenchParameters` object.
    """
    # Sage already reports both tolerances as complete strings, so they are
    # passed through unchanged.
    fragment_mass_tolerance = params.loc["sage_fragment_tol"]  # e.g. "-0.02 0.02 da"
    precursor_mass_tolerance = params.loc["sage_precursor_tol"]  # e.g. "-10 10 ppm"

    # Maximum number of missed cleavage sites, e.g. "2"
    max_cleavage = int(params.loc["sage_database_enzyme_missed_cleavages"])

    # The enzyme is reported as "<cleave_at>,<restrict>,<c_terminal>", e.g. "KR,P,true".
    # NOTE: this raw description is not mapped to a ProteoBench enzyme name
    # (such as "Trypsin" or "Trypsin/P") here.
    cleave_at = params.loc["sage_database_enzyme_cleave_at"]
    restrict = params.loc["sage_database_enzyme_restrict"]
    c_terminal = params.loc["sage_database_enzyme_c_terminal"]
    enzyme = f"{cleave_at},{restrict},{c_terminal}"

    # Modifications are forwarded exactly as written in the parameter file.
    fixed_mods = params.loc["sage_database_static_mods"]  # e.g. "C:57.021465"
    variable_mods = params.loc["sage_database_variable_mods"]  # e.g. "M:15.994915 ^E:-18.010565 ^Q:-17.026548"

    # The charge range is a single whitespace-separated "<min> <max>" entry.
    min_precursor_charge, max_precursor_charge = params.loc["sage_precursor_charge"].split()

    # Build the result under its own name rather than rebinding the `params` argument.
    return ProteoBenchParameters(
        software_name="i2MassChroQ",
        software_version=params.loc["i2MassChroQ_VERSION"],
        search_engine=params.loc["AnalysisSoftware_name"],
        search_engine_version=str(params.loc["AnalysisSoftware_version"] or ""),
        ident_fdr_psm=float(params.loc["psm_fdr"]),
        ident_fdr_peptide=float(params.loc["peptide_fdr"]),
        ident_fdr_protein=float(params.loc["protein_fdr"]),
        # Match between runs is enabled when the flag is "T"
        enable_match_between_runs=params.loc["mcq_mbr"] == "T",
        precursor_mass_tolerance=precursor_mass_tolerance,
        fragment_mass_tolerance=fragment_mass_tolerance,
        enzyme=enzyme,
        allowed_miscleavages=max_cleavage,
        min_peptide_length=int(params.loc["sage_database_enzyme_min_len"]),  # e.g. 5
        max_peptide_length=int(params.loc["sage_database_enzyme_max_len"]),  # e.g. 50
        fixed_mods=fixed_mods,
        variable_mods=variable_mods,
        max_mods=int(params.loc["sage_database_max_variable_mods"]),  # e.g. 2
        min_precursor_charge=int(min_precursor_charge),
        max_precursor_charge=int(max_precursor_charge),
    )


if __name__ == "__main__":
def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
    """
    Extract parameters from an i2MassChroQ parameter file and return a `ProteoBenchParameters` object.

    The file is read as a header-less, tab-separated table whose first column
    holds the parameter names. Parsing is dispatched on the
    "AnalysisSoftware_name" entry of that table.

    Args:
        fname (pathlib.Path): The file path to the i2MassChroQ parameter file.

    Returns:
        ProteoBenchParameters: The extracted parameters encapsulated in a `ProteoBenchParameters` object.

    Raises:
        ValueError: If the search engine named in the file is neither X!Tandem nor Sage.
    """
    # Squeeze along the column axis only: a bare `squeeze()` would collapse a
    # file containing a single row into a scalar, and `.loc` would then fail.
    params = pd.read_csv(fname, sep="\t", header=None, index_col=0).squeeze("columns")

    search_engine = params.loc["AnalysisSoftware_name"]
    if search_engine in ("X!Tandem", "X! Tandem"):
        return _extract_xtandem_params(params)
    if search_engine == "Sage":
        return _extract_sage_params(params)
    raise ValueError(f"Unsupported search engine: {search_engine}")


if __name__ == "__main__":
# Reads i2MassChroQ parameter files, extracts parameters, and writes them to CSV files.
# List of parameter file paths
base_dir = pathlib.Path("../../../test/params/")
for fname in [
"../../../test/params/i2mproteobench_2pep_fdr01psm_fdr01prot.tsv",
"i2mproteobench_2pep_fdr01psm_fdr01prot_xtandem.tsv",
"i2mq_result_parameters.tsv",
"i2mproteobench_params_sage.tsv",
]:
file = pathlib.Path(fname)
file = base_dir / fname

# Read the parameter file to extract parameters
params = pd.read_csv(file, sep="\t", header=None, index_col=0).squeeze()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[mapper]
"Protein Accessions" = "Proteins"
"Sequence" = "sequence"
"Modifications" = "modifications"

[condition_mapper]
"Abundances (Normalized): F1: Sample, ConditionA" = "A"
"Abundances (Normalized): F2: Sample, ConditionA" = "A"
"Abundances (Normalized): F3: Sample, ConditionA" = "A"
"Abundances (Normalized): F4: Sample, ConditionB" = "B"
"Abundances (Normalized): F5: Sample, ConditionB" = "B"
"Abundances (Normalized): F6: Sample, ConditionB" = "B"

[run_mapper]
"Abundances (Normalized): F1: Sample, ConditionA" = "Condition_A_Sample_Alpha_01"
"Abundances (Normalized): F2: Sample, ConditionA" = "Condition_A_Sample_Alpha_02"
"Abundances (Normalized): F3: Sample, ConditionA" = "Condition_A_Sample_Alpha_03"
"Abundances (Normalized): F4: Sample, ConditionB" = "Condition_B_Sample_Alpha_01"
"Abundances (Normalized): F5: Sample, ConditionB" = "Condition_B_Sample_Alpha_02"
"Abundances (Normalized): F6: Sample, ConditionB" = "Condition_B_Sample_Alpha_03"

[species_mapper]
"_YEAST" = "YEAST"
"_ECOLI" = "ECOLI"
"_HUMAN" = "HUMAN"

[general]
"contaminant_flag" = "Cont_"
"decoy_flag" = true
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

[quant_lfq_peptidoform_DDA]
"WOMBAT" = "parse_settings_wombat.toml"
"Proteome Discoverer" = "parse_settings_proteomediscoverer.toml"
"Custom" = "parse_settings_custom.toml"

[quant_lfq_ion_DIA_AIF]
Expand Down
95 changes: 82 additions & 13 deletions proteobench/io/parsing/parse_peptidoform.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
pd.DataFrame: The loaded dataframe with the required columns added (like "proforma").
"""
input_data_frame: pd.DataFrame
if input_format == "Proteome Discoverer":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
input_data_frame["Modifications"].fillna("", inplace=True)
input_data_frame["proforma"] = input_data_frame.apply(
lambda x: aggregate_modification_column(x["Sequence"], x["Modifications"]),
axis=1,
)
if input_format == "WOMBAT":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")
input_data_frame["proforma"] = input_data_frame["modified_peptide"]
Expand All @@ -34,11 +41,17 @@ def aggregate_modification_column(
"Any C-term": -1,
"Protein N-term": 0,
"Protein C-term": -1,
"N-Term": 0, # Added to handle "N-Term"
"C-Term": -1, # If you also expect "C-Term"
},
) -> str:
"""
Aggregate modifications into a string representing the modified sequence.
This version handles both:
- Original format (e.g. "Methylation (C11)" or "Carbamidomethyl (Any N-term)")
- New format (e.g. "1xCarbamidomethyl [C11]", "1xOxidation [M4]", "1xAcetyl [N-Term]")
Args:
input_string_seq (str): The input sequence string.
input_string_modifications (str): The modifications applied to the sequence.
Expand All @@ -47,25 +60,81 @@ def aggregate_modification_column(
Returns:
str: The modified sequence string with aggregated modifications.
"""

# If no modifications, return the original sequence unchanged
if not input_string_modifications.strip():
return input_string_seq

# Split modifications by ';' to handle multiple modifications
raw_mods = [x.strip() for x in input_string_modifications.split(";") if x.strip()]

all_mods = []
for m in input_string_modifications.split("; "):
if len(m) == 0:
continue
m_stripped = m.split(" (")[1].rstrip(")")
m_name = m.split(" (")[0]

if m_stripped in special_locations.keys():
if special_locations[m_stripped] == -1:
all_mods.append((m_name, len(input_string_seq)))
else:
all_mods.append((m_name, special_locations[m_stripped]))
continue

all_mods.append((m_name, int(m_stripped[1:])))
for m in raw_mods:
# Detect format by checking for '(' or '['
if "(" in m and "[" not in m:
# Original format (e.g. "Carbamidomethyl (C11)" or "Methylation (Any N-term)")
parts = m.split(" (")
if len(parts) < 2:
continue
m_name = parts[0].strip()
m_stripped = parts[1].rstrip(")")

# Check if this is a special location
if m_stripped in special_locations:
loc = special_locations[m_stripped]
if loc == -1:
loc = len(input_string_seq) # C-term
all_mods.append((m_name, loc))
else:
# Assume format like C11 means position 11
loc = int(m_stripped[1:])
all_mods.append((m_name, loc))

else:
# New format, e.g. "1xCarbamidomethyl [C11]", "1xAcetyl [N-Term]"
# Remove any count prefix like "1x"
entry = re.sub(r"\d+x", "", m).strip()

# Extract modification name and bracketed portion
mod_name_match = re.match(r"([A-Za-z]+)\s*\[(.+)\]", entry)
if not mod_name_match:
continue

mod_name = mod_name_match.group(1)
positions_str = mod_name_match.group(2).strip()

# Positions could be multiple (e.g. "C10; C13")
pos_parts = [p.strip() for p in positions_str.split(";") if p.strip()]
if not pos_parts:
# If there's nothing after the brackets, skip
continue

for pos_part in pos_parts:
# Check if pos_part is a known special location (e.g. "N-Term")
if pos_part in special_locations:
loc = special_locations[pos_part]
if loc == -1:
loc = len(input_string_seq)
all_mods.append((mod_name, loc))
else:
# Otherwise, assume format like C11 or M4
if len(pos_part) > 1:
loc = int(pos_part[1:])
all_mods.append((mod_name, loc))

# Sort modifications by descending position so we insert from the end
all_mods.sort(key=lambda x: x[1], reverse=True)

for name, loc in all_mods:
# Insert the modification into the sequence.
# 'loc' is a 1-based index if it's a residue position.
# For terminal modifications, special_locations will have adjusted it.
# If loc is -1 or at sequence end, we've already resolved it to len(sequence).

# Insert the modification brackets at position 'loc'.
# Note: If loc == 0 (N-term), insert at start of sequence.
# If loc == len(sequence), insert at end (C-term).
input_string_seq = input_string_seq[:loc] + f"[{name}]" + input_string_seq[loc:]

return input_string_seq
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def __init__(
"..",
"..",
"..",
"..",
"io",
"parsing",
"io_parse_settings",
Expand Down
4 changes: 4 additions & 0 deletions proteobench/modules/quant/quant_base/quant_base_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ class QuantModule:
"FragPipe (DIA-NN quant)": extract_params_fragger,
"MSAID": extract_params_msaid,
"Spectronaut": extract_params_spectronaut,
# TODO: needs to be replaced with a dedicated parameter extraction function
"WOMBAT": extract_params_spectronaut,
# TODO: needs to be replaced with a dedicated parameter extraction function
"Proteome Discoverer": extract_params_spectronaut,
}

def __init__(
Expand Down
1 change: 1 addition & 0 deletions proteobench/plotting/plot_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def plot_metric(
"Spectronaut": "#bcbd22",
"FragPipe (DIA-NN quant)": "#ff7f00",
"MSAID": "#afff57",
"Proteome Discoverer": "#8c564b",
},
mapping: Dict[str, int] = {"old": 10, "new": 20},
highlight_color: str = "#d30067",
Expand Down
Loading

0 comments on commit fa27c6a

Please sign in to comment.