Merge branch 'development' into test-interval-bug-fix
mo-sameh authored Sep 11, 2024
2 parents e90126b + 3867112 commit 631104c
Showing 23 changed files with 2,500 additions and 724 deletions.
126 changes: 117 additions & 9 deletions alphadia/constants/default.yaml
@@ -48,6 +48,9 @@ library_prediction:
# set path for custom peptdeep model. If set to null, the default model will be used
peptdeep_model_path: null

# set peptdeep model type. Possible values are 'generic', 'phospho', 'digly'. If set to null, the generic model will be used
peptdeep_model_type: null

# define custom alphabase modifications not part of unimod or alphabase
# also used for decoy channels
custom_modififcations:
@@ -85,10 +88,14 @@ search:
compete_for_fragments: True

target_num_candidates: 2
# target ms1 tolerance in ppm
target_ms1_tolerance: 15
# target ms2 tolerance in ppm
target_ms2_tolerance: 15
-target_mobility_tolerance: 0.04
-target_rt_tolerance: 60
+# target ion mobility tolerance in 1/K_0
+target_mobility_tolerance: 0.0 # default is to optimize automatically
+# target retention time tolerance in seconds if > 1, or a proportion of the total gradient length if < 1
+target_rt_tolerance: 0.0 # default is to optimize automatically

quant_window: 3
quant_all: True
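# A minimal sketch (illustrative only, not alphadia's implementation) of how a
# tolerance with the dual semantics above could be resolved; the helper name
# and the gradient length argument are hypothetical.
def resolve_rt_tolerance(target_rt_tolerance: float, gradient_length_sec: float) -> float:
    if target_rt_tolerance > 1:
        return target_rt_tolerance  # absolute tolerance in seconds
    return target_rt_tolerance * gradient_length_sec  # proportion of the gradient

# With a 60-minute gradient: 240 -> 240 s, 0.05 -> 180 s. A value of 0.0 is
# treated as "optimize automatically" per the defaults above.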
@@ -98,9 +105,6 @@ search_advanced:

calibration:

-# minimum number of steps taken during the optimization lock (during which the elution groups used for optimization are extracted)
-optimization_lock_min_steps: 0

# Number of precursors searched and scored per batch
batch_size: 8000

@@ -110,17 +114,26 @@ calibration:
# the maximum number of steps that a given optimizer is permitted to take
max_steps: 20

-# the maximum number of steps that a given optimizer is permitted to take
+# the minimum number of steps that a given optimizer must take before it can be said to have converged
min_steps: 2

# the maximum number of times an automatic optimizer can be skipped before it is considered to have converged
max_skips: 1

# TODO: remove this parameter
final_full_calibration: False

# TODO: remove this parameter
norm_rt_mode: 'linear'

# the maximum number of fragments with correlation scores exceeding correlation_threshold to use for calibrating fragment mz (i.e. ms2)
max_fragments: 5000

# the correlation threshold for fragments used to calibrate fragment mz (i.e. ms2)
min_correlation: 0.7
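# A minimal sketch of the step bookkeeping described above (assumed mechanics,
# not the actual alphadia optimizer; StepTracker and feature_plateaued are
# hypothetical names).
class StepTracker:
    def __init__(self, min_steps: int = 2, max_steps: int = 20, max_skips: int = 1):
        self.min_steps = min_steps
        self.max_steps = max_steps
        self.max_skips = max_skips
        self.steps = 0   # completed optimization steps
        self.skips = 0   # times this optimizer was skipped

    def has_converged(self, feature_plateaued: bool) -> bool:
        if self.skips > self.max_skips:   # skipped too often: treat as converged
            return True
        if self.steps >= self.max_steps:  # hard cap on the number of steps
            return True
        # convergence is only allowed once min_steps have been taken
        return self.steps >= self.min_steps and feature_plateaued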

search_initial:
-# Number of peak groups identified in the convolution score to classify with target decoy comeptition
+# Number of peak groups identified in the convolution score to classify with target decoy competition
initial_num_candidates: 1

# initial ms1 tolerance in ppm
@@ -132,7 +145,7 @@ search_initial:
# initial ion mobility tolerance in 1/K_0
initial_mobility_tolerance: 0.08

-# initial retention time tolerance in seconds
+# initial retention time tolerance in seconds if > 1, or a proportion of the total gradient length if < 1
initial_rt_tolerance: 240

selection_config:
@@ -166,6 +179,38 @@ scoring_config:
precursor_mz_tolerance: 10
fragment_mz_tolerance: 15

# perform non-isobaric multiplexing of any input library
library_multiplexing:
# if true, the library is multiplexed
enabled: False

# if the input library already contains multiplexed channels, the input channel has to be specified.
input_channel: 0

# define channels by their name and how modifications should be translated from the input library to the multiplexed library
# channels can be either a number or a string
# for every channel, the library gets copied and the modifications are translated according to the mapping
# the following example shows how to multiplex mTRAQ to three sample channels and a decoy channel
multiplex_mapping: {}

#  0:
#    mTRAQ@K: mTRAQ@K
#    mTRAQ@Any_N-term: mTRAQ@Any_N-term
#  4:
#    mTRAQ@K: mTRAQ:13C(3)15N(1)@K
#    mTRAQ@Any_N-term: mTRAQ:13C(3)15N(1)@Any_N-term
#  8:
#    mTRAQ@K: mTRAQ:13C(6)15N(2)@K
#    mTRAQ@Any_N-term: mTRAQ:13C(6)15N(2)@Any_N-term
#  12:
#    mTRAQ@K: mTRAQ:d12@K
#    mTRAQ@Any_N-term: mTRAQ:d12@Any_N-term



multiplexing:
enabled: False
target_channels: '4,8'
@@ -192,8 +237,71 @@ search_output:
# can be either "parquet" or "tsv"
file_format: "tsv"

# Configuration for the optimization of search parameters. These parameters should not normally be adjusted and are for the use of experienced users only.
optimization:
# The order in which to perform optimization. Should be a list of lists of parameter names
# Example:
# order_of_optimization:
# - - "rt_error"
# - - "ms2_error"
# - - "ms1_error"
# - - "mobility_error"
# The above means that first rt_error is optimized, then ms2_error, then ms1_error, and finally mobility_error. (Other examples are shown in Python list format rather than YAML format to save space.)
# Example: [['ms1_error', 'ms2_error', 'rt_error', 'mobility_error']] means that all parameters are optimized simultaneously.
# Example: [["ms2_error"], ["rt_error"], ["ms1_error"], ["mobility_error"]] means that the parameters are optimized sequentially in the order given.
# Example: [["rt_error"], ["ms1_error", "ms2_error"]] means that first rt_error is optimized, then ms1_error and ms2_error are optimized simultaneously, and mobility_error is not optimized at all.
# If order_of_optimization is null, first all targeted optimizers run simultaneously, then any remaining automatic optimizers run sequentially in the order [["ms2_error"], ["rt_error"], ["ms1_error"], ["mobility_error"]]
order_of_optimization: null

# Parameters for the update rule for each parameter:
# - update_percentile_range: the percentile interval to use (as a decimal)
# - update_factor: the factor by which to multiply the result from the percentile interval to get the new parameter value for the next round of search
# - try_narrower_values: if True, the optimization will try narrower parameter values until a substantial (as determined by maximal_decrease) decrease in the feature used for optimization is observed.
# - maximal_decrease: the maximal permissible decrease in the feature value before optimization stops (only relevant if try_narrower_values is True).
# For example, a value of 0.2 indicates that up to a 20% decrease from the previous value is permissible.
# - favour_narrower_optimum: if True, the optimization will not take the value that maximizes the feature used for optimization, but instead the smallest value compatible with maximum_decrease_from_maximum.
# This setting can be useful for parameters where many values yield similar feature values, in which case favouring narrower values helps to overcome noise.
# - maximum_decrease_from_maximum: the maximum proportional decrease from the maximum feature value that the designated optimum may show (only relevant if favour_narrower_optimum is True).
# For example, a value of 0.1 indicates that the optimum should be no more than 10% below the maximum value.
ms2_error:
targeted_update_percentile_range: 0.95
targeted_update_factor: 1.0
automatic_update_percentile_range: 0.99
automatic_update_factor: 1.1
try_narrower_values: True
maximal_decrease: 0.5
favour_narrower_optimum: False
maximum_decrease_from_maximum: 0.1
ms1_error:
targeted_update_percentile_range: 0.95
targeted_update_factor: 1.0
automatic_update_percentile_range: 0.99
automatic_update_factor: 1.1
try_narrower_values: False
maximal_decrease: 0.2
favour_narrower_optimum: False
maximum_decrease_from_maximum: 0.1
mobility_error:
targeted_update_percentile_range: 0.95
targeted_update_factor: 1.0
automatic_update_percentile_range: 0.99
automatic_update_factor: 1.1
try_narrower_values: False
maximal_decrease: 0.2
favour_narrower_optimum: False
maximum_decrease_from_maximum: 0.1
rt_error:
targeted_update_percentile_range: 0.95
targeted_update_factor: 1.0
automatic_update_percentile_range: 0.99
automatic_update_factor: 1.1
try_narrower_values: True
maximal_decrease: 0.2
favour_narrower_optimum: True
maximum_decrease_from_maximum: 0.1
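# A rough sketch of the update rule described above (assumed mechanics, not the
# actual alphadia code): the next tolerance is the calibration residual at the
# configured percentile, scaled by the update factor.
import numpy as np

def propose_tolerance(residuals, update_percentile_range, update_factor):
    edge = np.percentile(np.abs(residuals), 100 * update_percentile_range)
    return update_factor * edge

# e.g. a targeted ms2 update: propose_tolerance(ppm_residuals, 0.95, 1.0)
# e.g. an automatic ms2 update: propose_tolerance(ppm_residuals, 0.99, 1.1)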

# configuration for the optimization manager
-# initial parameters, will nbe optimized
+# initial parameters, will be optimized
optimization_manager:
fwhm_rt: 5
fwhm_mobility: 0.01
12 changes: 6 additions & 6 deletions alphadia/exceptions.py
@@ -37,19 +37,19 @@ class NoPsmFoundError(BusinessError):
_msg = "No psm files accumulated, can't continue"


-class NoRecalibrationTargetError(BusinessError):
-    """Raise when no recalibration target is found."""
+class NoOptimizationLockTargetError(BusinessError):
+    """Raise when the optimization lock target is not found."""

_error_code = "NO_RECALIBRATION_TARGET"
_error_code = "NO_OPTIMIZATION_LOCK_TARGET"

_msg = "Searched all data without finding recalibration target"
_msg = "Searched all data without finding optimization lock target"

_detail_msg = """Search for raw file failed as not enough precursors were found for calibration.
_detail_msg = """Search for raw file failed as not enough precursors were found for calibration and optimization.
This can have the following reasons:
1. The sample was empty and therefore no precursors were found.
2. The sample contains only very few precursors.
For small libraries, try to set recalibration_target to a lower value.
-For large libraries, try to reduce the library size and reduce the calibration MS1 and MS2 tolerance.
+For large libraries, try to reduce the library size and reduce the initial MS1 and MS2 tolerance.
3. There was a fundamental issue with search parameters."""
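# A minimal sketch of how this error might surface at a call site (the entry
# point and the logging call are hypothetical, not part of this commit):
try:
    search_raw_file(raw_file_path)  # hypothetical search entry point
except NoOptimizationLockTargetError as err:
    logging.error("Search failed: %s", err)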


81 changes: 81 additions & 0 deletions alphadia/libtransform.py
@@ -2,11 +2,13 @@
import logging
import os
import typing
from functools import reduce
from pathlib import Path

# third party imports
import numpy as np
import pandas as pd
from alphabase.constants.modification import MOD_DF

# alpha family imports
from alphabase.peptide import fragment
@@ -253,6 +255,7 @@ def __init__(
nce: int = 25,
instrument: str = "Lumos",
peptdeep_model_path: str | None = None,
peptdeep_model_type: str | None = None,
fragment_types: list[str] | None = None,
max_fragment_charge: int = 2,
) -> None:
@@ -279,6 +282,11 @@ def __init__(
peptdeep_model_path : str, optional
Path to a folder containing PeptDeep models. If not provided, the default models will be used.
peptdeep_model_type : str, optional
Use another peptdeep model provided by the peptdeep model manager.
Default is None, which means the default peptdeep model ("generic") is used.
Possible values are ['generic', 'phospho', 'digly'].
fragment_types : List[str], optional
Fragment types to predict. Default is ["b", "y"].
@@ -296,6 +304,7 @@ def __init__(
self.instrument = instrument
self.mp_process_num = mp_process_num
self.peptdeep_model_path = peptdeep_model_path
self.peptdeep_model_type = peptdeep_model_type

self.fragment_types = fragment_types
self.max_fragment_charge = max_fragment_charge
@@ -313,6 +322,12 @@ def forward(self, input: SpecLibBase) -> SpecLibBase:
device = utils.get_torch_device(self.use_gpu)

model_mgr = ModelManager(device=device)

# load a model other than the default generic model
if self.peptdeep_model_type is not None:
logging.info(f"Loading PeptDeep models of type {self.peptdeep_model_type}")
model_mgr.load_installed_models(self.peptdeep_model_type)

if self.peptdeep_model_path is not None:
if not os.path.exists(self.peptdeep_model_path):
raise ValueError(
@@ -602,6 +617,72 @@ def forward(self, input: SpecLibBase) -> SpecLibBase:
return input


class MultiplexLibrary(ProcessingStep):
def __init__(self, multiplex_mapping: dict, input_channel: str | int | None = None):
"""Initialize the MultiplexLibrary step."""

self._multiplex_mapping = multiplex_mapping
self._input_channel = input_channel

def validate(self, input: SpecLibBase) -> bool:
    """Validate the input object. It is expected that the input is a SpecLibBase object."""
valid = True
valid &= isinstance(input, SpecLibBase)

# check if all modifications are valid
for _, channel_multiplex_mapping in self._multiplex_mapping.items():
for key, value in channel_multiplex_mapping.items():
for mod in [key, value]:
if mod not in MOD_DF.index:
logger.error(f"Modification {mod} not found in input library")
valid = False

if "channel" in input.precursor_df.columns:
channel_unique = input.precursor_df["channel"].unique()
if self._input_channel not in channel_unique:
logger.error(
f"Input library does not contain channel {self._input_channel}"
)
valid = False

if (len(channel_unique) > 1) and (self._input_channel is None):
logger.error(
f"Input library contains multiple channels {channel_unique}. Please specify a channel."
)
valid = False

return valid

def forward(self, input: SpecLibBase) -> SpecLibBase:
"""Apply the MultiplexLibrary step to the input object."""

if "channel" in input.precursor_df.columns:
input.precursor_df = input.precursor_df[
input.precursor_df["channel"] == self._input_channel
]

channel_lib_list = []
for channel, channel_mod_translations in self._multiplex_mapping.items():
logger.info(f"Multiplexing library for channel {channel}")
channel_lib = input.copy()
for original_mod, channel_mod in channel_mod_translations.items():
channel_lib._precursor_df["mods"] = channel_lib._precursor_df[
"mods"
].str.replace(original_mod, channel_mod)
channel_lib._precursor_df["channel"] = channel

channel_lib.calc_fragment_mz_df()
channel_lib_list.append(channel_lib)

def apply_func(x, y):
    x.append(y)
    return x

# concatenate the per-channel libraries into a single SpecLibBase
speclib = reduce(apply_func, channel_lib_list)
speclib.remove_unused_fragments()
return speclib
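# A minimal usage sketch (assumptions: `speclib` is a SpecLibBase loaded
# elsewhere; the mTRAQ mapping mirrors the commented default.yaml example above):
multiplex_mapping = {
    0: {"mTRAQ@K": "mTRAQ@K",
        "mTRAQ@Any_N-term": "mTRAQ@Any_N-term"},
    4: {"mTRAQ@K": "mTRAQ:13C(3)15N(1)@K",
        "mTRAQ@Any_N-term": "mTRAQ:13C(3)15N(1)@Any_N-term"},
    8: {"mTRAQ@K": "mTRAQ:13C(6)15N(2)@K",
        "mTRAQ@Any_N-term": "mTRAQ:13C(6)15N(2)@Any_N-term"},
}

step = MultiplexLibrary(multiplex_mapping, input_channel=0)
if step.validate(speclib):
    speclib = step.forward(speclib)  # one translated library copy per channel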


class FlattenLibrary(ProcessingStep):
def __init__(
self, top_k_fragments: int = 12, min_fragment_intensity: float = 0.01
7 changes: 3 additions & 4 deletions alphadia/outputaccumulator.py
@@ -137,15 +137,14 @@ def parse_output_folder(
psm_df["raw_name"] = foldername

# remove decoy precursors
-psm_df = psm_df[psm_df["decoy"] == 0]
+# assert that decoy is int
+psm_df["decoy"] = psm_df["decoy"].astype(int)
+psm_df = psm_df[psm_df["decoy"] == 0].reset_index(drop=True)

self._precursor_df = pd.DataFrame()
for col in psm_df.columns:
self._precursor_df[col] = psm_df[col]

self._precursor_df["decoy"] = self._precursor_df["decoy"].astype(int)
self._precursor_df = psm_df[psm_df["decoy"] == 0].reset_index(drop=True)

# self._precursor_df.set_index('precursor_idx', inplace=True)
# Change the data type of the mods column to string
self._precursor_df["mods"] = self._precursor_df["mods"].astype(str)