Merge branch 'development' into test-interval-bug-fix
mo-sameh authored Sep 11, 2024
2 parents e90126b + 3867112 commit 631104c
Showing 23 changed files with 2,500 additions and 724 deletions.
126 changes: 117 additions & 9 deletions alphadia/constants/default.yaml
@@ -48,6 +48,9 @@ library_prediction:
# set path for custom peptdeep model. If set to null, the default model will be used
peptdeep_model_path: null

# set peptdeep model type. Possible values are 'generic', 'phospho', 'digly'. If set to null, the generic model will be used
peptdeep_model_type: null

# define custom alphabase modifications not part of unimod or alphabase
# also used for decoy channels
custom_modififcations:
@@ -85,10 +88,14 @@ search:
compete_for_fragments: True

target_num_candidates: 2
# target ms1 tolerance in ppm
target_ms1_tolerance: 15
# target ms2 tolerance in ppm
target_ms2_tolerance: 15
-target_mobility_tolerance: 0.04
-target_rt_tolerance: 60
+# target ion mobility tolerance in 1/K_0
+target_mobility_tolerance: 0.0 # default is to optimize automatically
+# target retention time tolerance in seconds if > 1, or a proportion of the total gradient length if < 1
+target_rt_tolerance: 0.0 # default is to optimize automatically

quant_window: 3
quant_all: True
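# A minimal sketch (illustrative only, not alphadia's implementation) of how a
# tolerance with the dual semantics above could be resolved; the helper name
# and the gradient length argument are hypothetical.
def resolve_rt_tolerance(target_rt_tolerance: float, gradient_length_sec: float) -> float:
    if target_rt_tolerance > 1:
        return target_rt_tolerance  # absolute tolerance in seconds
    return target_rt_tolerance * gradient_length_sec  # proportion of the gradient

# With a 60-minute gradient: 240 -> 240 s, 0.05 -> 180 s. A value of 0.0 is
# treated as "optimize automatically" per the defaults above.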
@@ -98,9 +105,6 @@ search_advanced:

calibration:

-# minimum number of steps taken during the optimization lock (during which the elution groups used for optimization are extracted)
-optimization_lock_min_steps: 0

# Number of precursors searched and scored per batch
batch_size: 8000

@@ -110,17 +114,26 @@ calibration:
# the maximum number of steps that a given optimizer is permitted to take
max_steps: 20

-# the maximum number of steps that a given optimizer is permitted to take
+# the minimum number of steps that a given optimizer must take before it can be said to have converged
min_steps: 2

# the maximum number of times an automatic optimizer can be skipped before it is considered to have converged
max_skips: 1

# TODO: remove this parameter
final_full_calibration: False

# TODO: remove this parameter
norm_rt_mode: 'linear'

# the maximum number of fragments with correlation scores exceeding correlation_threshold to use for calibrating fragment mz (i.e. ms2)
max_fragments: 5000

# the correlation threshold for fragments used to calibrate fragment mz (i.e. ms2)
min_correlation: 0.7
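# A minimal sketch of the step bookkeeping described above (assumed mechanics,
# not the actual alphadia optimizer; StepTracker and feature_plateaued are
# hypothetical names).
class StepTracker:
    def __init__(self, min_steps: int = 2, max_steps: int = 20, max_skips: int = 1):
        self.min_steps = min_steps
        self.max_steps = max_steps
        self.max_skips = max_skips
        self.steps = 0   # completed optimization steps
        self.skips = 0   # times this optimizer was skipped

    def has_converged(self, feature_plateaued: bool) -> bool:
        if self.skips > self.max_skips:   # skipped too often: treat as converged
            return True
        if self.steps >= self.max_steps:  # hard cap on the number of steps
            return True
        # convergence is only allowed once min_steps have been taken
        return self.steps >= self.min_steps and feature_plateaued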

search_initial:
-# Number of peak groups identified in the convolution score to classify with target decoy comeptition
+# Number of peak groups identified in the convolution score to classify with target decoy competition
initial_num_candidates: 1

# initial ms1 tolerance in ppm
@@ -132,7 +145,7 @@ search_initial:
# initial ion mobility tolerance in 1/K_0
initial_mobility_tolerance: 0.08

-# initial retention time tolerance in seconds
+# initial retention time tolerance in seconds if > 1, or a proportion of the total gradient length if < 1
initial_rt_tolerance: 240

selection_config:
@@ -166,6 +179,38 @@ scoring_config:
precursor_mz_tolerance: 10
fragment_mz_tolerance: 15

# perform non-isobaric multiplexing of any input library
library_multiplexing:
# if true, the library is multiplexed
enabled: False

# if the input library already contains multiplexed channels, the input channel has to be specified.
input_channel: 0

# define channels by their name and how modifications should be translated from the input library to the multiplexed library
# channels can be either a number or a string
# for every channel, the library gets copied and the modifications are translated according to the mapping
# the following example shows how to multiplex mTRAQ to three sample channels and a decoy channel
multiplex_mapping: {}

#  0:
#    mTRAQ@K: mTRAQ@K
#    mTRAQ@Any_N-term: mTRAQ@Any_N-term
#  4:
#    mTRAQ@K: mTRAQ:13C(3)15N(1)@K
#    mTRAQ@Any_N-term: mTRAQ:13C(3)15N(1)@Any_N-term
#  8:
#    mTRAQ@K: mTRAQ:13C(6)15N(2)@K
#    mTRAQ@Any_N-term: mTRAQ:13C(6)15N(2)@Any_N-term
#  12:
#    mTRAQ@K: mTRAQ:d12@K
#    mTRAQ@Any_N-term: mTRAQ:d12@Any_N-term



multiplexing:
enabled: False
target_channels: '4,8'
@@ -192,8 +237,71 @@ search_output:
# can be either "parquet" or "tsv"
file_format: "tsv"

# Configuration for the optimization of search parameters. These parameters should not normally be adjusted and are for the use of experienced users only.
optimization:
# The order in which to perform optimization. Should be a list of lists of parameter names
# Example:
# order_of_optimization:
# - - "rt_error"
# - - "ms2_error"
# - - "ms1_error"
# - - "mobility_error"
# The above means that first rt_error is optimized, then ms2_error, then ms1_error, and finally mobility_error. (Other examples are shown in Python list format rather than YAML format to save space.)
# Example: [['ms1_error', 'ms2_error', 'rt_error', 'mobility_error']] means that all parameters are optimized simultaneously.
# Example: [["ms2_error"], ["rt_error"], ["ms1_error"], ["mobility_error"]] means that the parameters are optimized sequentially in the order given.
# Example: [["rt_error"], ["ms1_error", "ms2_error"]] means that first rt_error is optimized, then ms1_error and ms2_error are optimized simultaneously, and mobility_error is not optimized at all.
# If order_of_optimization is null, first all targeted optimizers run simultaneously, then any remaining automatic optimizers run sequentially in the order [["ms2_error"], ["rt_error"], ["ms1_error"], ["mobility_error"]]
order_of_optimization: null

# Parameters for the update rule for each parameter:
# - update_percentile_range: the percentile interval to use (as a decimal)
# - update_factor: the factor by which to multiply the result from the percentile interval to get the new parameter value for the next round of search
# - try_narrower_values: if True, the optimization will try narrower parameter values until a substantial (as determined by maximal_decrease) decrease in the feature used for optimization is observed.
# - maximal_decrease: the maximal permissible decrease in the feature value before optimization stops (only relevant if try_narrower_values is True).
# For example, a value of 0.2 indicates that up to a 20% decrease from the previous value is permissible.
# - favour_narrower_optimum: if True, the optimization will not take the value that maximizes the feature used for optimization, but instead the smallest value compatible with maximum_decrease_from_maximum.
# This setting can be useful for parameters where many values yield similar feature values, in which case favouring narrower values helps to overcome noise.
# - maximum_decrease_from_maximum: the maximum proportional decrease from the maximum feature value that the designated optimum may show (only relevant if favour_narrower_optimum is True).
# For example, a value of 0.1 indicates that the optimum should be no more than 10% below the maximum value.
ms2_error:
targeted_update_percentile_range: 0.95
targeted_update_factor: 1.0
automatic_update_percentile_range: 0.99
automatic_update_factor: 1.1
try_narrower_values: True
maximal_decrease: 0.5
favour_narrower_optimum: False
maximum_decrease_from_maximum: 0.1
ms1_error:
targeted_update_percentile_range: 0.95
targeted_update_factor: 1.0
automatic_update_percentile_range: 0.99
automatic_update_factor: 1.1
try_narrower_values: False
maximal_decrease: 0.2
favour_narrower_optimum: False
maximum_decrease_from_maximum: 0.1
mobility_error:
targeted_update_percentile_range: 0.95
targeted_update_factor: 1.0
automatic_update_percentile_range: 0.99
automatic_update_factor: 1.1
try_narrower_values: False
maximal_decrease: 0.2
favour_narrower_optimum: False
maximum_decrease_from_maximum: 0.1
rt_error:
targeted_update_percentile_range: 0.95
targeted_update_factor: 1.0
automatic_update_percentile_range: 0.99
automatic_update_factor: 1.1
try_narrower_values: True
maximal_decrease: 0.2
favour_narrower_optimum: True
maximum_decrease_from_maximum: 0.1
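# A rough sketch of the update rule described above (assumed mechanics, not the
# actual alphadia code): the next tolerance is the calibration residual at the
# configured percentile, scaled by the update factor.
import numpy as np

def propose_tolerance(residuals, update_percentile_range, update_factor):
    edge = np.percentile(np.abs(residuals), 100 * update_percentile_range)
    return update_factor * edge

# e.g. a targeted ms2 update: propose_tolerance(ppm_residuals, 0.95, 1.0)
# e.g. an automatic ms2 update: propose_tolerance(ppm_residuals, 0.99, 1.1)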

# configuration for the optimization manager
-# initial parameters, will nbe optimized
+# initial parameters, will be optimized
optimization_manager:
fwhm_rt: 5
fwhm_mobility: 0.01
12 changes: 6 additions & 6 deletions alphadia/exceptions.py
@@ -37,19 +37,19 @@ class NoPsmFoundError(BusinessError):
_msg = "No psm files accumulated, can't continue"


-class NoRecalibrationTargetError(BusinessError):
-    """Raise when no recalibration target is found."""
+class NoOptimizationLockTargetError(BusinessError):
+    """Raise when the optimization lock target is not found."""

_error_code = "NO_RECALIBRATION_TARGET"
_error_code = "NO_OPTIMIZATION_LOCK_TARGET"

_msg = "Searched all data without finding recalibration target"
_msg = "Searched all data without finding optimization lock target"

_detail_msg = """Search for raw file failed as not enough precursors were found for calibration.
_detail_msg = """Search for raw file failed as not enough precursors were found for calibration and optimization.
This can have the following reasons:
1. The sample was empty and therefore no precursors were found.
2. The sample contains only very few precursors.
For small libraries, try to set recalibration_target to a lower value.
-For large libraries, try to reduce the library size and reduce the calibration MS1 and MS2 tolerance.
+For large libraries, try to reduce the library size and reduce the initial MS1 and MS2 tolerance.
3. There was a fundamental issue with search parameters."""
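# A minimal sketch of how this error might surface at a call site (the entry
# point and the logging call are hypothetical, not part of this commit):
try:
    search_raw_file(raw_file_path)  # hypothetical search entry point
except NoOptimizationLockTargetError as err:
    logging.error("Search failed: %s", err)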


81 changes: 81 additions & 0 deletions alphadia/libtransform.py
@@ -2,11 +2,13 @@
import logging
import os
import typing
from functools import reduce
from pathlib import Path

# third party imports
import numpy as np
import pandas as pd
from alphabase.constants.modification import MOD_DF

# alpha family imports
from alphabase.peptide import fragment
@@ -253,6 +255,7 @@ def __init__(
nce: int = 25,
instrument: str = "Lumos",
peptdeep_model_path: str | None = None,
peptdeep_model_type: str | None = None,
fragment_types: list[str] | None = None,
max_fragment_charge: int = 2,
) -> None:
@@ -279,6 +282,11 @@ def __init__(
peptdeep_model_path : str, optional
Path to a folder containing PeptDeep models. If not provided, the default models will be used.
peptdeep_model_type : str, optional
Use another peptdeep model provided by the peptdeep model manager.
Default is None, which means the default peptdeep model ("generic") is used.
Possible values are ['generic', 'phospho', 'digly'].
fragment_types : List[str], optional
Fragment types to predict. Default is ["b", "y"].
@@ -296,6 +304,7 @@ def __init__(
self.instrument = instrument
self.mp_process_num = mp_process_num
self.peptdeep_model_path = peptdeep_model_path
self.peptdeep_model_type = peptdeep_model_type

self.fragment_types = fragment_types
self.max_fragment_charge = max_fragment_charge
@@ -313,6 +322,12 @@ def forward(self, input: SpecLibBase) -> SpecLibBase:
device = utils.get_torch_device(self.use_gpu)

model_mgr = ModelManager(device=device)

# load a model other than the default generic model
if self.peptdeep_model_type is not None:
logging.info(f"Loading PeptDeep models of type {self.peptdeep_model_type}")
model_mgr.load_installed_models(self.peptdeep_model_type)

if self.peptdeep_model_path is not None:
if not os.path.exists(self.peptdeep_model_path):
raise ValueError(
@@ -602,6 +617,72 @@ def forward(self, input: SpecLibBase) -> SpecLibBase:
return input


class MultiplexLibrary(ProcessingStep):
def __init__(self, multiplex_mapping: dict, input_channel: str | int | None = None):
"""Initialize the MultiplexLibrary step."""

self._multiplex_mapping = multiplex_mapping
self._input_channel = input_channel

def validate(self, input: SpecLibBase) -> bool:
    """Validate the input object. It is expected that the input is a SpecLibBase object."""
valid = True
valid &= isinstance(input, SpecLibBase)

# check if all modifications are valid
for _, channel_multiplex_mapping in self._multiplex_mapping.items():
for key, value in channel_multiplex_mapping.items():
for mod in [key, value]:
if mod not in MOD_DF.index:
logger.error(f"Modification {mod} not found in input library")
valid = False

if "channel" in input.precursor_df.columns:
channel_unique = input.precursor_df["channel"].unique()
if self._input_channel not in channel_unique:
logger.error(
f"Input library does not contain channel {self._input_channel}"
)
valid = False

if (len(channel_unique) > 1) and (self._input_channel is None):
logger.error(
f"Input library contains multiple channels {channel_unique}. Please specify a channel."
)
valid = False

return valid

def forward(self, input: SpecLibBase) -> SpecLibBase:
"""Apply the MultiplexLibrary step to the input object."""

if "channel" in input.precursor_df.columns:
input.precursor_df = input.precursor_df[
input.precursor_df["channel"] == self._input_channel
]

channel_lib_list = []
for channel, channel_mod_translations in self._multiplex_mapping.items():
logger.info(f"Multiplexing library for channel {channel}")
channel_lib = input.copy()
for original_mod, channel_mod in channel_mod_translations.items():
channel_lib._precursor_df["mods"] = channel_lib._precursor_df[
"mods"
].str.replace(original_mod, channel_mod)
channel_lib._precursor_df["channel"] = channel

channel_lib.calc_fragment_mz_df()
channel_lib_list.append(channel_lib)

def apply_func(x, y):
    x.append(y)
    return x

# concatenate the per-channel libraries into a single SpecLibBase
speclib = reduce(apply_func, channel_lib_list)
speclib.remove_unused_fragments()
return speclib
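# A minimal usage sketch (assumptions: `speclib` is a SpecLibBase loaded
# elsewhere; the mTRAQ mapping mirrors the commented default.yaml example above):
multiplex_mapping = {
    0: {"mTRAQ@K": "mTRAQ@K",
        "mTRAQ@Any_N-term": "mTRAQ@Any_N-term"},
    4: {"mTRAQ@K": "mTRAQ:13C(3)15N(1)@K",
        "mTRAQ@Any_N-term": "mTRAQ:13C(3)15N(1)@Any_N-term"},
    8: {"mTRAQ@K": "mTRAQ:13C(6)15N(2)@K",
        "mTRAQ@Any_N-term": "mTRAQ:13C(6)15N(2)@Any_N-term"},
}

step = MultiplexLibrary(multiplex_mapping, input_channel=0)
if step.validate(speclib):
    speclib = step.forward(speclib)  # one translated library copy per channel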


class FlattenLibrary(ProcessingStep):
def __init__(
self, top_k_fragments: int = 12, min_fragment_intensity: float = 0.01
7 changes: 3 additions & 4 deletions alphadia/outputaccumulator.py
@@ -137,15 +137,14 @@ def parse_output_folder(
psm_df["raw_name"] = foldername

# remove decoy precursors
-psm_df = psm_df[psm_df["decoy"] == 0]
+# assert that decoy is int
+psm_df["decoy"] = psm_df["decoy"].astype(int)
+psm_df = psm_df[psm_df["decoy"] == 0].reset_index(drop=True)

self._precursor_df = pd.DataFrame()
for col in psm_df.columns:
self._precursor_df[col] = psm_df[col]

self._precursor_df["decoy"] = self._precursor_df["decoy"].astype(int)
self._precursor_df = psm_df[psm_df["decoy"] == 0].reset_index(drop=True)

# self._precursor_df.set_index('precursor_idx', inplace=True)
# Change the data type of the mods column to string
self._precursor_df["mods"] = self._precursor_df["mods"].astype(str)