Add first version of SequencePreprocessor (not tested and no examples)

breimanntools · Jun 28, 2024 · d729bdf · d729bdf
1 parent ca9e339
commit d729bdf
Show file tree

Hide file tree

Showing 10 changed files with 266 additions and 191 deletions.
diff --git a/aaanalysis/__init__.py b/aaanalysis/__init__.py
@@ -18,7 +18,6 @@
     "read_fasta",
     "to_fasta",
     # "comp_seq_sim",       BioPython
-    # "comp_pw_seq_sim",    BioPython
     # "filter_seq",         BioPython
     "SequencePreprocessor",
     "AAclust",

diff --git a/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py b/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py
@@ -1,5 +1,5 @@
 """
-This is a script for ...
+This is a script for the backend of the SequenceProcessor().encode_integer() method.
 """
 import time
 import pandas as pd
@@ -11,48 +11,25 @@
 
 
 # II Main Functions
-def encode_integer(list_seq: List[str] = None,
-                   alphabet: str = "ARNDCEQGHILKMFPSTWYV",
-                   gap: str = "_",
-                   pad_at: Literal["C", "N"] = "C",
-                   ) -> np.array:
+def encode_integer(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"):
     """
     Integer-encode a list of protein sequences into a feature matrix, padding shorter sequences
     with gaps represented as zero vectors.
-
-    Parameters:
-    ----------
-    list_seq : List of str
-        List of protein sequences to encode.
-    alphabet : str, default='ARNDCEQGHILKMFPSTWYV'
-        The alphabet of amino acids used for encoding. The gap character is not part of the alphabet.
-    gap : str, default='_'
-        The character used to represent gaps in sequences.
-    pad_at : Literal['N', 'C'], default='C'
-        Specifies where to add the padding:
-        'N' for N-terminus (beginning of the sequence),
-        'C' for C-terminus (end of the sequence).
-
-    Returns:
-    -------
-    np.array
-        A numpy array where each row represents an encoded sequence, and each column represents a feature.
-
     """
     # Map amino acids to integers
     aa_to_int = {aa: idx + 1 for idx, aa in enumerate(alphabet)}
     aa_to_int[gap] = 0
 
     # Pad sequences
     padded_sequences = pad_sequences(list_seq, pad_at=pad_at, gap=gap)
-
-    # Create integer encoding
+    # Create feature names
     max_length = len(padded_sequences[0])
+    list_features = [f"P{i}" for i in range(1, max_length+1)]
+    # Create integer encoding
     feature_matrix = np.zeros((len(padded_sequences), max_length), dtype=int)
     for idx, seq in enumerate(padded_sequences):
         encoded_seq = [aa_to_int[aa] for aa in seq]
         feature_matrix[idx, :] = encoded_seq
-
-    return feature_matrix
+    return feature_matrix, list_features
 
 
diff --git a/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py b/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py
@@ -1,5 +1,5 @@
 """
-This is a script for creating one-hot-encoding of sequences used as baseline representation.
+This is a script for the backend of the SequenceProcessor().encode_one_hot() method.
 """
 import pandas as pd
 from typing import Optional, Dict, Union, List, Tuple, Type, Literal
@@ -14,35 +14,30 @@ def _one_hot_encode(amino_acid=None, alphabet=None, gap="_"):
     Encodes a single amino acid into a one-hot vector based on a specified alphabet.
     Returns a zero vector for gaps represented as '_'.
     """
-    index_dict = {aa: i for i, aa in enumerate(alphabet)}
+    dict_aa_index = {aa: i for i, aa in enumerate(alphabet)}
     vector = np.zeros(len(alphabet), dtype=int)
     if amino_acid != gap:
-        if amino_acid in index_dict:
-            vector[index_dict[amino_acid]] = 1
-        else:
-            raise ValueError(f"Unrecognized amino acid '{amino_acid}' not in alphabet.")
+        vector[dict_aa_index[amino_acid]] = 1
     return vector
 
 
 # II Main Functions
-# TODO finish, docu, test, example ..
-def encode_one_hot(list_seq: List[str] = None,
-                   alphabet: str = "ARNDCEQGHILKMFPSTWYV",
-                   gap: str = "_",
-                   pad_at: Literal["C", "N"] = "C",
-                   ) -> np.array:
+def encode_one_hot(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"):
     """
     One-hot-encode a list of protein sequences into a feature matrix with padding shorter sequences
     with gaps represented as zero vectors.
     """
     # Pad sequences
     padded_sequences = pad_sequences(list_seq, pad_at=pad_at, gap=gap)
-    # Create one-hot-encoding
+    # Create feature names
     max_length = len(padded_sequences[0])
+    list_features = [f"{i}{aa}" for i in range(1, max_length+1) for aa in alphabet]
+    # Create one-hot-encoding
     num_amino_acids = len(alphabet)
     feature_matrix = np.zeros((len(padded_sequences), max_length * num_amino_acids), dtype=int)
     args = dict(alphabet=alphabet, gap=gap)
     for idx, seq in enumerate(padded_sequences):
         encoded_seq = [_one_hot_encode(amino_acid=aa, **args) for aa in seq]
         feature_matrix[idx, :] = np.array(encoded_seq).flatten()
-    return feature_matrix
+    return feature_matrix, list_features
+
diff --git a/aaanalysis/data_handling/_backend/seq_preproc/get_aa_window.py b/aaanalysis/data_handling/_backend/seq_preproc/get_aa_window.py
@@ -1,5 +1,5 @@
 """
-This is a script for ...
+This is a script for the backend of the SequenceProcessor().get_aa_window() method.
 """
 import time
 import pandas as pd
@@ -10,55 +10,15 @@
 
 
 # II Main Functions
-def get_aa_window(seq: str, pos_start: int, pos_stop: int = None, window_size: int = None, gap: str = '-', accept_gap: bool = True) -> str:
-    """
-    Extracts a window of amino acids from a sequence, padding with gaps if necessary.
-
-    Parameters:
-    ----------
-    seq : str
-        The protein sequence from which to extract the window.
-    pos_start : int
-        The starting position of the window (1-based index).
-    pos_end : int, optional
-        The ending position of the window (1-based index). If None, window_size is used.
-    window_size : int, optional
-        The size of the window to extract. Only used if pos_end is None.
-    gap : str, default='-'
-        The character used to represent gaps.
-    accept_gap : bool, default=True
-        Whether to accept gaps in the window. If False, windows containing gaps are rejected.
-
-    Returns:
-    -------
-    str
-        The extracted window of amino acids, padded with gaps if necessary.
-
-    Raises:
-    ------
-    ValueError:
-        If both pos_end and window_size are None, or if the window contains gaps and accept_gap is False.
-    """
-    if pos_stop is None and window_size is None:
-        raise ValueError("Either pos_end or window_size must be specified.")
-
+def get_aa_window(seq=None, pos_start=None, pos_stop=None, window_size=None, gap='-'):
+    """Extracts a window of amino acids from a sequence, padding with gaps if necessary."""
     if pos_stop is None:
         pos_stop = pos_start + window_size - 1
-
-    # Convert 1-based positions to 0-based indices
-    pos_start -= 1
-    pos_stop -= 1
-
     # Calculate the necessary padding if pos_end exceeds sequence length
     seq_length = len(seq)
     if pos_stop >= seq_length:
         seq += gap * (pos_stop - seq_length + 1)
-
     # Extract the window
     window = seq[pos_start:pos_stop + 1]
-
-    if not accept_gap and gap in window:
-        raise ValueError("The window contains gaps and accept_gap is set to False.")
-
     return window
 
diff --git a/aaanalysis/data_handling/_backend/seq_preproc/get_sliding_aa_window.py b/aaanalysis/data_handling/_backend/seq_preproc/get_sliding_aa_window.py
@@ -1,53 +1,25 @@
 """
-This is a script for ...
+This is a script for the backend of the SequenceProcessor().get_sliding_aa_window() method.
 """
 import pandas as pd
 
 from .get_aa_window import get_aa_window
 
-# Settings
-pd.set_option('expand_frame_repr', False)  # Single line print for pd.Dataframe
-
 
 # I Helper Functions
 
 
 # II Main Functions
-def get_sliding_aa_window(seq: str,
-                          slide_start: int,
-                          slide_stop: int = None,
-                          window_size: int = 5,
-                          gap: str = '-',
-                          accept_gap: bool = True):
-    """
-    Extracts sliding list_windows of amino acids from a sequence.
-
-    Parameters:
-    ----------
-    seq : str
-        The protein sequence from which to extract the list_windows.
-    slide_start : int
-        The starting position for sliding window extraction (1-based index).
-    slide_end : int, optional
-        The ending position for sliding window extraction (1-based index). If None, extract all possible list_windows.
-    window_size : int, default=5
-        The size of each window to extract.
-    gap : str, default='-'
-        The character used to represent gaps.
-    accept_gap : bool, default=True
-        Whether to accept gaps in the list_windows. If False, list_windows containing gaps are rejected.
-
-    Returns:
-    -------
-    List[str]
-        A list of extracted list_windows of amino acids.
-    """
+def get_sliding_aa_window(seq=None, slide_start=0, slide_stop=None, window_size=5, gap='-', index1=False):
+    """Extracts sliding list_windows of amino acids from a sequence"""
     if slide_stop is None:
-        slide_stop = len(seq)
-        if not accept_gap:
-            slide_stop -= window_size
+        slide_stop = len(seq) - 1
+        if index1:
+            slide_stop += 1
+    n_windows = slide_stop - window_size - slide_start + 1
     list_windows = []
-    for start in range(slide_start, slide_stop + 1):
-        aa_window = get_aa_window(seq, pos_start=start, window_size=window_size, gap=gap, accept_gap=accept_gap)
+    for start in range(slide_start, slide_start + n_windows + 1):
+        # Do not provide index1 again (it will be otherwise two time corrected)
+        aa_window = get_aa_window(seq, pos_start=start, window_size=window_size, gap=gap)
         list_windows.append(aa_window)
     return list_windows