diff --git a/aaanalysis/__init__.py b/aaanalysis/__init__.py index 8fb61699..43a80900 100644 --- a/aaanalysis/__init__.py +++ b/aaanalysis/__init__.py @@ -18,7 +18,6 @@ "read_fasta", "to_fasta", # "comp_seq_sim", BioPython - # "comp_pw_seq_sim", BioPython # "filter_seq", BioPython "SequencePreprocessor", "AAclust", diff --git a/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py b/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py index 35beaf60..fa4c09f1 100644 --- a/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py +++ b/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py @@ -1,5 +1,5 @@ """ -This is a script for ... +This is a script for the backend of the SequenceProcessor().encode_integer() method. """ import time import pandas as pd @@ -11,33 +11,10 @@ # II Main Functions -def encode_integer(list_seq: List[str] = None, - alphabet: str = "ARNDCEQGHILKMFPSTWYV", - gap: str = "_", - pad_at: Literal["C", "N"] = "C", - ) -> np.array: +def encode_integer(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"): """ Integer-encode a list of protein sequences into a feature matrix, padding shorter sequences with gaps represented as zero vectors. - - Parameters: - ---------- - list_seq : List of str - List of protein sequences to encode. - alphabet : str, default='ARNDCEQGHILKMFPSTWYV' - The alphabet of amino acids used for encoding. The gap character is not part of the alphabet. - gap : str, default='_' - The character used to represent gaps in sequences. - pad_at : Literal['N', 'C'], default='C' - Specifies where to add the padding: - 'N' for N-terminus (beginning of the sequence), - 'C' for C-terminus (end of the sequence). - - Returns: - ------- - np.array - A numpy array where each row represents an encoded sequence, and each column represents a feature. - """ # Map amino acids to integers aa_to_int = {aa: idx + 1 for idx, aa in enumerate(alphabet)} @@ -45,14 +22,14 @@ def encode_integer(list_seq: List[str] = None, # Pad sequences padded_sequences = pad_sequences(list_seq, pad_at=pad_at, gap=gap) - - # Create integer encoding + # Create feature names max_length = len(padded_sequences[0]) + list_features = [f"P{i}" for i in range(1, max_length+1)] + # Create integer encoding feature_matrix = np.zeros((len(padded_sequences), max_length), dtype=int) for idx, seq in enumerate(padded_sequences): encoded_seq = [aa_to_int[aa] for aa in seq] feature_matrix[idx, :] = encoded_seq - - return feature_matrix + return feature_matrix, list_features diff --git a/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py b/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py index 8087856e..19c8f508 100644 --- a/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py +++ b/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py @@ -1,5 +1,5 @@ """ -This is a script for creating one-hot-encoding of sequences used as baseline representation. +This is a script for the backend of the SequenceProcessor().encode_one_hot() method. """ import pandas as pd from typing import Optional, Dict, Union, List, Tuple, Type, Literal @@ -14,35 +14,30 @@ def _one_hot_encode(amino_acid=None, alphabet=None, gap="_"): Encodes a single amino acid into a one-hot vector based on a specified alphabet. Returns a zero vector for gaps represented as '_'. """ - index_dict = {aa: i for i, aa in enumerate(alphabet)} + dict_aa_index = {aa: i for i, aa in enumerate(alphabet)} vector = np.zeros(len(alphabet), dtype=int) if amino_acid != gap: - if amino_acid in index_dict: - vector[index_dict[amino_acid]] = 1 - else: - raise ValueError(f"Unrecognized amino acid '{amino_acid}' not in alphabet.") + vector[dict_aa_index[amino_acid]] = 1 return vector # II Main Functions -# TODO finish, docu, test, example .. -def encode_one_hot(list_seq: List[str] = None, - alphabet: str = "ARNDCEQGHILKMFPSTWYV", - gap: str = "_", - pad_at: Literal["C", "N"] = "C", - ) -> np.array: +def encode_one_hot(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"): """ One-hot-encode a list of protein sequences into a feature matrix with padding shorter sequences with gaps represented as zero vectors. """ # Pad sequences padded_sequences = pad_sequences(list_seq, pad_at=pad_at, gap=gap) - # Create one-hot-encoding + # Create feature names max_length = len(padded_sequences[0]) + list_features = [f"{i}{aa}" for i in range(1, max_length+1) for aa in alphabet] + # Create one-hot-encoding num_amino_acids = len(alphabet) feature_matrix = np.zeros((len(padded_sequences), max_length * num_amino_acids), dtype=int) args = dict(alphabet=alphabet, gap=gap) for idx, seq in enumerate(padded_sequences): encoded_seq = [_one_hot_encode(amino_acid=aa, **args) for aa in seq] feature_matrix[idx, :] = np.array(encoded_seq).flatten() - return feature_matrix + return feature_matrix, list_features + diff --git a/aaanalysis/data_handling/_backend/seq_preproc/get_aa_window.py b/aaanalysis/data_handling/_backend/seq_preproc/get_aa_window.py index 629b1735..507e0b78 100644 --- a/aaanalysis/data_handling/_backend/seq_preproc/get_aa_window.py +++ b/aaanalysis/data_handling/_backend/seq_preproc/get_aa_window.py @@ -1,5 +1,5 @@ """ -This is a script for ... +This is a script for the backend of the SequenceProcessor().get_aa_window() method. """ import time import pandas as pd @@ -10,55 +10,15 @@ # II Main Functions -def get_aa_window(seq: str, pos_start: int, pos_stop: int = None, window_size: int = None, gap: str = '-', accept_gap: bool = True) -> str: - """ - Extracts a window of amino acids from a sequence, padding with gaps if necessary. - - Parameters: - ---------- - seq : str - The protein sequence from which to extract the window. - pos_start : int - The starting position of the window (1-based index). - pos_end : int, optional - The ending position of the window (1-based index). If None, window_size is used. - window_size : int, optional - The size of the window to extract. Only used if pos_end is None. - gap : str, default='-' - The character used to represent gaps. - accept_gap : bool, default=True - Whether to accept gaps in the window. If False, windows containing gaps are rejected. - - Returns: - ------- - str - The extracted window of amino acids, padded with gaps if necessary. - - Raises: - ------ - ValueError: - If both pos_end and window_size are None, or if the window contains gaps and accept_gap is False. - """ - if pos_stop is None and window_size is None: - raise ValueError("Either pos_end or window_size must be specified.") - +def get_aa_window(seq=None, pos_start=None, pos_stop=None, window_size=None, gap='-'): + """Extracts a window of amino acids from a sequence, padding with gaps if necessary.""" if pos_stop is None: pos_stop = pos_start + window_size - 1 - - # Convert 1-based positions to 0-based indices - pos_start -= 1 - pos_stop -= 1 - # Calculate the necessary padding if pos_end exceeds sequence length seq_length = len(seq) if pos_stop >= seq_length: seq += gap * (pos_stop - seq_length + 1) - # Extract the window window = seq[pos_start:pos_stop + 1] - - if not accept_gap and gap in window: - raise ValueError("The window contains gaps and accept_gap is set to False.") - return window diff --git a/aaanalysis/data_handling/_backend/seq_preproc/get_sliding_aa_window.py b/aaanalysis/data_handling/_backend/seq_preproc/get_sliding_aa_window.py index a335fc99..005afe70 100644 --- a/aaanalysis/data_handling/_backend/seq_preproc/get_sliding_aa_window.py +++ b/aaanalysis/data_handling/_backend/seq_preproc/get_sliding_aa_window.py @@ -1,53 +1,25 @@ """ -This is a script for ... +This is a script for the backend of the SequenceProcessor().get_sliding_aa_window() method. """ import pandas as pd from .get_aa_window import get_aa_window -# Settings -pd.set_option('expand_frame_repr', False) # Single line print for pd.Dataframe - # I Helper Functions # II Main Functions -def get_sliding_aa_window(seq: str, - slide_start: int, - slide_stop: int = None, - window_size: int = 5, - gap: str = '-', - accept_gap: bool = True): - """ - Extracts sliding list_windows of amino acids from a sequence. - - Parameters: - ---------- - seq : str - The protein sequence from which to extract the list_windows. - slide_start : int - The starting position for sliding window extraction (1-based index). - slide_end : int, optional - The ending position for sliding window extraction (1-based index). If None, extract all possible list_windows. - window_size : int, default=5 - The size of each window to extract. - gap : str, default='-' - The character used to represent gaps. - accept_gap : bool, default=True - Whether to accept gaps in the list_windows. If False, list_windows containing gaps are rejected. - - Returns: - ------- - List[str] - A list of extracted list_windows of amino acids. - """ +def get_sliding_aa_window(seq=None, slide_start=0, slide_stop=None, window_size=5, gap='-', index1=False): + """Extracts sliding list_windows of amino acids from a sequence""" if slide_stop is None: - slide_stop = len(seq) - if not accept_gap: - slide_stop -= window_size + slide_stop = len(seq) - 1 + if index1: + slide_stop += 1 + n_windows = slide_stop - window_size - slide_start + 1 list_windows = [] - for start in range(slide_start, slide_stop + 1): - aa_window = get_aa_window(seq, pos_start=start, window_size=window_size, gap=gap, accept_gap=accept_gap) + for start in range(slide_start, slide_start + n_windows + 1): + # Do not provide index1 again (it will be otherwise two time corrected) + aa_window = get_aa_window(seq, pos_start=start, window_size=window_size, gap=gap) list_windows.append(aa_window) return list_windows diff --git a/aaanalysis/data_handling/_seq_preproc.py b/aaanalysis/data_handling/_seq_preproc.py index b0906fca..ee2e0edf 100644 --- a/aaanalysis/data_handling/_seq_preproc.py +++ b/aaanalysis/data_handling/_seq_preproc.py @@ -2,9 +2,8 @@ This is a script for the frontend of the SequencePreprocessor class, a supportive class for preprocessing protein sequences. """ -from typing import Optional, Union, List, Literal +from typing import Optional, Union, List, Literal, Tuple import numpy as np -import pandas as pd import aaanalysis.utils as ut from ._backend.seq_preproc.encode_one_hot import encode_one_hot @@ -21,11 +20,12 @@ def check_gap(gap="_"): raise ValueError(f"'gap' ('{gap}') should be a single character.") -def check_match_list_seq_alphabet(list_seq=None, alphabet=None): +# Encoding check functions +def check_match_list_seq_alphabet(list_seq=None, alphabet=None, gap="-"): """Validate if all characters in the sequences are within the given alphabet""" all_chars = set(''.join(list_seq)) - if not all_chars.issubset(set(alphabet + '_')): - invalid_chars = all_chars - set(alphabet + '_') + if not all_chars.issubset(set(alphabet + gap)): + invalid_chars = all_chars - set(alphabet + gap) raise ValueError(f"Following amino acid(s) from 'list_seq' are not in 'alphabet': {invalid_chars}") @@ -35,6 +35,84 @@ def check_match_gap_alphabet(gap="_", alphabet=None): raise ValueError(f"'gap' ('{gap}') should not be contained in the 'alphabet' ('{alphabet}')") +# Window size check functions +def adjust_positions(start=None, stop=None, index1=False): + """Adjust positions depending on indexing mode""" + if index1: + start -= 1 + if stop is not None: + stop -= 1 + return start, stop + + +def check_match_pos_start_pos_stop(pos_start=None, pos_stop=None): + """Check if start position smaller than stop position""" + if pos_stop is not None and pos_start > pos_stop: + raise ValueError(f"'pos_start' ({pos_start}) should be smaller than 'pos_stop' ({pos_stop})") + + +def check_match_pos_stop_window_size(pos_stop=None, window_size=None): + """Check if one is given""" + if pos_stop is None and window_size is None: + raise ValueError("Either 'pos_end' or 'window_size' must be specified. Both are 'None'.") + if pos_stop is not None and window_size is not None: + raise ValueError(f"Either 'pos_end' ({pos_stop}) or 'window_size' ({window_size}) must be specified." + f" Both are given.") + + +def check_match_seq_pos(seq=None, pos_start=None, pos_stop=None): + """Check if pos_start matches length of sequence""" + seq_len = len(seq) + if pos_start >= seq_len: + raise ValueError(f"'pos_start' ({pos_start}) must be smaller than the sequence length ({seq_len})") + if pos_stop is not None and pos_stop >= seq_len: + raise ValueError(f"'pos_stop' ({pos_stop}) must be smaller than the sequence length ({seq_len})") + + +def check_match_seq_pos_start_window_size(seq=None, pos_start=None, window_size=None): + """Check if start position and window size do not extend the sequence length""" + if window_size is not None: + seq_len = len(seq) + pos_stop = pos_start + window_size + if pos_stop > seq_len: + raise ValueError(f"'pos_start' ({pos_start}) + 'window_size' ({window_size}) should be >= " + f"the sequence length ({seq_len})") + + +# Sliding window check functions +def check_match_slide_start_slide_stop(slide_start=None, slide_stop=None): + """Check if start sliding position smaller than stop position""" + if slide_stop is not None and slide_start > slide_stop: + raise ValueError(f"'slide_start' ({slide_start}) should be smaller than 'slide_stop' ({slide_stop})") + + +def check_match_slide_start_slide_stop_window_size(slide_start=None, slide_stop=None, window_size=None): + """Check if one is given""" + if slide_stop is not None: + min_window_size = slide_stop - slide_stop + if window_size < min_window_size: + raise ValueError(f"'window_size' ('{window_size}') should be smaller then the distance ({min_window_size})" + f" between 'slide_start' ('{slide_start}') and 'slide_stop' ({slide_stop}).") + + +def check_match_seq_slide(seq=None, slide_start=None, slide_stop=None): + """Check if slide_start matches length of sequence""" + seq_len = len(seq) + if slide_start >= seq_len: + raise ValueError(f"'slide_start' ({slide_start}) must be smaller than the sequence length ({seq_len})") + if slide_stop is not None and slide_stop >= seq_len: + raise ValueError(f"'slide_stop' ({slide_stop}) must be smaller than the sequence length ({seq_len})") + + +def check_match_seq_slide_start_window_size(seq=None, slide_start=None, window_size=None): + """Check if start position and window size do not extend the sequence length""" + seq_len = len(seq) + slide_stop = slide_start + window_size + if slide_stop > seq_len: + raise ValueError(f"'slide_start' ({slide_start}) + 'window_size' ({window_size}) should be >= " + f"the sequence length ({seq_len})") + + # II Main Functions # TODO finish SequencePreprocessor, test, docu # TODO manage aaanalysis[pro] (add info/warning in docu for every function/module whose dependencies are not installed) @@ -44,16 +122,20 @@ class SequencePreprocessor: This class provides methods for preprocessing protein sequences, including encoding and window extraction. """ + # Sequence encoding @staticmethod def encode_one_hot(list_seq: Union[List[str], str] = None, alphabet: str = "ARNDCEQGHILKMFPSTWYV", - gap: str = "_", + gap: str = "-", pad_at: Literal["C", "N"] = "C", - ) -> np.ndarray: + ) -> Tuple[np.ndarray, List[str]]: """ One-hot-encode a list of protein sequences into a feature matrix. - Padding of shorter sequences with gaps represented as zero vectors. + Each residue is represented by a binary vector of length equal to the alphabet size. + For each sequence position, the amino acid is set to 1 in its corresponding position in the vector, + while all other positions are set to 0. Gaps are represented by zero vectors. Shorter sequences are + padded with gaps either N- or C-terminally. Parameters ---------- @@ -61,17 +143,24 @@ def encode_one_hot(list_seq: Union[List[str], str] = None, List of protein sequences to encode. alphabet : str, default='ARNDCEQGHILKMFPSTWYV' The alphabet of amino acids used for encoding. - gap : str, default='_' + gap : str, default='-' The character used to represent gaps in sequences. pad_at : str, default='C' Specifies where to add the padding: - 'N' for N-terminus (beginning of the sequence), - 'C' for C-terminus (end of the sequence). + + - 'N' for N-terminus (beginning of the sequence), + - 'C' for C-terminus (end of the sequence). Returns ------- - np.ndarray - A numpy array where each row represents an encoded sequence. + X: array-like, shape (n_samples, n_residues*n_characters) + Feature matrix containing one-hot encoded position-wise representation of residues. + features : list of str + List of feature names corresponding to each position and amino acid in the encoded matrix. + + Examples + -------- + .. include:: examples/sp_encode_one_hot.rst """ # Check input list_seq = ut.check_list_like(name="list_seq", val=list_seq, @@ -81,66 +170,133 @@ def encode_one_hot(list_seq: Union[List[str], str] = None, check_gap(gap=gap) ut.check_str_options(name="pad_at", val=pad_at, list_str_options=["N", "C"]) check_match_gap_alphabet(gap=gap, alphabet=alphabet) - check_match_list_seq_alphabet(list_seq=list_seq, alphabet=alphabet) + check_match_list_seq_alphabet(list_seq=list_seq, alphabet=alphabet, gap=gap) # Create encoding - feature_matrix = encode_one_hot(list_seq=list_seq, alphabet=alphabet, gap=gap, pad_at=pad_at) - return feature_matrix + X, features = encode_one_hot(list_seq=list_seq, alphabet=alphabet, gap=gap, pad_at=pad_at) + return X, features @staticmethod - def encode_integer(list_seq: List[str], alphabet: str = "ARNDCEQGHILKMFPSTWYV", gap: str = "_", pad_at: Literal["C", "N"] = "C") -> np.ndarray: + def encode_integer(list_seq: Union[List[str], str] = None, + alphabet: str = "ARNDCEQGHILKMFPSTWYV", + gap: str = "-", + pad_at: Literal["C", "N"] = "C", + ) -> Tuple[np.ndarray, List[str]]: """ - Integer encodes a list of protein sequences into a feature matrix. + Integer-encode a list of protein sequences into a feature matrix. + + Each amino acid is represented by an integer between 1 and n, where n is the number of characters. + Gaps are represented by 0. Shorter sequences are padded with gaps either N- or C-terminally. Parameters ---------- - list_seq : List[str] + list_seq : list of str or str List of protein sequences to encode. alphabet : str, default='ARNDCEQGHILKMFPSTWYV' The alphabet of amino acids used for encoding. - gap : str, default='_' + gap : str, default='-' The character used to represent gaps in sequences. - pad_at : Literal['C', 'N'], default='C' - Specifies where to add the padding. + pad_at : str, default='C' + Specifies where to add the padding: + + - 'N' for N-terminus (beginning of the sequence), + - 'C' for C-terminus (end of the sequence). Returns ------- - np.ndarray - A numpy array where each row represents an encoded sequence. + X: array-like, shape (n_samples, n_residues) + Feature matrix containing one-hot encoded position-wise representation of residues. + features : list of str + List of feature names corresponding to each position in the encoded matrix. + + Examples + -------- + .. include:: examples/sp_encode_integer.rst """ - return encode_integer(list_seq, alphabet, gap, pad_at) + # Check input + list_seq = ut.check_list_like(name="list_seq", val=list_seq, + check_all_str_or_convertible=True, + accept_none=False, accept_str=True) + ut.check_str(name="alphabet", val=alphabet, accept_none=False) + check_gap(gap=gap) + ut.check_str_options(name="pad_at", val=pad_at, list_str_options=["N", "C"]) + check_match_gap_alphabet(gap=gap, alphabet=alphabet) + check_match_list_seq_alphabet(list_seq=list_seq, alphabet=alphabet, gap=gap) + # Create encoding + X, features = encode_integer(list_seq=list_seq, alphabet=alphabet, gap=gap, pad_at=pad_at) + return X, features @staticmethod - def get_aa_window(seq: str, pos_start: int, pos_stop: int = None, window_size: int = None, gap: str = '-', accept_gap: bool = True) -> str: + def get_aa_window(seq: str, + pos_start: int, + pos_stop: Optional[int] = None, + window_size: Optional[int] = None, + index1: bool = False, + gap: str = '-', + accept_gap: bool = True, + ) -> str: """ Extracts a window of amino acids from a sequence. + This window starts from a given start position (``pos_start``, starting from 1) + and stops either at a defined stop position (``pos_stop``) or after a number of + residues defined by ``window_size``. + Parameters ---------- seq : str The protein sequence from which to extract the window. pos_start : int - The starting position of the window (1-based index). + The starting position (>=0) of the window. pos_stop : int, optional - The ending position of the window (1-based index). If None, window_size is used. + The ending position (>=``pos_start``) of the window. If ``None``, ``window_size`` is used. window_size : int, optional - The size of the window to extract. Only used if pos_end is None. + The size of the window (>=1) to extract. Only used if ``pos_end`` is ``None``. + index1 : bool, default=False + Whether position index starts at 1 (if ``True``) or 0 (if ``False``), + where first amino acid is at position 1 or 0, respectively. gap : str, default='-' The character used to represent gaps. accept_gap : bool, default=True - Whether to accept gaps in the window. + Whether to accept gaps in the window. If ``True``, C-terminally padding is enabled. Returns ------- - str + window : str The extracted window of amino acids. + + Examples + -------- + .. include:: examples/sp_get_aa_window.rst """ - return get_aa_window(seq, pos_start, pos_stop, window_size, gap, accept_gap) + # Check input + ut.check_str(name="seq", val=seq, accept_none=False) + ut.check_bool(name="index1", val=index1, accept_none=False) + min_val_pos = 1 if index1 else 0 + str_add = f"If 'index1' is '{index1}'." + ut.check_number_range(name="pos_start", val=pos_start, min_val=min_val_pos, + accept_none=False, just_int=True, str_add=str_add) + ut.check_number_range(name="pos_stop", val=pos_stop, min_val=min_val_pos, + accept_none=True, just_int=True, str_add=str_add) + ut.check_number_range(name="window_size", val=window_size, min_val=1, accept_none=True, just_int=True) + check_gap(gap=gap) + ut.check_bool(name="accept_gap", val=accept_gap, accept_none=False) + pos_start, pos_stop = adjust_positions(start=pos_start, stop=pos_stop, index1=index1) + check_match_pos_start_pos_stop(pos_start=pos_start, pos_stop=pos_stop) + check_match_pos_stop_window_size(pos_stop=pos_stop, window_size=window_size) + if not accept_gap: + check_match_seq_pos(seq=seq, pos_start=pos_start, pos_stop=pos_stop) + check_match_seq_pos_start_window_size(seq=seq, pos_start=pos_start, window_size=window_size) + # Get amino acid window + window = get_aa_window(seq=seq, pos_start=pos_start, pos_stop=pos_stop, + window_size=window_size, gap=gap) + return window @staticmethod def get_sliding_aa_window(seq: str = None, slide_start: int = 1, - slide_stop: int = None, - window_size: int = 10, + slide_stop: Optional[int] = None, + window_size: int = 5, + index1: bool = False, gap: str = '-', accept_gap: bool = False ) -> List[str]: @@ -152,26 +308,47 @@ def get_sliding_aa_window(seq: str = None, seq : str The protein sequence from which to extract the windows. slide_start : int, default=1 - The starting position for sliding window extraction (1-based index). + The starting position (>=0) for sliding window extraction. slide_stop : int, optional - The ending position for sliding window extraction (1-based index). If None, extract all possible windows. - window_size : int, default=10 - The size of each window to extract. + The ending position (>=1) for sliding window extraction. If ``None``, extract all possible windows. + window_size : int, default=5 + The size of each window (>=1) to extract. + index1 : bool, default=False + Whether position index starts at 1 (if ``True``) or 0 (if ``False``), + where first amino acid is at position 1 or 0, respectively. gap : str, default='-' The character used to represent gaps. - accept_gap : bool, default=False - Whether to accept gaps in the amino acid windows. + accept_gap : bool, default=True + Whether to accept gaps in the window. If ``True``, C-terminally padding is enabled. Returns ------- - List[str] + list_windows : list of str A list of extracted windows of amino acids. + Examples + -------- + .. include:: examples/sp_get_sliding_aa_window.rst """ # Check input ut.check_str(name="seq", val=seq, accept_none=False) - ut.check_number_val(name="slide_start", val=slide_start, accept_none=False, just_int=True) - ut.check_number_val(name="slide_stop", val=slide_stop, accept_none=True, just_int=True) - + ut.check_bool(name="index1", val=index1, accept_none=False) + min_val_pos = 1 if index1 else 0 + str_add = f"If 'index1' is '{index1}'." + ut.check_number_range(name="slide_start", val=slide_start, min_val=min_val_pos, + accept_none=False, just_int=True, str_add=str_add) + ut.check_number_range(name="slide_stop", val=slide_stop, min_val=min_val_pos, + accept_none=True, just_int=True, str_add=str_add) + ut.check_number_range(name="window_size", val=window_size, min_val=1, accept_none=False, just_int=True) + check_gap(gap=gap) + ut.check_bool(name="accept_gap", val=accept_gap, accept_none=False) + slide_start, slide_stop = adjust_positions(start=slide_start, stop=slide_stop, index1=index1) + check_match_slide_start_slide_stop(slide_start=slide_start, slide_stop=slide_stop) + check_match_slide_start_slide_stop_window_size(slide_start=slide_start, slide_stop=slide_stop, + window_size=window_size) + if not accept_gap: + check_match_seq_slide(seq=seq, slide_start=slide_start, slide_stop=slide_stop) + check_match_seq_slide_start_window_size(seq=seq, slide_start=slide_start, window_size=window_size) # Get sliding windows - list_windows = get_sliding_aa_window(seq, slide_start, slide_stop, window_size, gap, accept_gap) + list_windows = get_sliding_aa_window(seq=seq, slide_start=slide_start, slide_stop=slide_stop, + window_size=window_size, gap=gap) return list_windows diff --git a/aaanalysis/feature_engineering/_backend/cpp/utils_feature.py b/aaanalysis/feature_engineering/_backend/cpp/utils_feature.py index cb889fce..e1ce2caf 100644 --- a/aaanalysis/feature_engineering/_backend/cpp/utils_feature.py +++ b/aaanalysis/feature_engineering/_backend/cpp/utils_feature.py @@ -232,7 +232,7 @@ def compute_feature_matrix(features_subset): # Feature creation if n_jobs == 1: # Process in a single thread/process - feat_matrix = compute_feature_matrix(features) + feature_matrix = compute_feature_matrix(features) else: # If n_jobs is not specified, decide it dynamically based on the number of features if n_jobs is None: @@ -240,8 +240,8 @@ def compute_feature_matrix(features_subset): # Use joblib to parallelize the computation results = Parallel(n_jobs=n_jobs)( delayed(compute_feature_matrix)(features_chunk) for features_chunk in np.array_split(features, n_jobs)) - feat_matrix = np.concatenate(results, axis=1) - return feat_matrix + feature_matrix = np.concatenate(results, axis=1) + return feature_matrix def get_df_pos_(df_feat=None, col_cat="category", col_val=None, value_type="count", start=None, stop=None): diff --git a/aaanalysis/feature_engineering/_numerical_feature.py b/aaanalysis/feature_engineering/_numerical_feature.py index c7efbc2d..b1225664 100644 --- a/aaanalysis/feature_engineering/_numerical_feature.py +++ b/aaanalysis/feature_engineering/_numerical_feature.py @@ -1,7 +1,6 @@ """ This is a script for the frontend of the NumericalFeature class, a supportive class for the CPP feature engineering, including scale and feature filtering methods. - """ import time import pandas as pd @@ -14,13 +13,6 @@ # I Helper Functions -def check_value_type(value_type=None): - """Check if value type is valid""" - list_value_type = ["min", "mean", "median", "max"] - if value_type not in list_value_type: - raise ValueError(f"'value_type' ('{value_type}') should be on of following: {list_value_type}") - - def check_match_df_scales_letter_new(df_scales=None, letter_new=None): """Check if new letter not already in df_scales""" alphabet = df_scales.index.to_list() @@ -65,7 +57,7 @@ def extend_alphabet(df_scales: pd.DataFrame = None, new_letter : str The new letter to be added to the alphabet. value_type : {'min', 'mean', 'median', 'max'}, default='mean' - The type of statistic to compute for the new letter (one of 'min', 'mean', 'median', 'max'). + The type of statistic to compute for the new letter. Returns ------- @@ -80,7 +72,8 @@ def extend_alphabet(df_scales: pd.DataFrame = None, df_scales = df_scales.copy() check_df_scales(df_scales=df_scales) ut.check_str(name="letter_new", val=new_letter) - check_value_type(value_type=value_type) + ut.check_str_options(name="value_type", val=value_type, accept_none=False, + list_str_options=["min", "mean", "median", "max"]) check_match_df_scales_letter_new(df_scales=df_scales, letter_new=new_letter) # Compute the statistic for each scale if value_type == "min": diff --git a/aaanalysis/feature_engineering/_sequence_feature.py b/aaanalysis/feature_engineering/_sequence_feature.py index 508f146d..8bfa39cc 100644 --- a/aaanalysis/feature_engineering/_sequence_feature.py +++ b/aaanalysis/feature_engineering/_sequence_feature.py @@ -485,8 +485,8 @@ def feature_matrix(self, Returns ------- - feat_matrix: array-like , shape (n_samples, n_features) - Feature values for samples. + X: array-like , shape (n_samples, n_features) + Feature matrix containing feature values for samples. Notes ----- @@ -512,12 +512,12 @@ def feature_matrix(self, if self.verbose: warn_creation_of_feature_matrix(features=features, df_parts=df_parts) # Create feature matrix using parallel processing - feat_matrix = get_feature_matrix_(features=features, - df_parts=df_parts, - df_scales=df_scales, - accept_gaps=accept_gaps, - n_jobs=n_jobs) - return feat_matrix + X = get_feature_matrix_(features=features, + df_parts=df_parts, + df_scales=df_scales, + accept_gaps=accept_gaps, + n_jobs=n_jobs) + return X def get_features(self, list_parts: Optional[List[str]] = None, diff --git a/docs/source/api.rst b/docs/source/api.rst index 74057118..2c0475bf 100755 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -28,6 +28,8 @@ Data Handling load_features read_fasta to_fasta + SequencePreprocessor + comp_seq_sim filter_seq .. _feature_engineering_api: