Skip to content

Commit

Permalink
Add first version of SequencePreprocessor (not tested and no examples)
Browse files Browse the repository at this point in the history
  • Loading branch information
breimanntools committed Jun 28, 2024
1 parent ca9e339 commit d729bdf
Show file tree
Hide file tree
Showing 10 changed files with 266 additions and 191 deletions.
1 change: 0 additions & 1 deletion aaanalysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
"read_fasta",
"to_fasta",
# "comp_seq_sim", BioPython
# "comp_pw_seq_sim", BioPython
# "filter_seq", BioPython
"SequencePreprocessor",
"AAclust",
Expand Down
35 changes: 6 additions & 29 deletions aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This is a script for ...
This is a script for the backend of the SequenceProcessor().encode_integer() method.
"""
import time
import pandas as pd
Expand All @@ -11,48 +11,25 @@


# II Main Functions
def encode_integer(list_seq: List[str] = None,
alphabet: str = "ARNDCEQGHILKMFPSTWYV",
gap: str = "_",
pad_at: Literal["C", "N"] = "C",
) -> np.array:
def encode_integer(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"):
"""
Integer-encode a list of protein sequences into a feature matrix, padding shorter sequences
with gaps represented as zero vectors.
Parameters:
----------
list_seq : List of str
List of protein sequences to encode.
alphabet : str, default='ARNDCEQGHILKMFPSTWYV'
The alphabet of amino acids used for encoding. The gap character is not part of the alphabet.
gap : str, default='_'
The character used to represent gaps in sequences.
pad_at : Literal['N', 'C'], default='C'
Specifies where to add the padding:
'N' for N-terminus (beginning of the sequence),
'C' for C-terminus (end of the sequence).
Returns:
-------
np.array
A numpy array where each row represents an encoded sequence, and each column represents a feature.
"""
# Map amino acids to integers
aa_to_int = {aa: idx + 1 for idx, aa in enumerate(alphabet)}
aa_to_int[gap] = 0

# Pad sequences
padded_sequences = pad_sequences(list_seq, pad_at=pad_at, gap=gap)

# Create integer encoding
# Create feature names
max_length = len(padded_sequences[0])
list_features = [f"P{i}" for i in range(1, max_length+1)]
# Create integer encoding
feature_matrix = np.zeros((len(padded_sequences), max_length), dtype=int)
for idx, seq in enumerate(padded_sequences):
encoded_seq = [aa_to_int[aa] for aa in seq]
feature_matrix[idx, :] = encoded_seq

return feature_matrix
return feature_matrix, list_features


23 changes: 9 additions & 14 deletions aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This is a script for creating one-hot-encoding of sequences used as baseline representation.
This is a script for the backend of the SequenceProcessor().encode_one_hot() method.
"""
import pandas as pd
from typing import Optional, Dict, Union, List, Tuple, Type, Literal
Expand All @@ -14,35 +14,30 @@ def _one_hot_encode(amino_acid=None, alphabet=None, gap="_"):
Encodes a single amino acid into a one-hot vector based on a specified alphabet.
Returns a zero vector for gaps represented as '_'.
"""
index_dict = {aa: i for i, aa in enumerate(alphabet)}
dict_aa_index = {aa: i for i, aa in enumerate(alphabet)}
vector = np.zeros(len(alphabet), dtype=int)
if amino_acid != gap:
if amino_acid in index_dict:
vector[index_dict[amino_acid]] = 1
else:
raise ValueError(f"Unrecognized amino acid '{amino_acid}' not in alphabet.")
vector[dict_aa_index[amino_acid]] = 1
return vector


# II Main Functions
# TODO finish, docu, test, example ..
def encode_one_hot(list_seq: List[str] = None,
alphabet: str = "ARNDCEQGHILKMFPSTWYV",
gap: str = "_",
pad_at: Literal["C", "N"] = "C",
) -> np.array:
def encode_one_hot(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"):
"""
One-hot-encode a list of protein sequences into a feature matrix with padding shorter sequences
with gaps represented as zero vectors.
"""
# Pad sequences
padded_sequences = pad_sequences(list_seq, pad_at=pad_at, gap=gap)
# Create one-hot-encoding
# Create feature names
max_length = len(padded_sequences[0])
list_features = [f"{i}{aa}" for i in range(1, max_length+1) for aa in alphabet]
# Create one-hot-encoding
num_amino_acids = len(alphabet)
feature_matrix = np.zeros((len(padded_sequences), max_length * num_amino_acids), dtype=int)
args = dict(alphabet=alphabet, gap=gap)
for idx, seq in enumerate(padded_sequences):
encoded_seq = [_one_hot_encode(amino_acid=aa, **args) for aa in seq]
feature_matrix[idx, :] = np.array(encoded_seq).flatten()
return feature_matrix
return feature_matrix, list_features

46 changes: 3 additions & 43 deletions aaanalysis/data_handling/_backend/seq_preproc/get_aa_window.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This is a script for ...
This is a script for the backend of the SequenceProcessor().get_aa_window() method.
"""
import time
import pandas as pd
Expand All @@ -10,55 +10,15 @@


# II Main Functions
def get_aa_window(seq: str, pos_start: int, pos_stop: int = None, window_size: int = None, gap: str = '-', accept_gap: bool = True) -> str:
"""
Extracts a window of amino acids from a sequence, padding with gaps if necessary.
Parameters:
----------
seq : str
The protein sequence from which to extract the window.
pos_start : int
The starting position of the window (1-based index).
pos_end : int, optional
The ending position of the window (1-based index). If None, window_size is used.
window_size : int, optional
The size of the window to extract. Only used if pos_end is None.
gap : str, default='-'
The character used to represent gaps.
accept_gap : bool, default=True
Whether to accept gaps in the window. If False, windows containing gaps are rejected.
Returns:
-------
str
The extracted window of amino acids, padded with gaps if necessary.
Raises:
------
ValueError:
If both pos_end and window_size are None, or if the window contains gaps and accept_gap is False.
"""
if pos_stop is None and window_size is None:
raise ValueError("Either pos_end or window_size must be specified.")

def get_aa_window(seq=None, pos_start=None, pos_stop=None, window_size=None, gap='-'):
"""Extracts a window of amino acids from a sequence, padding with gaps if necessary."""
if pos_stop is None:
pos_stop = pos_start + window_size - 1

# Convert 1-based positions to 0-based indices
pos_start -= 1
pos_stop -= 1

# Calculate the necessary padding if pos_end exceeds sequence length
seq_length = len(seq)
if pos_stop >= seq_length:
seq += gap * (pos_stop - seq_length + 1)

# Extract the window
window = seq[pos_start:pos_stop + 1]

if not accept_gap and gap in window:
raise ValueError("The window contains gaps and accept_gap is set to False.")

return window

Original file line number Diff line number Diff line change
@@ -1,53 +1,25 @@
"""
This is a script for ...
This is a script for the backend of the SequenceProcessor().get_sliding_aa_window() method.
"""
import pandas as pd

from .get_aa_window import get_aa_window

# Settings
pd.set_option('expand_frame_repr', False) # Single line print for pd.Dataframe


# I Helper Functions


# II Main Functions
def get_sliding_aa_window(seq: str,
slide_start: int,
slide_stop: int = None,
window_size: int = 5,
gap: str = '-',
accept_gap: bool = True):
"""
Extracts sliding list_windows of amino acids from a sequence.
Parameters:
----------
seq : str
The protein sequence from which to extract the list_windows.
slide_start : int
The starting position for sliding window extraction (1-based index).
slide_end : int, optional
The ending position for sliding window extraction (1-based index). If None, extract all possible list_windows.
window_size : int, default=5
The size of each window to extract.
gap : str, default='-'
The character used to represent gaps.
accept_gap : bool, default=True
Whether to accept gaps in the list_windows. If False, list_windows containing gaps are rejected.
Returns:
-------
List[str]
A list of extracted list_windows of amino acids.
"""
def get_sliding_aa_window(seq=None, slide_start=0, slide_stop=None, window_size=5, gap='-', index1=False):
"""Extracts sliding list_windows of amino acids from a sequence"""
if slide_stop is None:
slide_stop = len(seq)
if not accept_gap:
slide_stop -= window_size
slide_stop = len(seq) - 1
if index1:
slide_stop += 1
n_windows = slide_stop - window_size - slide_start + 1
list_windows = []
for start in range(slide_start, slide_stop + 1):
aa_window = get_aa_window(seq, pos_start=start, window_size=window_size, gap=gap, accept_gap=accept_gap)
for start in range(slide_start, slide_start + n_windows + 1):
# Do not provide index1 again (it will be otherwise two time corrected)
aa_window = get_aa_window(seq, pos_start=start, window_size=window_size, gap=gap)
list_windows.append(aa_window)
return list_windows
Loading

0 comments on commit d729bdf

Please sign in to comment.