From 33d299cdda98580c4245d89db6ff7aea3195c483 Mon Sep 17 00:00:00 2001 From: christinehc Date: Tue, 29 Mar 2022 14:01:08 -0700 Subject: [PATCH 1/2] Update kmer vector generation with BoW/generator approach Created new KmerVec and KmerSet objects as generator and iterator object-based approaches to create kmer vectors, respectively. Updated the alphabet module for compatibility approach, including dictionaries for 1:1 alphabet mappings (alphabet.FULL_ALPHABETS) and functions to get alphabet parameters using alphabet identifier inputs (alphabet.get_alphabet and alphabet.get_alphabet_keys). --- snekmer/alphabet.py | 73 ++++++++++++++++++++++++++++ snekmer/vectorize.py | 113 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 snekmer/vectorize.py diff --git a/snekmer/alphabet.py b/snekmer/alphabet.py index 1534730..0fd56d9 100644 --- a/snekmer/alphabet.py +++ b/snekmer/alphabet.py @@ -6,6 +6,9 @@ author(s): @biodataganache, @wichne """ +# imports +from typing import Mapping, Set, Union + # define standard amino acid alphabet StandardAlphabet = "AILMVFYWSTQNCHDEKRGP" AA_SELF_MAPPING = {a: a for a in StandardAlphabet} @@ -80,6 +83,17 @@ }, } +# reconfigure alphabet dict into "long-form" +FULL_ALPHABETS: dict = {a: {} for a in ALPHABETS.keys()} +for alphabet, mapping in ALPHABETS.items(): + for k, v in mapping.items(): + if k == "_keys": + continue + elif len(k) > 1: + FULL_ALPHABETS[alphabet].update({k[i]: v for i in range(len(k))}) + else: + FULL_ALPHABETS[alphabet].update({k: v}) + # create generic alphabet identifiers ALPHABET_ID = { f"RED{n}": {v: k for k, v in ALPHABETS[ALPHABET_ORDER[n]].items()} @@ -140,6 +154,65 @@ def check_valid(alphabet): return +def get_alphabet(alphabet: Union[str, int], mapping: dict = ALPHABETS) -> dict: + """Short summary. + + Parameters + ---------- + alphabet : Union[str, int] + Alphabet name (as str) or alphabet id (as int). 
+ Must be one of the following: + 0: "hydro", + 1: "standard", + 2: "solvacc", + 3: "hydrocharge", + 4: "hydrostruct", + 5: "miqs" + mapping : dict + All alphabet maps (the default is ALPHABETS). + + Returns + ------- + dict + Dictionary map of amino acids to alphabet character. + + Raises + ------ + ValueError + Raised if alphabet not in pre-defined list. + + """ + check_valid(alphabet) + + if isinstance(alphabet, int): + alphabet = ALPHABET_ORDER[alphabet] + return mapping[alphabet] + + +def get_alphabet_keys( + alphabet: Union[str, int], mapping: dict = FULL_ALPHABETS +) -> Set[str]: + """Retrieve keys for specified alphabet. + + Parameters + ---------- + alphabet : Union[str, int] + Description of parameter `alphabet`. + mapping : Mapping[dict] + Description of parameter `mapping` (the default is FULL_ALPHABETS). + + Returns + ------- + dict + Description of returned object. + + """ + alphabet_map = get_alphabet(alphabet, mapping) + if "_keys" in alphabet_map.keys(): + alphabet_map.pop("_keys") + return set(alphabet_map.values()) + + # def add_alphabet(alphabet_name, mapping): # return diff --git a/snekmer/vectorize.py b/snekmer/vectorize.py new file mode 100644 index 0000000..0655f2e --- /dev/null +++ b/snekmer/vectorize.py @@ -0,0 +1,113 @@ +"""vectorize: Create kmer vectors. 
+author: @christinehc + +""" +import itertools +import numpy as np +from collections import Counter +from snekmer.alphabets import ALPHABET, FULL_ALPHABETS, get_alphabet_keys, get_alphabet +from typing import Union, Set + +# generate all possible kmer combinations +def _generate(alphabet: Union[str, int], k: int): + for c in itertools.product(alphabet, repeat=k): + yield "".join(c) + + +# iterator object for kmer basis set given alphabet and k +class KmerSet: + def __init__(self, alphabet: Union[str, int], k: int): + self.alphabet = alphabet + self.k = k + self._kmerlist = list(_generate(get_alphabet_keys(alphabet), k)) + + @property + def kmers(self): + return iter(self._kmerlist) + + +# manually reduce alphabet +def reduce( + string: str, alphabet: Union[str, int], mapping: dict = FULL_ALPHABETS +) -> str: + """Short summary. + + Parameters + ---------- + string : str + Description of parameter `string`. + alphabet : Union[str, int] + Description of parameter `alphabet`. + mapping : dict + Description of parameter `mapping` (the default is FULL_ALPHABETS). + + Returns + ------- + str + Description of returned object. + + Raises + ------ + ExceptionName + Why the exception is raised. 
+ + """ + alphabet_map: dict = get_alphabet(alphabet, mapping=mapping) + return string.translate(string.maketrans(alphabet_map)) + + +class KmerVec: + def __init__(self, alphabet: Union[str, int], k: int): + self.alphabet = alphabet + self.k = k + self.kmer_gen = None + self.char_set = get_alphabet_keys(alphabet) + self.vector = None + self.kmer_set = KmerSet(alphabet, k) + + # iteratively get all kmers in a string + def _kmer_gen_str_limit(self, sequence): + """Generator object for segment in string format""" + i = 0 + n = len(sequence) - self.k + 1 + + # iterate character-by-character + while i < n: + kmer = sequence[i : i + self.k] + if set(kmer) <= self.char_set: + yield kmer + i += 1 + + # not used + @staticmethod + def _kmer_gen_str(sequence, k): + """Generator object for segment in string format""" + for n in range(0, len(sequence) - k + 1): + yield sequence[n : n + k] + + # apply alphabet reduction + @staticmethod + def _reduce(sequence: str, alphabet_map: dict) -> str: + return sequence.translate(sequence.maketrans(alphabet_map)) + + # generate kmer vectors with bag-of-words approach + def vectorize(self, sequence: str) -> np.ndarray: + # self.char_set = set(ALPHABETS[alphabet]["_keys"]) + N = len(self.char_set) ** self.k + # all_kmers = self._generate("".join(self.char_set), k) + # self.kmers = list(self._generate(list(self.char_set), k)) + + alphabet_map = get_alphabet(self.alphabet, mapping=FULL_ALPHABETS) + sequence = self._reduce(sequence, alphabet_map=alphabet_map) + kmers = list(self._kmer_gen_str_limit(sequence)) + kmer2count = Counter(kmers) + + # Convert to vector of counts + vector = np.zeros(N) + for i, word in enumerate(self.kmer_set.kmers): + vector[i] += kmer2count[word] + + # Convert to frequencies + # vector /= sum(kmer2count.values()) + + return vector From 8c873471123ab3eb52c617af866f30a42f627538 Mon Sep 17 00:00:00 2001 From: christinehc Date: Fri, 1 Apr 2022 12:56:47 -0700 Subject: [PATCH 2/2] Label AUCROC scores correctly (fix 
erroneous PRAUC labels) --- snekmer/rules/model.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snekmer/rules/model.smk b/snekmer/rules/model.smk index 0decf53..469eb6d 100644 --- a/snekmer/rules/model.smk +++ b/snekmer/rules/model.smk @@ -379,7 +379,7 @@ rule model: results["family"] += [family] * cv results["alphabet_name"] += [alphabet_name.lower()] * cv results["k"] += [config["k"]] * cv - results["scoring"] += ["pr_auc"] * cv + results["scoring"] += ["auc_roc"] * cv results["score"] += auc_rocs results["cv_split"] += [i + 1 for i in range(cv)]