Skip to content

Commit

Permalink
Merge pull request #45 from PNNL-CompBio/v1.0.0
Browse files Browse the repository at this point in the history
Fix bug which mislabeled AUCROC scores as PRAUC
  • Loading branch information
christinehc authored Apr 1, 2022
2 parents d815755 + 8c87347 commit deb4b98
Show file tree
Hide file tree
Showing 3 changed files with 187 additions and 1 deletion.
73 changes: 73 additions & 0 deletions snekmer/alphabet.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
author(s): @biodataganache, @wichne
"""
# imports
from typing import Mapping, Set, Union

# define standard amino acid alphabet
StandardAlphabet = "AILMVFYWSTQNCHDEKRGP"
AA_SELF_MAPPING = {a: a for a in StandardAlphabet}
Expand Down Expand Up @@ -80,6 +83,17 @@
},
}

# Build the "long-form" alphabets: every multi-character key of an alphabet
# mapping is split so that each of its characters maps to the same value.
FULL_ALPHABETS: dict = {name: {} for name in ALPHABETS}
for alphabet, mapping in ALPHABETS.items():
    for k, v in mapping.items():
        if k == "_keys":
            # "_keys" entries are not copied into the long-form mapping
            continue
        # keys longer than one character are expanded character by character;
        # any other key is copied through unchanged
        FULL_ALPHABETS[alphabet].update(
            dict.fromkeys(k, v) if len(k) > 1 else {k: v}
        )

# create generic alphabet identifiers
ALPHABET_ID = {
f"RED{n}": {v: k for k, v in ALPHABETS[ALPHABET_ORDER[n]].items()}
Expand Down Expand Up @@ -140,6 +154,65 @@ def check_valid(alphabet):
return


def get_alphabet(alphabet: Union[str, int], mapping: dict = ALPHABETS) -> dict:
    """Look up the mapping for one alphabet by name or by integer id.

    Parameters
    ----------
    alphabet : Union[str, int]
        Alphabet name (as str) or alphabet id (as int).
        Must be one of the following:
            0: "hydro",
            1: "standard",
            2: "solvacc",
            3: "hydrocharge",
            4: "hydrostruct",
            5: "miqs"
    mapping : dict
        All alphabet maps, keyed by alphabet name
        (the default is ALPHABETS).

    Returns
    -------
    dict
        The entry of `mapping` stored under the resolved alphabet name.

    Raises
    ------
    ValueError
        Raised if alphabet not in pre-defined list (validation is
        delegated to `check_valid`).

    """
    check_valid(alphabet)

    # integer ids are translated to their name before the lookup
    name = ALPHABET_ORDER[alphabet] if isinstance(alphabet, int) else alphabet
    return mapping[name]


def get_alphabet_keys(
    alphabet: Union[str, int], mapping: dict = FULL_ALPHABETS
) -> Set[str]:
    """Retrieve the set of characters that make up the specified alphabet.

    Despite the name, the result is built from the *values* of the
    alphabet map, i.e. the distinct characters that the map's keys
    are translated to.

    Parameters
    ----------
    alphabet : Union[str, int]
        Alphabet name (as str) or alphabet id (as int); resolved
        via `get_alphabet`.
    mapping : dict
        All alphabet maps, keyed by alphabet name
        (the default is FULL_ALPHABETS).

    Returns
    -------
    Set[str]
        Distinct values of the selected alphabet map, ignoring any
        "_keys" entry.

    """
    alphabet_map = get_alphabet(alphabet, mapping)

    # Skip the "_keys" entry instead of popping it: the returned map is the
    # very object stored in `mapping`, so popping would permanently alter
    # the caller's (possibly module-level) alphabet definitions.
    return {value for key, value in alphabet_map.items() if key != "_keys"}


# def add_alphabet(alphabet_name, mapping):
# return

Expand Down
2 changes: 1 addition & 1 deletion snekmer/rules/model.smk
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ rule model:
results["family"] += [family] * cv
results["alphabet_name"] += [alphabet_name.lower()] * cv
results["k"] += [config["k"]] * cv
results["scoring"] += ["pr_auc"] * cv
results["scoring"] += ["auc_roc"] * cv
results["score"] += auc_rocs
results["cv_split"] += [i + 1 for i in range(cv)]

Expand Down
113 changes: 113 additions & 0 deletions snekmer/vectorize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""vectorize: Create kmer vectors.
author: @christinehc
"""
import itertools
import numpy as np
from collections import Counter
from snekmer.alphabets import ALPHABET, FULL_ALPHABETS, get_alphabet_keys, get_alphabet
from typing import Union, Set

# generate all possible kmer combinations
def _generate(alphabet: Union[str, int], k: int):
for c in itertools.product(alphabet, repeat=k):
yield "".join(c)


# iterator object for kmer basis set given alphabet and k
class KmerSet:
    """Basis set of all kmers of length `k` over a given alphabet.

    Parameters
    ----------
    alphabet : Union[str, int]
        Alphabet name or id, as accepted by `get_alphabet_keys`.
    k : int
        Kmer length.

    """

    def __init__(self, alphabet: Union[str, int], k: int):
        self.alphabet = alphabet
        self.k = k
        # `get_alphabet_keys` returns a set, and the iteration order of a set
        # of strings can change between interpreter runs (string hash
        # randomization). Sorting pins the basis order so that position i
        # refers to the same kmer in every process.
        self._kmerlist = list(_generate(sorted(get_alphabet_keys(alphabet)), k))

    @property
    def kmers(self):
        """Return a fresh iterator over the kmer basis, in basis order."""
        return iter(self._kmerlist)


# manually reduce alphabet
def reduce(
    string: str, alphabet: Union[str, int], mapping: dict = FULL_ALPHABETS
) -> str:
    """Translate a string character-by-character using an alphabet map.

    Parameters
    ----------
    string : str
        Input string to translate.
    alphabet : Union[str, int]
        Alphabet name or id, resolved via `get_alphabet`.
    mapping : dict
        All alphabet maps, keyed by alphabet name
        (the default is FULL_ALPHABETS).

    Returns
    -------
    str
        Copy of `string` in which every character that is a key of the
        selected alphabet map is replaced by its mapped value; all other
        characters are left as they are.

    Raises
    ------
    ValueError
        Raised by ``str.maketrans`` if the selected map contains a
        string key that is not exactly one character long.

    """
    translation_table = str.maketrans(get_alphabet(alphabet, mapping=mapping))
    return string.translate(translation_table)


class KmerVec:
    """Kmer count vectorizer (bag-of-words approach) for one alphabet and k.

    Parameters
    ----------
    alphabet : Union[str, int]
        Alphabet name or id, as accepted by `get_alphabet_keys`.
    k : int
        Kmer length.

    """

    def __init__(self, alphabet: Union[str, int], k: int):
        self.alphabet = alphabet
        self.k = k
        # initialized empty; not assigned anywhere else in this class
        self.kmer_gen = None
        # characters a kmer may contain in order to be counted
        self.char_set = get_alphabet_keys(alphabet)
        # initialized empty; not assigned anywhere else in this class
        self.vector = None
        # basis whose iteration order defines the vector positions
        self.kmer_set = KmerSet(alphabet, k)

    def _kmer_gen_str_limit(self, sequence):
        """Yield each length-k window of `sequence` made only of allowed chars.

        Windows containing any character outside `self.char_set` are skipped.
        """
        size = self.k
        allowed = self.char_set
        for start in range(len(sequence) - size + 1):
            window = sequence[start : start + size]
            if set(window) <= allowed:
                yield window

    # not used
    @staticmethod
    def _kmer_gen_str(sequence, k):
        """Yield every length-`k` window of `sequence`, left to right."""
        yield from (
            sequence[start : start + k] for start in range(len(sequence) - k + 1)
        )

    @staticmethod
    def _reduce(sequence: str, alphabet_map: dict) -> str:
        """Return `sequence` translated character-wise through `alphabet_map`."""
        table = str.maketrans(alphabet_map)
        return sequence.translate(table)

    def vectorize(self, sequence: str) -> np.ndarray:
        """Count the kmers of a sequence after alphabet reduction.

        Parameters
        ----------
        sequence : str
            Input sequence; it is first translated with the long-form
            map of `self.alphabet`.

        Returns
        -------
        np.ndarray
            Array of length ``len(self.char_set) ** self.k`` whose i-th
            entry is the number of occurrences of the i-th kmer yielded
            by `self.kmer_set.kmers`. Raw counts are returned; no
            normalization to frequencies is applied.

        """
        n_features = len(self.char_set) ** self.k

        reduced = self._reduce(
            sequence,
            alphabet_map=get_alphabet(self.alphabet, mapping=FULL_ALPHABETS),
        )
        counts = Counter(self._kmer_gen_str_limit(reduced))

        # fill in counts following the basis order
        vector = np.zeros(n_features)
        for position, kmer in enumerate(self.kmer_set.kmers):
            vector[position] += counts[kmer]

        return vector

0 comments on commit deb4b98

Please sign in to comment.