Merge pull request #48 from PNNL-CompBio/v1.0.0
[v0.1.2] Update `cluster` module. Addresses #46
christinehc authored Apr 7, 2022
2 parents bc760a4 + 17ac450 commit 92b4b89
Showing 7 changed files with 52 additions and 39 deletions.
2 changes: 1 addition & 1 deletion resources/tutorial/snekmer_demo.ipynb
@@ -2039,7 +2039,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.10.2"
   }
  },
  "nbformat": 4,
2 changes: 1 addition & 1 deletion snekmer/__init__.py
@@ -10,4 +10,4 @@

 # from . import walk

-__version__ = "0.1.1"
+__version__ = "0.1.2"
37 changes: 19 additions & 18 deletions snekmer/rules/cluster.smk
@@ -106,29 +106,29 @@ use rule preprocess from process_input with:
         data=join(out_dir, "processed", "full", "{nb}.json"),
         desc=join(out_dir, "processed", "full", "{nb}_description.csv"),
     log:
-        join(out_dir, "processed", "log", "{nb}.log"),
+        join(out_dir, "processed", "log", "{nb}.log")


 # generate kmer features space from user params
-use rule generate from kmerize with:
-    input:
-        params=join(out_dir, "processed", "full", "{nb}.json"),
-    output:
-        labels=join(out_dir, "labels", "full", "{nb}.txt"),
-    log:
-        join(out_dir, "labels", "log", "{nb}.log"),
+# use rule generate from kmerize with:
+#     input:
+#         params=join(out_dir, "processed", "full", "{nb}.json"),
+#     output:
+#         labels=join(out_dir, "labels", "full", "{nb}.txt"),
+#     log:
+#         join(out_dir, "labels", "log", "{nb}.log")


 # build kmer count vectors for each basis set
 use rule vectorize_full from kmerize with:
     input:
-        kmers=join(out_dir, "labels", "full", "{nb}.txt"),
         params=join(out_dir, "processed", "{nb}.json"),
-        fasta=lambda wildcards: join("input", f"{wildcards.nb}.{FA_MAP[wildcards.nb]}"),
+        fasta=lambda wildcards: join("input", f"{wildcards.nb}.{FA_MAP[wildcards.nb]}")
     log:
-        join(out_dir, "features", "log", "{nb}.log"),
+        join(out_dir, "features", "log", "{nb}.log")
     output:
         file=join(out_dir, "features", "full", "{nb}.json.gz"),
+        kmers=join(out_dir, "labels", "full", "{nb}.txt")


 # [in-progress] kmer walk
@@ -140,8 +140,7 @@ use rule vectorize_full from kmerize with:
 # UNSUPERVISED WORKFLOW
 rule cluster:
     input:
-        # kmers=join(out_dir, "labels", "{nb}.txt"),
-        files=expand(join(out_dir, "features", "full", "{fa}.json.gz"), fa=NON_BGS),
+        files=expand(join(out_dir, "features", "full", "{fa}.json.gz"), fa=NON_BGS)
     output:
         model=join(out_dir, "cluster", "{nb}.pkl"),
         figs=directory(join(out_dir, "cluster", "figures", "{nb}")),
@@ -167,9 +166,12 @@ rule cluster:

         # define feature matrix of kmer vectors not from background set
         bg, non_bg = data[data["background"]], data[~data["background"]]
-        full_feature_matrix = skm.score.to_feature_matrix(data["vector"].values)
-        feature_matrix = skm.score.to_feature_matrix(non_bg["vector"].values)
-        bg_feature_matrix = skm.score.to_feature_matrix(bg["vector"].values)
+        full_feature_matrix = skm.utils.to_feature_matrix(data["vector"].values)
+        feature_matrix = skm.utils.to_feature_matrix(non_bg["vector"].values)
+
+        # currently not used
+        if len(bg) > 0:
+            bg_feature_matrix = skm.utils.to_feature_matrix(bg["vector"].values)

         # fit and save clustering model
         model = skm.cluster.KmerClustering(
@@ -178,7 +180,6 @@
         model.fit(full_feature_matrix)
         with open(output.model, "wb") as f:
             pickle.dump(model, f)
-        # fit_predict where?

         # log time to compute clusters
         skm.utils.log_runtime(log[0], start_time, step="clustering")
@@ -190,7 +191,7 @@ rule cluster:
         fig.savefig(join(output.figs, "pca_explained_variance_curve.png"))
         plt.close("all")

-        fig, ax = skm.plot.get_tsne_clusters(full_feature_matrix, model.labels_)
+        fig, ax = skm.plot.get_tsne_clusters(full_feature_matrix, model.model.labels_)
         fig.savefig(join(output.figs, "tsne_clusters.png"))
         plt.close("all")
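
The substantive changes in this rule: to_feature_matrix now lives in skm.utils (formerly skm.score), the background feature matrix is only built when background sequences exist, and the t-SNE plot reads labels from the wrapped estimator (model.model.labels_). A minimal sketch of the new guard, using a toy DataFrame and a stand-in for skm.utils.to_feature_matrix (the toy data and the stand-in are illustrative, not Snekmer's API):

import numpy as np
import pandas as pd

def to_feature_matrix(array):
    # stand-in mirroring the updated skm.utils.to_feature_matrix (see utils.py below)
    return np.asarray([np.array(a) for a in array])

# toy kmer count vectors; "background" flags rows in the background set
data = pd.DataFrame({
    "vector": [[1, 0, 2], [0, 1, 1], [2, 2, 0]],
    "background": [False, False, False],
})

bg, non_bg = data[data["background"]], data[~data["background"]]
full_feature_matrix = to_feature_matrix(data["vector"].values)
feature_matrix = to_feature_matrix(non_bg["vector"].values)

# the guard from the diff: skip the (currently unused) background matrix
# when no rows are flagged, rather than calling to_feature_matrix on nothing
if len(bg) > 0:
    bg_feature_matrix = to_feature_matrix(bg["vector"].values)
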
27 changes: 18 additions & 9 deletions snekmer/rules/kmerize.smk
@@ -52,7 +52,7 @@ rule generate:

         # generate labels only
         labels = skm.transform.generate_labels(
-            config["k"],
+            params["k"],
             alphabet=params["alphabet"],
             filter_list=params["filter_list"],
         )
@@ -71,12 +71,12 @@ rule vectorize:
     log:
         join("output", "features", "log", "{nb}.log"),
     output:
-        files=expand(join("output", "features", "{{nb}}", "{fa}.json.gz"), fa=FAS),
+        files=expand(join("output", "features", "full", "{{nb}}", "{fa}.json.gz"), fa=FAS),
     run:
         start_time = datetime.now()

         # get kmers for this particular set of sequences
-        kmers = skm.io.read_output_kmers(input.kmers)
+        kmers = ["".join(chars) for chars in itertools]

         # read processed features
         with open(input.params, "r") as f:
@@ -95,7 +95,7 @@ rule vectorize:
             results["vector"] += [
                 skm.transform.vectorize_string(
                     seq,
-                    config["k"],
+                    params["k"],
                     params["alphabet"],
                     start=config["start"],
                     end=config["end"],
@@ -117,23 +117,29 @@

 rule vectorize_full:
     input:
-        kmers=join("output", "labels", "full", "{nb}.txt"),
+        # kmers=join("output", "labels", "full", "{nb}.txt"),
         params=join("output", "processed", "{nb}.json"),
         fasta=join("input", "{nb}.fasta"),
     log:
         join("output", "features", "log", "{nb}.log"),
     output:
         file=join("output", "features", "full", "{nb}.json.gz"),
+        kmers=join("output", "labels", "full", "{nb}.txt")
     run:
         start_time = datetime.now()

-        # get kmers for this particular set of sequences
-        kmers = skm.io.read_output_kmers(input.kmers)
-
         # read processed features
         with open(input.params, "r") as f:
             params = json.load(f)

+        residues, _, _, _ = skm.transform.parse_mapping(params["alphabet"])
+
+        # generate full kmer list
+        kmers = [
+            "".join(chars) for chars in product(residues, repeat=params["k"])
+        ]
+
+
         # revectorize based on full kmer list
         # for i, fa in enumerate(fastas):
         results = {"seq_id": [], "vector": []}
@@ -143,7 +149,7 @@ rule vectorize_full:
             results["vector"] += [
                 skm.transform.vectorize_string(
                     seq,
-                    config["k"],
+                    params["k"],
                     params["alphabet"],
                     start=config["start"],
                     end=config["end"],
@@ -156,5 +162,8 @@
         with gzip.open(output.file, "wt", encoding="ascii") as zipfile:
             json.dump(results, zipfile)

+        if config["output"]["format"] == "simple":
+            skm.features.output_features(output.kmers, "matrix", labels=kmers)
+
         # record script runtime
         skm.utils.log_runtime(log[0], start_time)
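
vectorize_full now derives the full k-mer basis from the alphabet itself (via itertools.product) instead of reading a precomputed labels file, and writes the labels file as an output instead. A self-contained sketch of that enumeration, with a hypothetical three-letter residue set standing in for the output of skm.transform.parse_mapping:

from itertools import product

residues = "ACD"  # hypothetical reduced alphabet; Snekmer gets this from parse_mapping
k = 2

# every possible k-mer over the residue set, in lexicographic order
kmers = ["".join(chars) for chars in product(residues, repeat=k)]

print(kmers)       # ['AA', 'AC', 'AD', 'CA', 'CC', 'CD', 'DA', 'DC', 'DD']
print(len(kmers))  # 9, i.e. len(residues) ** k
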
8 changes: 5 additions & 3 deletions snekmer/transform.py
@@ -478,7 +478,7 @@ def vectorize_string(
     start, end = set_sequence_endpoints(sequence, k, start, end)

     results = feature_dict or {}
-    if filter_list:
+    if filter_list is not None:
         results = {key: 0 for key in filter_list}

     for i in range(start, end):
@@ -494,7 +494,9 @@ def vectorize_string(
             print(f"{i}\t{k_map}\t{k_string}\t1",)

         # filter unrecognized characters or filter from list
-        if (len(k_string) < k) or (filter_list and k_string not in filter_list):
+        if (len(k_string) < k) or (
+            filter_list is not None and k_string not in filter_list
+        ):
             continue

         # FILTER HERE
@@ -508,7 +510,7 @@ def vectorize_string(
             results[k_string] = 0
         results[k_string] += 1

-    if filter_list:
+    if filter_list is not None:
         results = {item: results[item] for item in filter_list}
     if return_dict:
         return results
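
The switch to `is not None` matters because an empty filter list is falsy: the old `if filter_list:` bypassed filtering entirely for [], while the new check distinguishes "no filter supplied" (None) from "filter everything out" (an empty list). A quick illustration:

filter_list = []

print(bool(filter_list))        # False -> old check skipped the filtering branch
print(filter_list is not None)  # True  -> new check still applies the (empty) filter
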
3 changes: 2 additions & 1 deletion snekmer/utils.py
@@ -243,7 +243,8 @@ def to_feature_matrix(array):
         2D array version of the 2D array-like input.
     """
-    return np.array([np.array(a, dtype=int) for a in array])
+    array = [np.array(a) for a in array]
+    return np.asarray(array)


 def str_to_array(array):
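
One observable effect of dropping dtype=int here (the commit does not state whether this was the motivation): non-integer vectors are no longer silently truncated. A toy comparison:

import numpy as np

vectors = [[0.5, 1.0], [2.0, 0.25]]

old = np.array([np.array(a, dtype=int) for a in vectors])  # previous behavior
new = np.asarray([np.array(a) for a in vectors])           # updated behavior

print(old)  # [[0 1] [2 0]]              -- fractional values truncated to int
print(new)  # [[0.5  1.  ] [2.   0.25]]  -- values preserved
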
12 changes: 6 additions & 6 deletions snekmer/vectorize.py
@@ -3,10 +3,13 @@
 """
 import itertools
-import numpy as np
-from snekmer.alphabets import ALPHABET, FULL_ALPHABETS, get_alphabet_keys, get_alphabet
-from typing import Union, Set
+from collections import Counter
+from typing import Set, Union
+
+import numpy as np
+
+from snekmer.alphabets import ALPHABET, FULL_ALPHABETS, get_alphabet, get_alphabet_keys


 # generate all possible kmer combinations
 def _generate(alphabet: Union[str, int], k: int):
@@ -92,10 +95,7 @@ def _reduce(sequence: str, alphabet_map: dict) -> str:

     # generate kmer vectors with bag-of-words approach
     def vectorize(self, sequence: str) -> np.ndarray:
-        # self.char_set = set(ALPHABETS[alphabet]["_keys"])
         N = len(self.char_set) ** self.k
-        # all_kmers = self._generate("".join(self.char_set), k)
-        # self.kmers = list(self._generate(list(self.char_set), k))

         alphabet_map = get_alphabet(self.alphabet, mapping=FULL_ALPHABETS)
         sequence = self._reduce(sequence, alphabet_map=alphabet_map)
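
The newly imported Counter presumably backs the bag-of-words approach noted in the vectorize() comment. A minimal sketch of bag-of-words k-mer counting; the kmer_counts helper is hypothetical, not Snekmer's API:

from collections import Counter

def kmer_counts(sequence: str, k: int) -> Counter:
    # slide a window of width k across the sequence and tally each k-mer
    return Counter(sequence[i : i + k] for i in range(len(sequence) - k + 1))

print(kmer_counts("AACAA", 2))  # Counter({'AA': 2, 'AC': 1, 'CA': 1})
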
