diff --git a/resources/tutorial/snekmer_demo.ipynb b/resources/tutorial/snekmer_demo.ipynb index 88f94d4..683858c 100644 --- a/resources/tutorial/snekmer_demo.ipynb +++ b/resources/tutorial/snekmer_demo.ipynb @@ -2039,7 +2039,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.10.2" } }, "nbformat": 4, diff --git a/snekmer/__init__.py b/snekmer/__init__.py index dc3fced..0da4edc 100644 --- a/snekmer/__init__.py +++ b/snekmer/__init__.py @@ -10,4 +10,4 @@ # from . import walk -__version__ = "0.1.1" +__version__ = "0.1.2" diff --git a/snekmer/rules/cluster.smk b/snekmer/rules/cluster.smk index 01d4a8a..8dec3b9 100644 --- a/snekmer/rules/cluster.smk +++ b/snekmer/rules/cluster.smk @@ -106,29 +106,29 @@ use rule preprocess from process_input with: data=join(out_dir, "processed", "full", "{nb}.json"), desc=join(out_dir, "processed", "full", "{nb}_description.csv"), log: - join(out_dir, "processed", "log", "{nb}.log"), + join(out_dir, "processed", "log", "{nb}.log") # generate kmer features space from user params -use rule generate from kmerize with: - input: - params=join(out_dir, "processed", "full", "{nb}.json"), - output: - labels=join(out_dir, "labels", "full", "{nb}.txt"), - log: - join(out_dir, "labels", "log", "{nb}.log"), +# use rule generate from kmerize with: +# input: +# params=join(out_dir, "processed", "full", "{nb}.json"), +# output: +# labels=join(out_dir, "labels", "full", "{nb}.txt"), +# log: +# join(out_dir, "labels", "log", "{nb}.log") # build kmer count vectors for each basis set use rule vectorize_full from kmerize with: input: - kmers=join(out_dir, "labels", "full", "{nb}.txt"), params=join(out_dir, "processed", "{nb}.json"), - fasta=lambda wildcards: join("input", f"{wildcards.nb}.{FA_MAP[wildcards.nb]}"), + fasta=lambda wildcards: join("input", f"{wildcards.nb}.{FA_MAP[wildcards.nb]}") log: - join(out_dir, "features", "log", "{nb}.log"), + join(out_dir, "features", "log", 
"{nb}.log") output: file=join(out_dir, "features", "full", "{nb}.json.gz"), + kmers=join(out_dir, "labels", "full", "{nb}.txt") # [in-progress] kmer walk @@ -140,8 +140,7 @@ use rule vectorize_full from kmerize with: # UNSUPERVISED WORKFLOW rule cluster: input: - # kmers=join(out_dir, "labels", "{nb}.txt"), - files=expand(join(out_dir, "features", "full", "{fa}.json.gz"), fa=NON_BGS), + files=expand(join(out_dir, "features", "full", "{fa}.json.gz"), fa=NON_BGS) output: model=join(out_dir, "cluster", "{nb}.pkl"), figs=directory(join(out_dir, "cluster", "figures", "{nb}")), @@ -167,9 +166,12 @@ rule cluster: # define feature matrix of kmer vectors not from background set bg, non_bg = data[data["background"]], data[~data["background"]] - full_feature_matrix = skm.score.to_feature_matrix(data["vector"].values) - feature_matrix = skm.score.to_feature_matrix(non_bg["vector"].values) - bg_feature_matrix = skm.score.to_feature_matrix(bg["vector"].values) + full_feature_matrix = skm.utils.to_feature_matrix(data["vector"].values) + feature_matrix = skm.utils.to_feature_matrix(non_bg["vector"].values) + + # currently not used + if len(bg) > 0: + bg_feature_matrix = skm.utils.to_feature_matrix(bg["vector"].values) # fit and save clustering model model = skm.cluster.KmerClustering( @@ -178,7 +180,6 @@ rule cluster: model.fit(full_feature_matrix) with open(output.model, "wb") as f: pickle.dump(model, f) - # fit_predict where? 
# log time to compute clusters skm.utils.log_runtime(log[0], start_time, step="clustering") @@ -190,7 +191,7 @@ rule cluster: fig.savefig(join(output.figs, "pca_explained_variance_curve.png")) plt.close("all") - fig, ax = skm.plot.get_tsne_clusters(full_feature_matrix, model.labels_) + fig, ax = skm.plot.get_tsne_clusters(full_feature_matrix, model.model.labels_) fig.savefig(join(output.figs, "tsne_clusters.png")) plt.close("all") diff --git a/snekmer/rules/kmerize.smk b/snekmer/rules/kmerize.smk index 7029542..f848db4 100644 --- a/snekmer/rules/kmerize.smk +++ b/snekmer/rules/kmerize.smk @@ -52,7 +52,7 @@ rule generate: # generate labels only labels = skm.transform.generate_labels( - config["k"], + params["k"], alphabet=params["alphabet"], filter_list=params["filter_list"], ) @@ -71,12 +71,12 @@ rule vectorize: log: join("output", "features", "log", "{nb}.log"), output: - files=expand(join("output", "features", "{{nb}}", "{fa}.json.gz"), fa=FAS), + files=expand(join("output", "features", "full", "{{nb}}", "{fa}.json.gz"), fa=FAS), run: start_time = datetime.now() # get kmers for this particular set of sequences - kmers = skm.io.read_output_kmers(input.kmers) + kmers = ["".join(chars) for chars in itertools] # read processed features with open(input.params, "r") as f: @@ -95,7 +95,7 @@ rule vectorize: results["vector"] += [ skm.transform.vectorize_string( seq, - config["k"], + params["k"], params["alphabet"], start=config["start"], end=config["end"], @@ -117,23 +117,29 @@ rule vectorize: rule vectorize_full: input: - kmers=join("output", "labels", "full", "{nb}.txt"), + # kmers=join("output", "labels", "full", "{nb}.txt"), params=join("output", "processed", "{nb}.json"), fasta=join("input", "{nb}.fasta"), log: join("output", "features", "log", "{nb}.log"), output: file=join("output", "features", "full", "{nb}.json.gz"), + kmers=join("output", "labels", "full", "{nb}.txt") run: start_time = datetime.now() - # get kmers for this particular set of sequences - kmers = 
skm.io.read_output_kmers(input.kmers) - # read processed features with open(input.params, "r") as f: params = json.load(f) + residues, _, _, _ = skm.transform.parse_mapping(params["alphabet"]) + + # generate full kmer list + kmers = [ + "".join(chars) for chars in product(residues, repeat=params["k"]) + ] + + # revectorize based on full kmer list # for i, fa in enumerate(fastas): results = {"seq_id": [], "vector": []} @@ -143,7 +149,7 @@ rule vectorize_full: results["vector"] += [ skm.transform.vectorize_string( seq, - config["k"], + params["k"], params["alphabet"], start=config["start"], end=config["end"], @@ -156,5 +162,8 @@ rule vectorize_full: with gzip.open(output.file, "wt", encoding="ascii") as zipfile: json.dump(results, zipfile) + if config["output"]["format"] == "simple": + skm.features.output_features(output.kmers, "matrix", labels=kmers) + # record script runtime skm.utils.log_runtime(log[0], start_time) diff --git a/snekmer/transform.py b/snekmer/transform.py index cf4cdb7..6153815 100644 --- a/snekmer/transform.py +++ b/snekmer/transform.py @@ -478,7 +478,7 @@ def vectorize_string( start, end = set_sequence_endpoints(sequence, k, start, end) results = feature_dict or {} - if filter_list: + if filter_list is not None: results = {key: 0 for key in filter_list} for i in range(start, end): @@ -494,7 +494,9 @@ def vectorize_string( print(f"{i}\t{k_map}\t{k_string}\t1",) # filter unrecognized characters or filter from list - if (len(k_string) < k) or (filter_list and k_string not in filter_list): + if (len(k_string) < k) or ( + filter_list is not None and k_string not in filter_list + ) continue # FILTER HERE @@ -508,7 +510,7 @@ def vectorize_string( results[k_string] = 0 results[k_string] += 1 - if filter_list: + if filter_list is not None: results = {item: results[item] for item in filter_list} if return_dict: return results diff --git a/snekmer/utils.py b/snekmer/utils.py index f7b1312..e5ca7b5 100644 --- a/snekmer/utils.py +++ b/snekmer/utils.py @@ 
-243,7 +243,8 @@ def to_feature_matrix(array): 2D array version of the 2D array-like input. """ - return np.array([np.array(a, dtype=int) for a in array]) + array = [np.array(a) for a in array] + return np.asarray(array) def str_to_array(array): diff --git a/snekmer/vectorize.py b/snekmer/vectorize.py index 0655f2e..3e9fc9f 100644 --- a/snekmer/vectorize.py +++ b/snekmer/vectorize.py @@ -3,10 +3,13 @@ """ import itertools -import numpy as np from collections import Counter -from snekmer.alphabets import ALPHABET, FULL_ALPHABETS, get_alphabet_keys, get_alphabet -from typing import Union, Set +from typing import Set, Union + +import numpy as np + +from snekmer.alphabets import ALPHABET, FULL_ALPHABETS, get_alphabet, get_alphabet_keys + # generate all possible kmer combinations def _generate(alphabet: Union[str, int], k: int): @@ -92,10 +95,7 @@ def _reduce(sequence: str, alphabet_map: dict) -> str: # generate kmer vectors with bag-of-words approach def vectorize(self, sequence: str) -> np.ndarray: - # self.char_set = set(ALPHABETS[alphabet]["_keys"]) N = len(self.char_set) ** self.k - # all_kmers = self._generate("".join(self.char_set), k) - # self.kmers = list(self._generate(list(self.char_set), k)) alphabet_map = get_alphabet(self.alphabet, mapping=FULL_ALPHABETS) sequence = self._reduce(sequence, alphabet_map=alphabet_map)