-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
executable file
·50 lines (42 loc) · 1.76 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import numpy as np
import csv
import sourmash
def load_hashes(filename):
    """
    Helper function that loads the hash_to_col_idx.csv file and returns a dictionary mapping hashes to indices in the
    training dictionary. filename should point to a CSV file with two columns: hash, col_idx. The first row is treated
    as a header and skipped; blank lines are ignored. An empty file yields an empty dictionary.
    :param filename: string (location of the hash_to_col_idx.csv file)
    :return: dictionary mapping hashes (int) to indices (int)
    :raises ValueError: if a hash or index field cannot be parsed as an integer
    :raises IndexError: if a non-empty row has fewer than two columns
    """
    # newline='' lets the csv module do its own line-ending handling, as its documentation recommends.
    with open(filename, mode='r', newline='') as infile:
        reader = csv.reader(infile)
        # Skip the header record; the default keeps an empty file from raising StopIteration.
        next(reader, None)
        # csv.reader returns [] for blank lines (e.g. a trailing empty line), so filter those out.
        return {int(row[0]): int(row[1]) for row in reader if row}
def load_signature_with_ksize(filename, ksize):
    """
    Helper function that picks, out of all signatures stored in the given file, the first one whose minhash was
    built with the requested kmer size.
    :param filename: string (location of the signature file, e.g. a .sig file)
    :param ksize: kmer size to look for
    :return: sourmash signature whose minhash.ksize equals ksize
    :raises ValueError: if no signature in the file has the requested kmer size
    """
    # Every signature in the file is loaded up front, then searched in file order.
    candidates = list(sourmash.load_file_as_signatures(filename))
    match = next((candidate for candidate in candidates if candidate.minhash.ksize == ksize), None)
    if match is None:
        raise ValueError(f'File {filename} does not contain sketch for ksize = {ksize}.')
    return match
def signatures_mismatch_ksize(signatures, ksize):
    """
    Helper function that checks whether every signature uses the expected kmer size.
    :param signatures: iterable of objects exposing minhash.ksize
    :param ksize: expected kmer size
    :return: the first signature whose minhash.ksize differs from ksize, or False if all of them match
    """
    # next() with a default of False reproduces the "first offender or False" contract.
    return next((candidate for candidate in signatures if candidate.minhash.ksize != ksize), False)
def get_num_kmers(signature, scale=True):
    """
    Helper function that estimates the total number of kmers in a given sample as
    mean abundance * number of hashes in the sketch, optionally multiplied by the sketch's scaled value.
    NOTE(review): presumably requires a signature whose minhash tracks abundances (so that mean_abundance is a
    number) - confirm against the sourmash version in use.
    :param signature: sourmash signature
    :param scale: if truthy (default), multiply the estimate by signature.minhash.scaled
    :return: the estimate rounded with np.round (returned as-is, not converted to a Python int)
    """
    sketch = signature.minhash
    estimate = sketch.mean_abundance * len(sketch.hashes)
    return np.round(estimate * sketch.scaled if scale else estimate)