This repository has been archived by the owner on Mar 21, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_stats_matrices.py
130 lines (107 loc) · 4.45 KB
/
create_stats_matrices.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
@summary: This script generates matrix objects containing phylo stats for a PAM
and tree
"""
import argparse
import numpy as np
from lm_munge import munge_lm_tree
from matrix import Matrix
from mnnd import mnnd
from mpd import mpd
from pd import pd
from spd import spd
from tree_reader import read_tree_string
# .............................................................................
def get_site_statistics(tree, pam, squid_dict):
"""
@summary: Calculate site statistics
@note: To add additional stats:
1. Add an additional numpy array
2. Add stat in "Site statistics"
3. Add stat array and header in return
"""
num_rows, num_cols = pam.data.shape
squid_headers = pam.getColumnHeaders()
# Stat arrays
# ..............
pd_stat = np.zeros((num_rows, 1), dtype=np.float)
spd_stat = np.zeros((num_rows, 1), dtype=np.float)
mnnd_stat = np.zeros((num_rows, 1), dtype=np.float)
mpd_stat = np.zeros((num_rows, 1), dtype=np.float)
ndsdict = {}
for i in tree.leaves():
ndsdict[i.label] = i
for i in xrange(num_rows):
# Get the species names that are present at this site
present_squids = [
squid_headers[j] for j in xrange(num_cols) if int(pam.data[i,j])]
# Get the species names for the present squids
sp_tips = []
for squid in present_squids:
try:
if squid_dict.has_key(squid):
sp_tips.append(ndsdict[squid_dict[squid]])
except:
print('Could not find: {}'.format(squid_dict[squid]))
# Site statistics
# ..............
if len(sp_tips) > 0:
pd_stat[i,0] = pd(tree, sp_tips)
spd_stat[i,0] = spd(tree, sp_tips)
mnnd_stat[i,0] = mnnd(tree, sp_tips)
mpd_stat[i,0] = mpd(tree, sp_tips)
else:
pd_stat[i,0] = 0.0
spd_stat[i,0] = 0.0
mnnd_stat[i,0] = 0.0
mpd_stat[i,0] = 0.0
# TODO: Add any additional statistics
# Return statistics
# Site statistics np.arrays
site_stats = [pd_stat, spd_stat, mnnd_stat, mpd_stat]
# Site statistic headers
stat_headers = ['PD', 'Sum Phylo Distances', 'Mean nearest neighbor dist', 'Mean phylo distance']
return Matrix(np.concatenate(site_stats, axis=1),
headers={'0' : pam.getRowHeaders(),
'1' : stat_headers})
# .............................................................................
def get_species_statistics(tree, pam, squid_dict):
"""
@todo: Fill in if we have any species based statistics
"""
return None
# .............................................................................
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Calculate phylogenetic statistics for a PAM and a tree')
parser.add_argument('pam_matrix_filename', type=str,
help='File path to PAM matrix (in LM format)')
parser.add_argument('tree_filename', type=str,
help='File path to tree (NEXUS)')
parser.add_argument('-sites', '--sites_filename', type=str,
help='If provided, write site stats to this location (CSV)')
parser.add_argument('-sp', '--species_filename', type=str,
help='If provided, write species stats to this location (CSV)')
args = parser.parse_args()
# Read in data
pam = Matrix.load(args.pam_matrix_filename)
# We use a hack function to get the data out of the tree, in the LM code
# this is done with dendropy. I just didn't want to add any dependencies
tree_newick, squid_dict = munge_lm_tree(args.tree_filename)
#print tree_newick
#print len(squid_dict.keys())
# Read the newick tree
tree = read_tree_string(tree_newick)
# Generate stats matrices
site_stats = get_site_statistics(tree, pam, squid_dict)
species_stats = get_species_statistics(tree, pam, squid_dict)
# Write matrices if desired
if args.sites_filename is not None:
with open(args.sites_filename, 'w') as outF:
site_stats.writeCSV(outF)
else:
print site_stats.data
if args.species_filename is not None:
raise Exception, 'No species stats are currently implemented'
with open(args.species_filename, 'w') as outF:
species_stats.writeCSV(outF)