-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdeduce_substiution_matrices.py
127 lines (105 loc) · 6.04 KB
/
deduce_substiution_matrices.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import pandas as pd
import project_utilities as utils
# string headers of variant DataFrame
REF_CODON = 'REF_CODON'
ALT_CODON = 'ALT_CODON'
REF_NUC = 'REF_NUCLEOTIDE'
ALT_NUC = 'ALT_NUCLEOTIDE'
AF = 'ALLELE_FREQUENCY'
def count_af(substitution_matrix, variant_df, subject_lst, subject_headers):
"""
Calculate substitution matrix, not normalized.
:param substitution_matrix: empty DataFrame which will contain summed substitution frequencies
:param variant_df: DataFrame which contains aggregated variant information: for each genomic position,
it should contain at least the source codon, target codon, source nucleotide, target nucleotide, and the variant's allele frequency
:param subject_lst: list of codons\nucleotides
:param subject_header: tuple of header of reference and alternative columns
:return: not normalized substitution matrix
"""
ref_str, alt_str = subject_headers # get header string for DataFrame column access
for ref in subject_lst:
ref_df = variant_df[variant_df[ref_str] == ref] # use only variant in which the reference is ref
for alt in subject_lst:
if alt == ref: # skip synonymous as data does not contain synonymous substitutions
continue
ref_alt_df = ref_df[ref_df[alt_str] == alt] # use only variant in which the alternative is alt
substitution_matrix.at[ref, alt] = ref_alt_df[
AF].sum() # sum corresponding allele frequencies into substitution matrix
return substitution_matrix
def get_nuc_counts(codon_counts):
"""
Count the number of appearances of each nucleotide in the reference genome
:param codon_counts: A pandas Series containing the number of appearances of each codon in the reference genome
:return: pandas Series containing the number of appearances of each nucleotide in the reference genome
"""
nuc_counts = pd.Series(0, index=utils.nucleotides_lst)
for codon in codon_counts.index:
for nuc in codon:
nuc_counts[nuc] += codon_counts.loc[codon]
return nuc_counts
def normalise_matrix(substitution_matrix, counts):
"""
Row-normalisation of the substitution matrix
:param substitution_matrix: DataFrame which contains summed substitution frequencies
:param counts: a pandas Series containing the number of appearances of each codon, for normalization
:return: row-normalised DataFrame
"""
for i in substitution_matrix.index:
curr_counts = int(counts[i])
if curr_counts != 0:
substitution_matrix.loc[i, :] /= curr_counts
substitution_matrix.at[i, i] = 1 - substitution_matrix.loc[i, :].sum()
else:
substitution_matrix.loc[i, :] = 0
return substitution_matrix
def calculate_substitution_matrix(variant_df, subject_lst, subject_header, normalize=True, counts=None):
"""
calculate a nucleotide/codon substitution matrix according to aggregated variant information
:param variant_df: DataFrame which contains aggregated variant information: for each genomic position,
it should contain at least the source codon, target codon, source nucleotide, target nucleotide, and the variant's allele frequency
:param subject_lst: list of codons\nucleotides
:param subject_header: tuple of header of reference and alternative columns
:param counts: a pandas Series containing the number of appearances of each codon/nucleotide, for normalization. Defult value is None.
Will be used only if normalize is set to True.
:param normalize: boolean indicating whether the substitution matrix is to be normalized.
:return: substitution matrix
"""
substitution_matrix = utils.create_zero_matrix(subject_lst)
substitution_matrix = count_af(substitution_matrix, variant_df, subject_lst, subject_header)
if normalize:
substitution_matrix = normalise_matrix(substitution_matrix, counts)
return substitution_matrix
def calculate_hc1(variant_df, normalize=True, codon_counts=None):
"""
calculate a codon substitution matrix according to aggregated variant information
:param variant_df: DataFrame which contains aggregated variant information: for each genomic position,
it should contain at least the source codon, target codon, source nucleotide, target nucleotide, and the variant's allele frequency
:param codon_counts: a pandas Series containing the number of appearances of each codon, for normalization. Defult value is None.
Will be used only if normalize is set to True.
:param normalize: boolean indicating whether the substitution matrix is to be normalized.
:return: codon substitution matrix
"""
return calculate_substitution_matrix(variant_df,
utils.codon_lst_non_stop,
(REF_CODON, ALT_CODON),
normalize=normalize,
counts=codon_counts)
def calculate_hn1(variant_df, normalize=True, codon_counts=None):
"""
calculate a nucleotide substitution matrix according to aggregated variant information
:param variant_df: DataFrame which contains aggregated variant information: for each genomic position,
it should contain at least the source codon, target codon, source nucleotide, target nucleotide, and the variant's allele frequency
:param normalize: boolean indicating whether the substitution matrix is to be normalized.
:param nuc_counts: a pandas Series containing the number of appearances of each nucleotide, for normalization. Defult value is None.
Will be used only if normalize is set to True.
:return: nucleotide substitution matrix
"""
if normalize:
nuc_counts = get_nuc_counts(codon_counts)
else:
nuc_counts = None
return calculate_substitution_matrix(variant_df,
utils.nucleotides_lst,
(REF_NUC, ALT_NUC),
normalize=normalize,
counts=nuc_counts)