Skip to content

Commit

Permalink
Merge pull request #150 from andersen-lab/plot-covariants-fixes
Browse files Browse the repository at this point in the history
Plot-covariants-fixes
  • Loading branch information
joshuailevy authored Jun 7, 2023
2 parents 2761b2e + 518a323 commit 6a72be5
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 50 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ This outputs to a tsv file that includes the mutations present in each set of co
```
freyja plot-covariants [covariant_file.tsv] --output [plot-filename(.pdf,.png,etc.)]
```
This generates an informative heatmap visualization showing variants as well as coverage across different reads. Each covariant pattern is listed in the format CPx(# occurrences of pattern x). The threshold `--min_mutations` sets a minimum number of mutations per set of covariants for inclusion in the plot. The flag `--nt_muts` can be included to provide nucleotide-specific information in thx plot (e.g. C22995A(S:T478K) as opposed to S:T478K), making it possible to view multi-nucleotide variants.
This generates an informative heatmap visualization showing variants as well as coverage across different reads. Each covariant pattern is listed in the format CPx(# occurrences of pattern x). `--num_clusters` sets the number of patterns (i.e. rows in the plot) to display, while the threshold `--min_mutations` sets a minimum number of mutations per set of covariants for inclusion in the plot. The flag `--nt_muts` can be included to provide nucleotide-specific information in the plot (e.g. C22995A(S:T478K) as opposed to S:T478K), making it possible to view multi-nucleotide variants.

Example output:
| **Sorted by Count** | **Sorted by Site** |
Expand Down
12 changes: 7 additions & 5 deletions freyja/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@


@click.group()
@click.version_option('1.4.3')
@click.version_option('1.4.4')
def cli():
pass

Expand Down Expand Up @@ -571,13 +571,15 @@ def covariants(input_bam, min_site, max_site, output,
@cli.command()
@click.argument('covar_file', type=click.Path(exists=True))
@click.option('--output', default='covariants_plot.pdf')
@click.option('--num_clusters', default=10,
help='number of clusters to plot (starting with greatest count)')
@click.option('--min_mutations', default=1,
help='Minimum number of unique covariants to display')
help='minimum number of mutations in a cluster to be plotted')
@click.option('--nt_muts', is_flag=True,
help=('if included, include nucleotide mutations in mut labels'
help=('if included, include nucleotide mutations in x-labels'
))
def plot_covariants(covar_file, output, min_mutations, nt_muts):
_plot_covariants(covar_file, output, min_mutations, nt_muts)
def plot_covariants(covar_file, output, num_clusters, min_mutations, nt_muts):
_plot_covariants(covar_file, output, num_clusters, min_mutations, nt_muts)


if __name__ == '__main__':
Expand Down
115 changes: 77 additions & 38 deletions freyja/read_analysis_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,7 @@ def get_gene(locus):
f'{aa_locus+del_length-1})'
)
else:
aa_mut = f'{deletion}({gene}:DEL{aa_locus})'
aa_mut = f'{deletion_string}({gene}:DEL{aa_locus})'

if deletion_string not in nt_to_aa:
nt_to_aa[deletion_string] = aa_mut
Expand Down Expand Up @@ -644,46 +644,85 @@ def get_gene(locus):
return df


def plot_covariants(covar_file, output, min_mutations, nt_muts):
def filter_covariants_output(cluster, min_mutation_count, nt_mut):
cluster_final = []

if not nt_mut:
for variant in cluster:
if '*' in variant or 'INS' in variant:
continue # Ignore stop codons and insertions for now
elif 'DEL' in variant \
and int(variant.split(")(")[0].split(",")[1]) % 3 != 0:
continue # Ignore frameshift mutations
else:
cluster_final.append(variant)
else: # nt covariants
for variant in cluster:
if ',' in variant:
if any([nt in variant for nt in ['A,C,G,T']]): # Insertion
continue
if int(variant.split(',')[1].split(')')[0]) % 3 != 0:
continue
cluster_final.append(variant)

# Remove duplicates while preserving order
cluster_final = list(dict.fromkeys(cluster_final))

if len(cluster_final) < min_mutation_count:
return pd.NA
return cluster_final


def plot_covariants(covar_file, output, num_clusters, min_mutations, nt_muts):

# Define columns (mutations in samples)
covars = pd.read_csv(covar_file, sep='\t', header=0)
covars['Covariants'] = covars['Covariants']\
.str.replace(', ', ',')\
.str.split(' ')\
.apply(filter_covariants_output, args=(min_mutations, nt_muts))\

covars[[covars['Covariants'] == 'nan']] = pd.NA
covars = covars.dropna()\
.head(num_clusters)

# Merge rows with identical covariants and add their counts
covars = covars.groupby(covars['Covariants'].astype(str))\
.agg(
{'Count': 'sum', 'Coverage_start': 'first', 'Coverage_end': 'first'}
)\
.reset_index()\
.sort_values('Count', ascending=False)

# Get all mutations found in the sample
all_muts = []
for sublist in covars['Covariants'].to_list():
for mut in sublist.strip('][').split(', '):
if mut[1:-1] not in all_muts:
all_muts.append(mut[1:-1])

patterns = [pattern.strip('][').split(', ')
for pattern in covars['Covariants'].to_list()]

counts = dict(zip([str(pattern)
for pattern in patterns], covars['Count'].to_list()))

coverage_start = covars['Coverage_start']
coverage_end = covars['Coverage_end']
coverage_ranges = dict(zip([str(pattern) for pattern in patterns],
zip(coverage_start, coverage_end)))

if nt_muts:
colnames = all_muts
sites = {mut: nt_position(mut) for mut in all_muts}
else:
colnames = [mut.split(')(')[1][:-1] if 'DEL' in mut
else mut.split('(')[1][:-1] for mut in all_muts]
sites = {mut.split(')(')[1][:-1] if 'DEL' in mut
else mut.split('(')[1][:-1]: nt_position(mut)
for mut in all_muts}

colnames = sorted(colnames, key=lambda x: sites[x])

# Get covariants and unique mutations
all_nt_muts = []
patterns = []
for c in covars.iloc[:, 0]:
sample = c.split(' ')
for mut in sample:
if mut not in all_nt_muts:
all_nt_muts.append(mut)
patterns.append(sample)
if ':' not in all_nt_muts[0]:
nt_muts = True

coverage_start = [int(i) for i in covars.iloc[:, 2]]
coverage_end = [int(i) for i in covars.iloc[:, 3]]
coverage_ranges = {str(patterns[i]): (
coverage_start[i], coverage_end[i]) for i in range(len(patterns))}

counts = {str(patterns[i]): covars.iloc[:, 1][i]
for i in range(len(patterns))}

all_nt_muts = sorted(all_nt_muts, key=nt_position)
colnames = []
sites = {}
for mut in all_nt_muts: # remove duplicates
nt_site = nt_position(mut)
if nt_muts:
aa_site = mut
else:
if ',' in mut:
aa_site = mut.split(')(')[1][:-1]
else:
aa_site = mut.split('(')[1][:-1]
if aa_site not in colnames:
colnames.append(aa_site)
sites[aa_site] = nt_site
data = {}
for pattern in enumerate(patterns):
if len(pattern[1]) >= min_mutations:
Expand Down
13 changes: 7 additions & 6 deletions freyja/tests/test_read_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,8 @@ def test_covariants(self):
'21563', '25384', '--gff-file',
'freyja/data/NC_045512_Hu-1.gff', '--output',
'freyja/data/test_covar.tsv']
subprocess.run(cmd)
err = subprocess.run(cmd)
print(err)
df = pd.read_csv('freyja/data/test_covar.tsv', sep='\t')

patterns = []
Expand All @@ -187,10 +188,10 @@ def test_covariants(self):

# Test spans_region flag
cmd = ['freyja', 'covariants', self.input_bam,
'23550', '23700', '--output', 'freyja/data/test_covar.tsv',
'23550', '23700', '--output', 'freyja/data/test_covar_span.tsv',
'--spans_region']
subprocess.run(cmd)
df = pd.read_csv('freyja/data/test_covar.tsv', sep='\t')
df = pd.read_csv('freyja/data/test_covar_span.tsv', sep='\t')
patterns = []
for c in df.iloc[:, 0]:
patterns.append(c.split(' '))
Expand All @@ -202,11 +203,11 @@ def test_covariants(self):
self.assertTrue(int(mut[1:6]) >= cov_start[i] and
int(mut[1:6]) <= cov_end[i])
self.assertTrue(df.shape[0] == 2)
os.remove('freyja/data/test_covar.tsv')
os.remove('freyja/data/test_covar_span.tsv')

def test_plot_covariants(self):
# Test plot-covariants
cmd = ['freyja', 'plot-covariants',
'freyja/data/example_covariants0.tsv',
'freyja/data/test_covar.tsv',
'--output', 'freyja/data/test_covar_plot.png']
subprocess.run(cmd)

Expand Down

0 comments on commit 6a72be5

Please sign in to comment.