From 4be49829163a3878cff089753ee51ff13caa6325 Mon Sep 17 00:00:00 2001 From: dpilz2001 Date: Fri, 2 Jun 2023 07:05:07 -0700 Subject: [PATCH 1/5] fixed parsing updated covariants output --- freyja/read_analysis_tools.py | 76 +++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/freyja/read_analysis_tools.py b/freyja/read_analysis_tools.py index 2f348f33..8cde8110 100644 --- a/freyja/read_analysis_tools.py +++ b/freyja/read_analysis_tools.py @@ -539,7 +539,7 @@ def get_gene(locus): f'{aa_locus+del_length-1})' ) else: - aa_mut = f'{deletion}({gene}:DEL{aa_locus})' + aa_mut = f'{deletion_string}({gene}:DEL{aa_locus})' if deletion_string not in nt_to_aa: nt_to_aa[deletion_string] = aa_mut @@ -644,32 +644,58 @@ def get_gene(locus): return df +def filter_covariants_output(cluster): + cluster_final = [] + for variant in cluster: + if ":" in variant: + if "*" in variant or "INS" in variant: + continue # Ignore stop codons and insertions + elif "DEL" in variant: + # if len(variant.split(")(")[0].split(",")) % 3 != 0: + continue # Ignore deletions + # else: + # if variant not in cluster_final: + # cluster_final.append(variant.split(")(")[1][:-1]) + else: + if variant not in cluster_final: + cluster_final.append(variant) # SNV + if len(cluster_final) == 0: + return pd.NA + return cluster_final + + def plot_covariants(covar_file, output, min_mutations, nt_muts): # Define columns (mutations in samples) covars = pd.read_csv(covar_file, sep='\t', header=0) + covars['Covariants'] = covars['Covariants'] \ + .str.split(' ') \ + .apply(filter_covariants_output) + covars = covars.dropna() + + # Merge rows with identical covariants and add their counts + covars = covars.groupby(covars['Covariants'].astype( + str)).agg({'Count': 'sum', 'Coverage_start': 'first', 'Coverage_end': 'first'}).reset_index() + + covars = covars.sort_values('Count', ascending=False) - # Get covariants and unique mutations + # Get all mutations found in the sample all_nt_muts = [] - patterns = [] - for c in covars.iloc[:, 0]: - sample = c.split(' ') - for mut in sample: - if mut not in all_nt_muts: - all_nt_muts.append(mut) - patterns.append(sample) - if ':' not in all_nt_muts[0]: - nt_muts = True - - coverage_start = [int(i) for i in covars.iloc[:, 2]] - coverage_end = [int(i) for i in covars.iloc[:, 3]] - coverage_ranges = {str(patterns[i]): ( - coverage_start[i], coverage_end[i]) for i in range(len(patterns))} - - counts = {str(patterns[i]): covars.iloc[:, 1][i] - for i in range(len(patterns))} - - all_nt_muts = sorted(all_nt_muts, key=nt_position) + for sublist in covars['Covariants'].to_list(): + for mut in sublist.strip('][').split(', '): + if mut[1:-1] not in all_nt_muts: + all_nt_muts.append(mut[1:-1]) + + patterns = covars['Covariants'].to_list() + patterns = [pattern.strip('][').split(', ') for pattern in patterns] + counts = dict(zip([str(pattern) + for pattern in patterns], covars['Count'].to_list())) + coverage_start = covars['Coverage_start'] + coverage_end = covars['Coverage_end'] + + coverage_ranges = dict(zip([str(pattern) + for pattern in patterns], zip(coverage_start, coverage_end))) + colnames = [] sites = {} for mut in all_nt_muts: # remove duplicates @@ -677,13 +703,13 @@ def plot_covariants(covar_file, output, min_mutations, nt_muts): if nt_muts: aa_site = mut else: - if ',' in mut: - aa_site = mut.split(')(')[1][:-1] - else: - aa_site = mut.split('(')[1][:-1] + aa_site = mut.split('(')[1][:-1] if aa_site not in colnames: colnames.append(aa_site) sites[aa_site] = nt_site + + # Sort colnames + colnames = sorted(colnames, key=lambda x: x.split(':')[1][1:-1]) data = {} for pattern in enumerate(patterns): if len(pattern[1]) >= min_mutations: From 3863bb57b2676cbc4bbd85b6ec3a2e9d35c42c61 Mon Sep 17 00:00:00 2001 From: dpilz2001 Date: Fri, 2 Jun 2023 12:21:12 -0700 Subject: [PATCH 2/5] refactor getting colnames/sites dict --- freyja/read_analysis_tools.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/freyja/read_analysis_tools.py b/freyja/read_analysis_tools.py index 8cde8110..5d31a2c9 100644 --- a/freyja/read_analysis_tools.py +++ b/freyja/read_analysis_tools.py @@ -665,6 +665,7 @@ def filter_covariants_output(cluster): def plot_covariants(covar_file, output, min_mutations, nt_muts): + # TODO - add option to plot only top X mutations # Define columns (mutations in samples) covars = pd.read_csv(covar_file, sep='\t', header=0) @@ -680,33 +681,28 @@ def plot_covariants(covar_file, output, min_mutations, nt_muts): covars = covars.sort_values('Count', ascending=False) # Get all mutations found in the sample - all_nt_muts = [] + all_muts = [] for sublist in covars['Covariants'].to_list(): for mut in sublist.strip('][').split(', '): - if mut[1:-1] not in all_nt_muts: - all_nt_muts.append(mut[1:-1]) + if mut[1:-1] not in all_muts: + all_muts.append(mut[1:-1]) + + patterns = [pattern.strip('][').split(', ') for pattern in covars['Covariants'].to_list()] - patterns = covars['Covariants'].to_list() - patterns = [pattern.strip('][').split(', ') for pattern in patterns] counts = dict(zip([str(pattern) for pattern in patterns], covars['Count'].to_list())) + coverage_start = covars['Coverage_start'] coverage_end = covars['Coverage_end'] - coverage_ranges = dict(zip([str(pattern) for pattern in patterns], zip(coverage_start, coverage_end))) - colnames = [] - sites = {} - for mut in all_nt_muts: # remove duplicates - nt_site = nt_position(mut) - if nt_muts: - aa_site = mut - else: - aa_site = mut.split('(')[1][:-1] - if aa_site not in colnames: - colnames.append(aa_site) - sites[aa_site] = nt_site + if nt_muts: + colnames = all_muts + sites = {mut: nt_position(mut) for mut in all_muts} + else: + colnames = [mut.split('(')[1][:-1] for mut in all_muts] + sites = {mut.split('(')[1][:-1]: nt_position(mut) for mut in all_muts} # Sort colnames colnames = sorted(colnames, key=lambda x: x.split(':')[1][1:-1]) From 6176a1e71b4bc3d35b812b659d94d46547a1aae5 Mon Sep 17 00:00:00 2001 From: dpilz2001 Date: Wed, 7 Jun 2023 10:56:08 -0700 Subject: [PATCH 3/5] add num_clusters option, update version number --- freyja/_cli.py | 12 ++--- freyja/read_analysis_tools.py | 82 +++++++++++++++++++++-------------- 2 files changed, 56 insertions(+), 38 deletions(-) diff --git a/freyja/_cli.py b/freyja/_cli.py index 85fdb424..f4628509 100644 --- a/freyja/_cli.py +++ b/freyja/_cli.py @@ -23,7 +23,7 @@ @click.group() -@click.version_option('1.4.3') +@click.version_option('1.4.4') def cli(): pass @@ -571,13 +571,15 @@ def covariants(input_bam, min_site, max_site, output, @cli.command() @click.argument('covar_file', type=click.Path(exists=True)) @click.option('--output', default='covariants_plot.pdf') +@click.option('--num_clusters', default=10, + help='number of clusters to plot (starting with greatest count)') @click.option('--min_mutations', default=1, - help='Minimum number of unique covariants to display') + help='minimum number of mutations in a cluster to be plotted') @click.option('--nt_muts', is_flag=True, - help=('if included, include nucleotide mutations in mut labels' + help=('if included, include nucleotide mutations in x-labels' )) -def plot_covariants(covar_file, output, min_mutations, nt_muts): - _plot_covariants(covar_file, output, min_mutations, nt_muts) +def plot_covariants(covar_file, output, num_clusters, min_mutations, nt_muts): + _plot_covariants(covar_file, output, num_clusters, min_mutations, nt_muts) if __name__ == '__main__': diff --git a/freyja/read_analysis_tools.py b/freyja/read_analysis_tools.py index 5d31a2c9..b17bbb16 100644 --- a/freyja/read_analysis_tools.py +++ b/freyja/read_analysis_tools.py @@ -644,41 +644,53 @@ def get_gene(locus): return df -def filter_covariants_output(cluster): +def filter_covariants_output(cluster, min_mutation_count, nt_mut): cluster_final = [] - for variant in cluster: - if ":" in variant: - if "*" in variant or "INS" in variant: - continue # Ignore stop codons and insertions - elif "DEL" in variant: - # if len(variant.split(")(")[0].split(",")) % 3 != 0: - continue # Ignore deletions - # else: - # if variant not in cluster_final: - # cluster_final.append(variant.split(")(")[1][:-1]) + + if not nt_mut: + for variant in cluster: + if '*' in variant or 'INS' in variant: + continue # Ignore stop codons and insertions for now + elif 'DEL' in variant \ + and int(variant.split(")(")[0].split(",")[1]) % 3 != 0: + continue # Ignore frameshift mutations else: - if variant not in cluster_final: - cluster_final.append(variant) # SNV - if len(cluster_final) == 0: + cluster_final.append(variant) + else: # nt covariants + for variant in cluster: + if ',' in variant: + if any([nt in variant for nt in ['A,C,G,T']]): # Insertion + continue + if int(variant.split(',')[1].split(')')[0]) % 3 != 0: + continue + cluster_final.append(variant) + + # Remove duplicates while preserving order + cluster_final = list(dict.fromkeys(cluster_final)) + + if len(cluster_final) < min_mutation_count: return pd.NA return cluster_final -def plot_covariants(covar_file, output, min_mutations, nt_muts): - # TODO - add option to plot only top X mutations +def plot_covariants(covar_file, output, num_clusters, min_mutations, nt_muts): - # Define columns (mutations in samples) covars = pd.read_csv(covar_file, sep='\t', header=0) - covars['Covariants'] = covars['Covariants'] \ - .str.split(' ') \ - .apply(filter_covariants_output) - covars = covars.dropna() + covars['Covariants'] = covars['Covariants']\ + .str.split(' ')\ + .apply(filter_covariants_output, args=(min_mutations, nt_muts))\ - # Merge rows with identical covariants and add their counts - covars = covars.groupby(covars['Covariants'].astype( - str)).agg({'Count': 'sum', 'Coverage_start': 'first', 'Coverage_end': 'first'}).reset_index() + covars[[covars['Covariants'] == 'nan']] = pd.NA + covars = covars.dropna()\ + .head(num_clusters) - covars = covars.sort_values('Count', ascending=False) + # Merge rows with identical covariants and add their counts + covars = covars.groupby(covars['Covariants'].astype(str))\ + .agg( + {'Count': 'sum', 'Coverage_start': 'first', 'Coverage_end': 'first'} + )\ + .reset_index()\ + .sort_values('Count', ascending=False) # Get all mutations found in the sample all_muts = [] @@ -687,25 +699,29 @@ def plot_covariants(covar_file, output, min_mutations, nt_muts): if mut[1:-1] not in all_muts: all_muts.append(mut[1:-1]) - patterns = [pattern.strip('][').split(', ') for pattern in covars['Covariants'].to_list()] + patterns = [pattern.strip('][').split(', ') + for pattern in covars['Covariants'].to_list()] counts = dict(zip([str(pattern) for pattern in patterns], covars['Count'].to_list())) - + coverage_start = covars['Coverage_start'] coverage_end = covars['Coverage_end'] - coverage_ranges = dict(zip([str(pattern) - for pattern in patterns], zip(coverage_start, coverage_end))) + coverage_ranges = dict(zip([str(pattern) for pattern in patterns], + zip(coverage_start, coverage_end))) if nt_muts: colnames = all_muts sites = {mut: nt_position(mut) for mut in all_muts} else: - colnames = [mut.split('(')[1][:-1] for mut in all_muts] - sites = {mut.split('(')[1][:-1]: nt_position(mut) for mut in all_muts} + colnames = [mut.split(')(')[1][:-1] if 'DEL' in mut + else mut.split('(')[1][:-1] for mut in all_muts] + sites = {mut.split(')(')[1][:-1] if 'DEL' in mut + else mut.split('(')[1][:-1]: nt_position(mut) + for mut in all_muts} + + colnames = sorted(colnames, key=lambda x: sites[x]) - # Sort colnames - colnames = sorted(colnames, key=lambda x: x.split(':')[1][1:-1]) data = {} for pattern in enumerate(patterns): if len(pattern[1]) >= min_mutations: From 16c0cd7eb577f590c55b305e0058d78713623d7b Mon Sep 17 00:00:00 2001 From: dpilz2001 Date: Wed, 7 Jun 2023 11:09:54 -0700 Subject: [PATCH 4/5] update tests, README --- README.md | 2 +- freyja/tests/test_read_analysis.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index dc129a35..1c9dfbc1 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,7 @@ This outputs to a tsv file that includes the mutations present in each set of co ``` freyja plot-covariants [covariant_file.tsv] --output [plot-filename(.pdf,.png,etc.)] ``` -This generates an informative heatmap visualization showing variants as well as coverage across different reads. Each covariant pattern is listed in the format CPx(# occurrences of pattern x). The threshold `--min_mutations` sets a minimum number of mutations per set of covariants for inclusion in the plot. The flag `--nt_muts` can be included to provide nucleotide-specific information in thx plot (e.g. C22995A(S:T478K) as opposed to S:T478K), making it possible to view multi-nucleotide variants. +This generates an informative heatmap visualization showing variants as well as coverage across different reads. Each covariant pattern is listed in the format CPx(# occurrences of pattern x). `--num_clusters` sets the number of patterns (i.e. rows in the plot) to display, while the threshold `--min_mutations` sets a minimum number of mutations per set of covariants for inclusion in the plot. The flag `--nt_muts` can be included to provide nucleotide-specific information in the plot (e.g. C22995A(S:T478K) as opposed to S:T478K), making it possible to view multi-nucleotide variants. Example output: | **Sorted by Count** | **Sorted by Site** | diff --git a/freyja/tests/test_read_analysis.py b/freyja/tests/test_read_analysis.py index 502b4331..dce42a00 100644 --- a/freyja/tests/test_read_analysis.py +++ b/freyja/tests/test_read_analysis.py @@ -166,7 +166,8 @@ def test_covariants(self): '21563', '25384', '--gff-file', 'freyja/data/NC_045512_Hu-1.gff', '--output', 'freyja/data/test_covar.tsv'] - subprocess.run(cmd) + err = subprocess.run(cmd) + print(err) df = pd.read_csv('freyja/data/test_covar.tsv', sep='\t') patterns = [] @@ -187,10 +188,10 @@ def test_covariants(self): # Test spans_region flag cmd = ['freyja', 'covariants', self.input_bam, - '23550', '23700', '--output', 'freyja/data/test_covar.tsv', + '23550', '23700', '--output', 'freyja/data/test_covar_span.tsv', '--spans_region'] subprocess.run(cmd) - df = pd.read_csv('freyja/data/test_covar.tsv', sep='\t') + df = pd.read_csv('freyja/data/test_covar_span.tsv', sep='\t') patterns = [] for c in df.iloc[:, 0]: patterns.append(c.split(' ')) @@ -202,11 +203,11 @@ def test_covariants(self): self.assertTrue(int(mut[1:6]) >= cov_start[i] and int(mut[1:6]) <= cov_end[i]) self.assertTrue(df.shape[0] == 2) - os.remove('freyja/data/test_covar.tsv') + os.remove('freyja/data/test_covar_span.tsv') - def test_plot_covariants(self): + # Test plot-covariants cmd = ['freyja', 'plot-covariants', - 'freyja/data/example_covariants0.tsv', + 'freyja/data/test_covar.tsv', '--output', 'freyja/data/test_covar_plot.png'] subprocess.run(cmd) From 518a323ac7e49a2e20e8b70c2028b7a3d7793830 Mon Sep 17 00:00:00 2001 From: dpilz2001 Date: Wed, 7 Jun 2023 13:37:54 -0700 Subject: [PATCH 5/5] fix issue with spaces in mutation strings --- freyja/read_analysis_tools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/freyja/read_analysis_tools.py b/freyja/read_analysis_tools.py index b17bbb16..2437e684 100644 --- a/freyja/read_analysis_tools.py +++ b/freyja/read_analysis_tools.py @@ -677,6 +677,7 @@ def plot_covariants(covar_file, output, num_clusters, min_mutations, nt_muts): covars = pd.read_csv(covar_file, sep='\t', header=0) covars['Covariants'] = covars['Covariants']\ + .str.replace(', ', ',')\ .str.split(' ')\ .apply(filter_covariants_output, args=(min_mutations, nt_muts))\