From 4be49829163a3878cff089753ee51ff13caa6325 Mon Sep 17 00:00:00 2001
From: dpilz2001 <dpilz@ucsd.edu>
Date: Fri, 2 Jun 2023 07:05:07 -0700
Subject: [PATCH 1/5] fixed parsing updated covariants output

---
 freyja/read_analysis_tools.py | 76 +++++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 25 deletions(-)

diff --git a/freyja/read_analysis_tools.py b/freyja/read_analysis_tools.py
index 2f348f33..8cde8110 100644
--- a/freyja/read_analysis_tools.py
+++ b/freyja/read_analysis_tools.py
@@ -539,7 +539,7 @@ def get_gene(locus):
                             f'{aa_locus+del_length-1})'
                         )
                     else:
-                        aa_mut = f'{deletion}({gene}:DEL{aa_locus})'
+                        aa_mut = f'{deletion_string}({gene}:DEL{aa_locus})'
 
                     if deletion_string not in nt_to_aa:
                         nt_to_aa[deletion_string] = aa_mut
@@ -644,32 +644,58 @@ def get_gene(locus):
     return df
 
 
+def filter_covariants_output(cluster):
+    cluster_final = []
+    for variant in cluster:
+        if ":" in variant:
+            if "*" in variant or "INS" in variant:
+                continue  # Ignore stop codons and insertions
+            elif "DEL" in variant:
+                # if len(variant.split(")(")[0].split(",")) % 3 != 0:
+                continue  # Ignore deletions
+                # else:
+                #     if variant not in cluster_final:
+                #         cluster_final.append(variant.split(")(")[1][:-1])
+            else:
+                if variant not in cluster_final:
+                    cluster_final.append(variant)  # SNV
+    if len(cluster_final) == 0:
+        return pd.NA
+    return cluster_final
+
+
 def plot_covariants(covar_file, output, min_mutations, nt_muts):
 
     # Define columns (mutations in samples)
     covars = pd.read_csv(covar_file, sep='\t', header=0)
+    covars['Covariants'] = covars['Covariants'] \
+        .str.split(' ') \
+        .apply(filter_covariants_output)
+    covars = covars.dropna()
+
+    # Merge rows with identical covariants and add their counts
+    covars = covars.groupby(covars['Covariants'].astype(
+        str)).agg({'Count': 'sum', 'Coverage_start': 'first', 'Coverage_end': 'first'}).reset_index()
+
+    covars = covars.sort_values('Count', ascending=False)
 
-    # Get covariants and unique mutations
+    # Get all mutations found in the sample
     all_nt_muts = []
-    patterns = []
-    for c in covars.iloc[:, 0]:
-        sample = c.split(' ')
-        for mut in sample:
-            if mut not in all_nt_muts:
-                all_nt_muts.append(mut)
-        patterns.append(sample)
-    if ':' not in all_nt_muts[0]:
-        nt_muts = True
-
-    coverage_start = [int(i) for i in covars.iloc[:, 2]]
-    coverage_end = [int(i) for i in covars.iloc[:, 3]]
-    coverage_ranges = {str(patterns[i]): (
-        coverage_start[i], coverage_end[i]) for i in range(len(patterns))}
-
-    counts = {str(patterns[i]): covars.iloc[:, 1][i]
-              for i in range(len(patterns))}
-
-    all_nt_muts = sorted(all_nt_muts, key=nt_position)
+    for sublist in covars['Covariants'].to_list():
+        for mut in sublist.strip('][').split(', '):
+            if mut[1:-1] not in all_nt_muts:
+                all_nt_muts.append(mut[1:-1])
+
+    patterns = covars['Covariants'].to_list()
+    patterns = [pattern.strip('][').split(', ') for pattern in patterns]
+    counts = dict(zip([str(pattern)
+                  for pattern in patterns], covars['Count'].to_list()))
+    coverage_start = covars['Coverage_start']
+    coverage_end = covars['Coverage_end']
+
+    coverage_ranges = dict(zip([str(pattern)
+                                for pattern in patterns], zip(coverage_start, coverage_end)))
+
     colnames = []
     sites = {}
     for mut in all_nt_muts:  # remove duplicates
@@ -677,13 +703,13 @@ def plot_covariants(covar_file, output, min_mutations, nt_muts):
         if nt_muts:
             aa_site = mut
         else:
-            if ',' in mut:
-                aa_site = mut.split(')(')[1][:-1]
-            else:
-                aa_site = mut.split('(')[1][:-1]
+            aa_site = mut.split('(')[1][:-1]
         if aa_site not in colnames:
             colnames.append(aa_site)
             sites[aa_site] = nt_site
+
+    # Sort colnames
+    colnames = sorted(colnames, key=lambda x: x.split(':')[1][1:-1])
     data = {}
     for pattern in enumerate(patterns):
         if len(pattern[1]) >= min_mutations:

From 3863bb57b2676cbc4bbd85b6ec3a2e9d35c42c61 Mon Sep 17 00:00:00 2001
From: dpilz2001 <dpilz@ucsd.edu>
Date: Fri, 2 Jun 2023 12:21:12 -0700
Subject: [PATCH 2/5] refactor getting colnames/sites dict

---
 freyja/read_analysis_tools.py | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/freyja/read_analysis_tools.py b/freyja/read_analysis_tools.py
index 8cde8110..5d31a2c9 100644
--- a/freyja/read_analysis_tools.py
+++ b/freyja/read_analysis_tools.py
@@ -665,6 +665,7 @@ def filter_covariants_output(cluster):
 
 
 def plot_covariants(covar_file, output, min_mutations, nt_muts):
+    # TODO - add option to plot only top X mutations
 
     # Define columns (mutations in samples)
     covars = pd.read_csv(covar_file, sep='\t', header=0)
@@ -680,33 +681,28 @@ def plot_covariants(covar_file, output, min_mutations, nt_muts):
     covars = covars.sort_values('Count', ascending=False)
 
     # Get all mutations found in the sample
-    all_nt_muts = []
+    all_muts = []
     for sublist in covars['Covariants'].to_list():
         for mut in sublist.strip('][').split(', '):
-            if mut[1:-1] not in all_nt_muts:
-                all_nt_muts.append(mut[1:-1])
+            if mut[1:-1] not in all_muts:
+                all_muts.append(mut[1:-1])
+
+    patterns = [pattern.strip('][').split(', ') for pattern in covars['Covariants'].to_list()]
 
-    patterns = covars['Covariants'].to_list()
-    patterns = [pattern.strip('][').split(', ') for pattern in patterns]
     counts = dict(zip([str(pattern)
                   for pattern in patterns], covars['Count'].to_list()))
+    
     coverage_start = covars['Coverage_start']
     coverage_end = covars['Coverage_end']
-
     coverage_ranges = dict(zip([str(pattern)
                                 for pattern in patterns], zip(coverage_start, coverage_end)))
 
-    colnames = []
-    sites = {}
-    for mut in all_nt_muts:  # remove duplicates
-        nt_site = nt_position(mut)
-        if nt_muts:
-            aa_site = mut
-        else:
-            aa_site = mut.split('(')[1][:-1]
-        if aa_site not in colnames:
-            colnames.append(aa_site)
-            sites[aa_site] = nt_site
+    if nt_muts:
+        colnames = all_muts
+        sites = {mut: nt_position(mut) for mut in all_muts}
+    else:
+        colnames = [mut.split('(')[1][:-1] for mut in all_muts]
+        sites = {mut.split('(')[1][:-1]: nt_position(mut) for mut in all_muts}
 
     # Sort colnames
     colnames = sorted(colnames, key=lambda x: x.split(':')[1][1:-1])

From 6176a1e71b4bc3d35b812b659d94d46547a1aae5 Mon Sep 17 00:00:00 2001
From: dpilz2001 <dpilz@ucsd.edu>
Date: Wed, 7 Jun 2023 10:56:08 -0700
Subject: [PATCH 3/5] add num_clusters option, update version number

---
 freyja/_cli.py                | 12 ++---
 freyja/read_analysis_tools.py | 82 +++++++++++++++++++++--------------
 2 files changed, 56 insertions(+), 38 deletions(-)

diff --git a/freyja/_cli.py b/freyja/_cli.py
index 85fdb424..f4628509 100644
--- a/freyja/_cli.py
+++ b/freyja/_cli.py
@@ -23,7 +23,7 @@
 
 
 @click.group()
-@click.version_option('1.4.3')
+@click.version_option('1.4.4')
 def cli():
     pass
 
@@ -571,13 +571,15 @@ def covariants(input_bam, min_site, max_site, output,
 @cli.command()
 @click.argument('covar_file', type=click.Path(exists=True))
 @click.option('--output', default='covariants_plot.pdf')
+@click.option('--num_clusters', default=10,
+              help='number of clusters to plot (starting with greatest count)')
 @click.option('--min_mutations', default=1,
-              help='Minimum number of unique covariants to display')
+              help='minimum number of mutations in a cluster to be plotted')
 @click.option('--nt_muts', is_flag=True,
-              help=('if included, include nucleotide mutations in mut labels'
+              help=('if included, include nucleotide mutations in x-labels'
                     ))
-def plot_covariants(covar_file, output, min_mutations, nt_muts):
-    _plot_covariants(covar_file, output, min_mutations, nt_muts)
+def plot_covariants(covar_file, output, num_clusters, min_mutations, nt_muts):
+    _plot_covariants(covar_file, output, num_clusters, min_mutations, nt_muts)
 
 
 if __name__ == '__main__':
diff --git a/freyja/read_analysis_tools.py b/freyja/read_analysis_tools.py
index 5d31a2c9..b17bbb16 100644
--- a/freyja/read_analysis_tools.py
+++ b/freyja/read_analysis_tools.py
@@ -644,41 +644,53 @@ def get_gene(locus):
     return df
 
 
-def filter_covariants_output(cluster):
+def filter_covariants_output(cluster, min_mutation_count, nt_mut):
     cluster_final = []
-    for variant in cluster:
-        if ":" in variant:
-            if "*" in variant or "INS" in variant:
-                continue  # Ignore stop codons and insertions
-            elif "DEL" in variant:
-                # if len(variant.split(")(")[0].split(",")) % 3 != 0:
-                continue  # Ignore deletions
-                # else:
-                #     if variant not in cluster_final:
-                #         cluster_final.append(variant.split(")(")[1][:-1])
+
+    if not nt_mut:
+        for variant in cluster:
+            if '*' in variant or 'INS' in variant:
+                continue  # Ignore stop codons and insertions for now
+            elif 'DEL' in variant \
+                    and int(variant.split(")(")[0].split(",")[1]) % 3 != 0:
+                continue  # Ignore frameshift mutations
             else:
-                if variant not in cluster_final:
-                    cluster_final.append(variant)  # SNV
-    if len(cluster_final) == 0:
+                cluster_final.append(variant)
+    else:  # nt covariants
+        for variant in cluster:
+            if ',' in variant:
+                if any([nt in variant for nt in ['A,C,G,T']]):  # Insertion
+                    continue
+                if int(variant.split(',')[1].split(')')[0]) % 3 != 0:
+                    continue
+            cluster_final.append(variant)
+
+    # Remove duplicates while preserving order
+    cluster_final = list(dict.fromkeys(cluster_final))
+
+    if len(cluster_final) < min_mutation_count:
         return pd.NA
     return cluster_final
 
 
-def plot_covariants(covar_file, output, min_mutations, nt_muts):
-    # TODO - add option to plot only top X mutations
+def plot_covariants(covar_file, output, num_clusters, min_mutations, nt_muts):
 
-    # Define columns (mutations in samples)
     covars = pd.read_csv(covar_file, sep='\t', header=0)
-    covars['Covariants'] = covars['Covariants'] \
-        .str.split(' ') \
-        .apply(filter_covariants_output)
-    covars = covars.dropna()
+    covars['Covariants'] = covars['Covariants']\
+        .str.split(' ')\
+        .apply(filter_covariants_output, args=(min_mutations, nt_muts))\
 
-    # Merge rows with identical covariants and add their counts
-    covars = covars.groupby(covars['Covariants'].astype(
-        str)).agg({'Count': 'sum', 'Coverage_start': 'first', 'Coverage_end': 'first'}).reset_index()
+    covars[[covars['Covariants'] == 'nan']] = pd.NA
+    covars = covars.dropna()\
+                   .head(num_clusters)
 
-    covars = covars.sort_values('Count', ascending=False)
+    # Merge rows with identical covariants and add their counts
+    covars = covars.groupby(covars['Covariants'].astype(str))\
+        .agg(
+        {'Count': 'sum', 'Coverage_start': 'first', 'Coverage_end': 'first'}
+    )\
+        .reset_index()\
+        .sort_values('Count', ascending=False)
 
     # Get all mutations found in the sample
     all_muts = []
@@ -687,25 +699,29 @@ def plot_covariants(covar_file, output, min_mutations, nt_muts):
             if mut[1:-1] not in all_muts:
                 all_muts.append(mut[1:-1])
 
-    patterns = [pattern.strip('][').split(', ') for pattern in covars['Covariants'].to_list()]
+    patterns = [pattern.strip('][').split(', ')
+                for pattern in covars['Covariants'].to_list()]
 
     counts = dict(zip([str(pattern)
                   for pattern in patterns], covars['Count'].to_list()))
-    
+
     coverage_start = covars['Coverage_start']
     coverage_end = covars['Coverage_end']
-    coverage_ranges = dict(zip([str(pattern)
-                                for pattern in patterns], zip(coverage_start, coverage_end)))
+    coverage_ranges = dict(zip([str(pattern) for pattern in patterns],
+                               zip(coverage_start, coverage_end)))
 
     if nt_muts:
         colnames = all_muts
         sites = {mut: nt_position(mut) for mut in all_muts}
     else:
-        colnames = [mut.split('(')[1][:-1] for mut in all_muts]
-        sites = {mut.split('(')[1][:-1]: nt_position(mut) for mut in all_muts}
+        colnames = [mut.split(')(')[1][:-1] if 'DEL' in mut
+                    else mut.split('(')[1][:-1] for mut in all_muts]
+        sites = {mut.split(')(')[1][:-1] if 'DEL' in mut
+                 else mut.split('(')[1][:-1]: nt_position(mut)
+                 for mut in all_muts}
+
+    colnames = sorted(colnames, key=lambda x: sites[x])
 
-    # Sort colnames
-    colnames = sorted(colnames, key=lambda x: x.split(':')[1][1:-1])
     data = {}
     for pattern in enumerate(patterns):
         if len(pattern[1]) >= min_mutations:

From 16c0cd7eb577f590c55b305e0058d78713623d7b Mon Sep 17 00:00:00 2001
From: dpilz2001 <dpilz@ucsd.edu>
Date: Wed, 7 Jun 2023 11:09:54 -0700
Subject: [PATCH 4/5] update tests, README

---
 README.md                          |  2 +-
 freyja/tests/test_read_analysis.py | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index dc129a35..1c9dfbc1 100644
--- a/README.md
+++ b/README.md
@@ -149,7 +149,7 @@ This outputs to a tsv file that includes the mutations present in each set of co
 ```
 freyja plot-covariants [covariant_file.tsv] --output [plot-filename(.pdf,.png,etc.)]
 ```
-This generates an informative heatmap visualization showing variants as well as coverage across different reads. Each covariant pattern is listed in the format CPx(# occurrences of pattern x). The threshold `--min_mutations` sets a minimum number of mutations per set of covariants for inclusion in the plot. The flag `--nt_muts` can be included to provide nucleotide-specific information in thx plot (e.g. C22995A(S:T478K) as opposed to S:T478K), making it possible to view multi-nucleotide variants.
+This generates an informative heatmap visualization showing variants as well as coverage across different reads. Each covariant pattern is listed in the format CPx(# occurrences of pattern x). `--num_clusters` sets the number of patterns (i.e. rows in the plot) to display, while the threshold `--min_mutations` sets a minimum number of mutations per set of covariants for inclusion in the plot. The flag `--nt_muts` can be included to provide nucleotide-specific information in the plot (e.g. C22995A(S:T478K) as opposed to S:T478K), making it possible to view multi-nucleotide variants.
 
 Example output: 
 | **Sorted by Count** | **Sorted by Site** |
diff --git a/freyja/tests/test_read_analysis.py b/freyja/tests/test_read_analysis.py
index 502b4331..dce42a00 100644
--- a/freyja/tests/test_read_analysis.py
+++ b/freyja/tests/test_read_analysis.py
@@ -166,7 +166,8 @@ def test_covariants(self):
                '21563', '25384', '--gff-file',
                'freyja/data/NC_045512_Hu-1.gff', '--output',
                'freyja/data/test_covar.tsv']
-        subprocess.run(cmd)
+        err = subprocess.run(cmd)
+        print(err)
         df = pd.read_csv('freyja/data/test_covar.tsv', sep='\t')
 
         patterns = []
@@ -187,10 +188,10 @@ def test_covariants(self):
 
         # Test spans_region flag
         cmd = ['freyja', 'covariants', self.input_bam,
-               '23550', '23700', '--output', 'freyja/data/test_covar.tsv',
+               '23550', '23700', '--output', 'freyja/data/test_covar_span.tsv',
                '--spans_region']
         subprocess.run(cmd)
-        df = pd.read_csv('freyja/data/test_covar.tsv', sep='\t')
+        df = pd.read_csv('freyja/data/test_covar_span.tsv', sep='\t')
         patterns = []
         for c in df.iloc[:, 0]:
             patterns.append(c.split(' '))
@@ -202,11 +203,11 @@ def test_covariants(self):
                 self.assertTrue(int(mut[1:6]) >= cov_start[i] and
                                 int(mut[1:6]) <= cov_end[i])
         self.assertTrue(df.shape[0] == 2)
-        os.remove('freyja/data/test_covar.tsv')
+        os.remove('freyja/data/test_covar_span.tsv')
 
-    def test_plot_covariants(self):
+        # Test plot-covariants
         cmd = ['freyja', 'plot-covariants',
-               'freyja/data/example_covariants0.tsv',
+               'freyja/data/test_covar.tsv',
                '--output', 'freyja/data/test_covar_plot.png']
         subprocess.run(cmd)
 

From 518a323ac7e49a2e20e8b70c2028b7a3d7793830 Mon Sep 17 00:00:00 2001
From: dpilz2001 <dpilz@ucsd.edu>
Date: Wed, 7 Jun 2023 13:37:54 -0700
Subject: [PATCH 5/5] fix issue with spaces in mutation strings

---
 freyja/read_analysis_tools.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/freyja/read_analysis_tools.py b/freyja/read_analysis_tools.py
index b17bbb16..2437e684 100644
--- a/freyja/read_analysis_tools.py
+++ b/freyja/read_analysis_tools.py
@@ -677,6 +677,7 @@ def plot_covariants(covar_file, output, num_clusters, min_mutations, nt_muts):
 
     covars = pd.read_csv(covar_file, sep='\t', header=0)
     covars['Covariants'] = covars['Covariants']\
+        .str.replace(', ', ',')\
         .str.split(' ')\
         .apply(filter_covariants_output, args=(min_mutations, nt_muts))\