Merge pull request #358 from ConesaLab/SQANTI-reads-devel

Fixing stuff for paper revision
ConesaLab · Nov 26, 2024 · 5c6544a · 5c6544a
2 parents 705ed94 + 2b77897
commit 5c6544a
Show file tree

Hide file tree

Showing 2 changed files with 74 additions and 12 deletions.
diff --git a/sqanti_reads.py b/sqanti_reads.py
@@ -183,6 +183,7 @@ def main():
     parser.add_argument('-fl','--factor_level', type=str, dest="FACTORLVL", required=False, help='Factor level to evaluate for underannotation', default = None)
     parser.add_argument('--all_tables', dest="ALLTABLES", action='store_true', help='Export all output tables. Default tables are gene counts, ujc counts, length_summary, cv and and underannotated gene tables')
     parser.add_argument('--pca_tables', dest="PCATABLES", action='store_true', help='Export table for making PCA plots')
+    parser.add_argument('--report', type=str, choices = ["pdf", "html", "both"], default = 'pdf', help = "\t\tDefault: pdf")
     parser.add_argument('--verbose', help = 'If verbose is run, it will print all steps, by default it is FALSE', action="store_true")
     parser.add_argument('-v', '--version', help="Display program version number.", action='version', version='sqanti-reads '+str(__version__))
 
@@ -200,7 +201,7 @@ def main():
     # Run plotting script
     plotting_script_path = os.path.join(os.path.dirname(__file__), 'utilities', 'sqanti_reads_tables_and_plots_02ndk.py')
 
-    cmd_plotting = f"python {plotting_script_path} --ref {args.annotation} --design {args.inDESIGN} -o {args.dir} --gene-expression {args.ANNOTEXP} --jxn-expression {args.JXNEXP} --perc-coverage {args.PERCCOV} --perc-junctions {args.PERCMAXJXN}"
+    cmd_plotting = f"python {plotting_script_path} --ref {args.annotation} --design {args.inDESIGN} -o {args.dir} --gene-expression {args.ANNOTEXP} --jxn-expression {args.JXNEXP} --perc-coverage {args.PERCCOV} --perc-junctions {args.PERCMAXJXN} --report {args.report}"
     if args.inFACTOR:
         cmd_plotting = cmd_plotting + f" --factor {args.inFACTOR}"
     if args.FACTORLVL != None:

diff --git a/utilities/sqanti_reads_tables_and_plots_02ndk.py b/utilities/sqanti_reads_tables_and_plots_02ndk.py
@@ -19,6 +19,10 @@
 from sklearn.decomposition import PCA
 from scipy.cluster.hierarchy import linkage, leaves_list
 from matplotlib.ticker import FixedLocator
+from pdf2image import convert_from_path
+import base64
+from jinja2 import Template
+import io
 
 ## Update options
 def getOptions():
@@ -37,6 +41,7 @@ def getOptions():
     parser.add_argument('-fl','--factor-level', type=str, dest="FACTORLVL", required=False, help='Factor level to evaluate for underannotation', default = None)
     parser.add_argument('--all-tables', dest="ALLTABLES", action='store_true', help='Export all output tables. Default tables are gene counts, ujc counts, length_summary, cv and cand underannotated gene tables')
     parser.add_argument('--pca-tables', dest="PCATABLES", action='store_true', help='Export table for making PCA plots')
+    parser.add_argument('--report', type=str, choices = ["pdf", "html", "both"], default = 'pdf', help = "\t\tDefault: pdf")
 
     args = parser.parse_args()
     return args
@@ -754,7 +759,7 @@ def categorize_gene(group):
         #plt.title('Number of Genes in each annotation category')
         plt.xlabel('Gene Category')
         plt.ylabel('Number of Genes')
-        plt.xticks(rotation=45, ha="right")
+        plt.xticks(rotation=90, ha="right")
         plt.gca().spines['top'].set_visible(False)
         plt.gca().spines['right'].set_visible(False)
         # Add the counts on top of each bar
@@ -1281,6 +1286,8 @@ def plot_side_by_side_bars(*args, **kwargs):
     "mono_in_multi": '#aec6cf'
     }
 
+    cat_order = ["FSM", "ISM", "NIC", "NNC", "AS", "FUS", "GENIC", "GI", "INTER"]
+
     #Define sample color palette
 
     unique_sampleIDs = all_gene_percs_long_DF['sampleID'].unique()
@@ -1313,7 +1320,8 @@ def plot_side_by_side_bars(*args, **kwargs):
             alpha=0.6, 
             legend=True,
             height=8, 
-            aspect=0.7 
+            aspect=0.7,
+            order = cat_order
         )
         g.set_xticklabels(rotation=0)
         g.set_axis_labels("Structural Category", "Percentages")
@@ -1346,7 +1354,8 @@ def plot_side_by_side_bars(*args, **kwargs):
             alpha=0.6, 
             legend=True,
             height=8, 
-            aspect=0.7 
+            aspect=0.7,
+            order = cat_order
         )
         g.set_xticklabels(rotation=0)
         g.set_axis_labels("Structural Category", "Percentages")
@@ -1964,6 +1973,9 @@ def plot_pdf(out_path, all_gene_percs_long_DF, annot_gene_percs_long_DF, all_gen
     "mono_in_multi": '#aec6cf'
     }
 
+    cat_order = ["FSM", "ISM", "NIC", "NNC", "AS", "FUS", "GENIC", "GI", "INTER"]
+    cat_order_stacked = ["INTER", "GI", "GENIC", "FUS", "AS", "NNC", "NIC", "ISM", "FSM"]
+
     #Define sample color palette
 
     unique_sampleIDs = all_gene_percs_long_DF['sampleID'].unique()
@@ -1984,8 +1996,8 @@ def plot_pdf(out_path, all_gene_percs_long_DF, annot_gene_percs_long_DF, all_gen
         palette = sample_color_palette
         plt.figure(figsize=(10, 7.5))
         sns.stripplot(x='Category', y='Percentage', data=all_gene_percs_long_DF, jitter=0, 
-                      alpha=0.6, hue='sampleID', dodge=False, **{'linewidth': 0, 's': 20}, palette = palette)
-        plt.xticks(rotation=45)
+                      alpha=0.6, hue='sampleID', dodge=False, **{'linewidth': 0, 's': 20}, palette = palette, order = cat_order)
+        plt.xticks(rotation=90)
         plt.ylabel('Percentage')
         plt.title(' Percent reads in each structural category - All Genes')
         plt.legend(title='SampleID', bbox_to_anchor=(1.05, 1), loc='upper left')
@@ -1996,17 +2008,18 @@ def plot_pdf(out_path, all_gene_percs_long_DF, annot_gene_percs_long_DF, all_gen
 
         plt.figure(figsize=(10, 7.5))
         sns.stripplot(x='Category', y='Percentage', data=annot_gene_percs_long_DF, jitter=0, 
-                      alpha=0.6, hue='sampleID', dodge=False, **{'linewidth': 0, 's': 20}, palette = palette)
-        plt.xticks(rotation=45)
+                      alpha=0.6, hue='sampleID', dodge=False, **{'linewidth': 0, 's': 20}, palette = palette, order = cat_order)
+        plt.xticks(rotation=90)
         plt.ylabel('Percentage')
-        plt.title(' Percent reads in each structural category - ANnotated Gnes')
+        plt.title(' Percent reads in each structural category - Annotated Genes')
         plt.legend(title='SampleID', bbox_to_anchor=(1.05, 1), loc='upper left')
         plt.tight_layout()
         matplotlib.rcParams['pdf.fonttype'] = 42
         pdf.savefig()
         plt.close()
 
         categories = [col for col in all_gene_percs_pivot_DF.columns if col not in ['sampleID', exp_factor]]
+        categories = [cat for cat in cat_order_stacked if cat in categories]
 
         cols = ['sampleID'] + categories
         all_gene_percs_pivot_DF = all_gene_percs_pivot_DF[cols]
@@ -2032,7 +2045,8 @@ def plot_pdf(out_path, all_gene_percs_long_DF, annot_gene_percs_long_DF, all_gen
 
 
         categories = [cat for cat in ['FSM', 'ISM', 'NIC', 'NNC', 'GI', 'GENIC'] if cat in annot_gene_percs_pivot_DF.columns]
-        palette = [category_color_palette[cat] for cat in categories]  
+        categories = [cat for cat in cat_order_stacked if cat in categories]
+        palette = [category_color_palette[cat] for cat in categories]
 
         annot_gene_percs_pivot_DF = annot_gene_percs_pivot_DF.sort_values(by= 'sampleID')
         plt.figure(figsize=(10, 7.5))
@@ -2189,7 +2203,7 @@ def plot_pdf(out_path, all_gene_percs_long_DF, annot_gene_percs_long_DF, all_gen
              sns.boxplot(x='sampleID', y='percentage', hue='sampleID' ,data=melted_annotated_gene_DF[melted_annotated_gene_DF['category'] == category],
                     palette=sample_color_palette)
              plt.title(f'Gene distribution - {category}')
-             plt.xticks(rotation=45)
+             plt.xticks(rotation=90)
              plt.tight_layout()
              matplotlib.rcParams['pdf.fonttype'] = 42
              pdf.savefig()
@@ -2307,7 +2321,7 @@ def plot_pdf(out_path, all_gene_percs_long_DF, annot_gene_percs_long_DF, all_gen
         plt.close()
 
         #Violin plots
-        sns.violinplot(x='sampleID', y='length', data=length_DF2, palette = sample_color_palette)
+        sns.violinplot(x='sampleID', y='length', data=length_DF2, palette = sample_color_palette, legend = False, hue = "sampleID")
         plt.xlabel('Sample ID')
         plt.ylabel('Length')
         plt.title('Read Length Distribution')
@@ -2508,6 +2522,44 @@ def plot_pdf(out_path, all_gene_percs_long_DF, annot_gene_percs_long_DF, all_gen
         matplotlib.rcParams['pdf.fonttype'] = 42
         pdf.savefig()
         plt.close()
+
+def makeHTML(drty, prefx, sufx):
+    pages = convert_from_path(drty + prefx + sufx)
+    # Encode each page as a base64 string
+    encoded_images = []
+    for page in pages:
+        buffer = io.BytesIO()
+        page.save(buffer, format="PNG")
+        buffer.seek(0)
+        encoded_images.append(base64.b64encode(buffer.read()).decode("utf-8"))
+
+    # HTML template for embedding images
+    html_template = """
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <title>PDF to HTML</title>
+    </head>
+    <body>
+        <h1>PDF Report</h1>
+        {% for img in images %}
+        <div>
+            <img src="data:image/png;base64,{{ img }}" style="width: 100%; height: auto;">
+        </div>
+        {% endfor %}
+    </body>
+    </html>
+    """
+
+    # Render HTML with images
+    template = Template(html_template)
+    html_content = template.render(images=encoded_images)
+
+    # Save to HTML file
+    with open(drty + prefx + sufx[:-4] + ".html", "w") as f:
+        f.write(html_content)
+
+    print(f"HTML report saved as {drty + prefx + sufx[:-4]}.html")
 
 
 def main():
@@ -2525,6 +2577,15 @@ def main():
     else:
         plot_pdf_by_factor(os.path.join(args.OUT, args.PREFIX + '_plots.pdf'), *dfs_for_plotting)
 
+    if args.report in ("both", "html"):
+        makeHTML(args.OUT, args.PREFIX, '_plots.pdf')
+        makeHTML(args.OUT, args.PREFIX, '_annotation_plots.pdf')
+
+    if args.report == "html":
+        os.remove(f"{args.OUT}/{args.PREFIX}_plots.pdf")
+        os.remove(f"{args.OUT}/{args.PREFIX}_annotation_plots.pdf")
+
+
 if __name__ == '__main__':
     #Parse command line arguments
     global args