diff --git a/Dockerfile b/Dockerfile index fbd7494..8b38228 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,11 +16,11 @@ RUN tar -xf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz RUN conda env create -f environment.yml # Activate the environment -RUN echo "conda activate lrst_py39" >> ~/.bashrc +RUN echo "conda activate longreadsum" >> ~/.bashrc SHELL ["/bin/bash", "--login", "-c"] # Ensure the correct environment is being used -RUN export PATH="/opt/conda/envs/lrst_py39/bin/python" +RUN export PATH="/opt/conda/envs/longreadsum/bin/python" RUN which python # Build LongReadSum @@ -30,4 +30,4 @@ RUN make ENV HDF5_PLUGIN_PATH="/longreadsum/lib/" # The code to run when container is started: -ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "lrst_py39", "python", "/app/longreadsum"] +ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "longreadsum", "python", "/app/longreadsum"] diff --git a/environment.yml b/environment.yml index 236fa4e..628347f 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: lrst_py39 +name: longreadsum channels: - bioconda - anaconda @@ -9,12 +9,8 @@ dependencies: - hdf5 - htslib - swig - - matplotlib - - plotly=4.14 + - plotly - pytest - - pip - - pip: - - kaleido # conda env create --file=environment.yml -# conda activate lrst_py39 +# conda activate longreadsum diff --git a/src/bam_plot.py b/src/bam_plot.py deleted file mode 100644 index e438bfa..0000000 --- a/src/bam_plot.py +++ /dev/null @@ -1,124 +0,0 @@ -""" -plot_for_BAM.py: -Use the formatted statistics from our C++ module output text files to generate summary plots in image format. -""" - -if __package__ == 'src': - from src.plot_utils import * -else: - from plot_utils import * - - -def plot_alignment_numbers(data, path): - fig, axes = plt.subplots(figsize=(8, 6)) - - numbers_list = [[data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment, - data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment, - data.num_reads_with_both_secondary_supplementary_alignment, data.forward_alignment, - data.reverse_alignment]] - - category = ['Primary Alignments', 'Supplementary Alignments', 'Secondary Alignments', - 'Reads with Supplementary Alignments', 'Reads with Secondary Alignments', - 'Reads with Secondary and Supplementary Alignments', 'Forward Alignments', 'Reverse Alignments'] - category = [wrap(x) for x in category] - - category_list = itertools.cycle([category]) - xlabel_list = itertools.cycle(['Counts']) - ylabel_list = itertools.cycle(['']) - subtitle_list = [None] - bar_plot(fig, numbers_list, category_list, xlabel_list, ylabel_list, subtitle_list, path, orientation='h') - - -def plot_errors(bam_output, path): - fig, axes = plt.subplots(1, 1, figsize=(8, 6)) - - numbers_list = [[bam_output.num_matched_bases, bam_output.num_mismatched_bases, bam_output.num_ins_bases, - bam_output.num_del_bases, bam_output.num_clip_bases]] - - category = ['Matched Bases', 'Mismatched Bases', 'Inserted Bases', 'Deleted Bases', 'Clipped Bases'] - category = [wrap(x) for x in category] - - category_list = itertools.cycle([category]) - xlabel_list = itertools.cycle(['Counts']) - ylabel_list = itertools.cycle([None]) - subtitle_list = [None] - bar_plot(fig, numbers_list, category_list, xlabel_list, ylabel_list, subtitle_list, path, orientation='h') - - -def create_summary_table(bam_output, plot_filepaths): - plot_filepaths["basic_st"] = {} - plot_filepaths["basic_st"]['file'] = "" - plot_filepaths["basic_st"]['title'] = "Basic Statistics" - plot_filepaths["basic_st"]['description'] = "BAM: Basic Statistics" - - table_str = "\n\n\n" - table_str += "\n" - int_str_for_format = " " - double_str_for_format = " " - table_str += int_str_for_format.format("#Total Reads", bam_output.mapped_long_read_info.total_num_reads, - bam_output.unmapped_long_read_info.total_num_reads, - bam_output.long_read_info.total_num_reads) - table_str += int_str_for_format.format("#Total Bases", - bam_output.mapped_long_read_info.total_num_bases, - bam_output.unmapped_long_read_info.total_num_bases, - bam_output.long_read_info.total_num_bases) - table_str += int_str_for_format.format("Longest Read Length", - bam_output.mapped_long_read_info.longest_read_length, - bam_output.unmapped_long_read_info.longest_read_length, - bam_output.long_read_info.longest_read_length) - table_str += int_str_for_format.format("N50", - bam_output.mapped_long_read_info.n50_read_length, - bam_output.unmapped_long_read_info.n50_read_length, - bam_output.long_read_info.n50_read_length) - table_str += double_str_for_format.format("GC Content(%)", - bam_output.mapped_long_read_info.gc_cnt * 100, - bam_output.unmapped_long_read_info.gc_cnt * 100, - bam_output.long_read_info.gc_cnt * 100) - table_str += double_str_for_format.format("Mean Read Length", - bam_output.mapped_long_read_info.mean_read_length, - bam_output.unmapped_long_read_info.mean_read_length, - bam_output.long_read_info.mean_read_length) - table_str += int_str_for_format.format("Median Read Length", - bam_output.mapped_long_read_info.median_read_length, - bam_output.unmapped_long_read_info.median_read_length, - bam_output.long_read_info.median_read_length) - table_str += "\n\n
MeasurementMappedUnmappedAll
{}{:,d}{:," \ - "d}{:,d}
{}{:.1f}{:.1f}{:.1f}
" - - plot_filepaths["basic_st"]['detail'] = table_str - - -def plot(bam_output, para_dict): - out_path = para_dict["output_folder"] - plot_filepaths = getDefaultPlotFilenames() - get_image_path = lambda x: os.path.join(out_path, plot_filepaths[x]['file']) - - # Set the default matplotlib font size - setDefaultFontSize(12) - - # Get the font size for plotly plots - font_size = para_dict["fontsize"] - - # Create the summary table - create_summary_table(bam_output, plot_filepaths) - - # Generate plots - plot_alignment_numbers(bam_output, get_image_path('map_st')) - plot_errors(bam_output, get_image_path('err_st')) - - plot_read_length_stats( - [bam_output.long_read_info, bam_output.mapped_long_read_info, bam_output.unmapped_long_read_info], - get_image_path('read_length_st'), subtitles=['All Reads', 'Mapped Reads', 'Unmapped Reads']) - plot_base_counts([bam_output.long_read_info, bam_output.mapped_long_read_info, bam_output.unmapped_long_read_info], - get_image_path('base_st'), subtitles=['All Reads', 'Mapped Reads', 'Unmapped Reads']) - plot_basic_info([bam_output.long_read_info, bam_output.mapped_long_read_info, bam_output.unmapped_long_read_info], - get_image_path('basic_info'), categories=['All Reads', 'Mapped Reads', 'Unmapped Reads']) - - plot_filepaths['read_length_hist']['dynamic'] = read_lengths_histogram(bam_output.long_read_info, - get_image_path('read_length_hist'), - font_size) - plot_filepaths['base_quality']['dynamic'] = base_quality(bam_output.seq_quality_info, - get_image_path('base_quality'), font_size) - - return plot_filepaths diff --git a/src/cli.py b/src/cli.py index 6061e4f..9004931 100644 --- a/src/cli.py +++ b/src/cli.py @@ -85,9 +85,6 @@ def get_common_param(margs): try: if not os.path.isdir(output_dir): os.makedirs(output_dir) - if not os.path.isdir(output_dir + '/' + getDefaultImageFolder()): - os.makedirs(output_dir + '/' + - getDefaultImageFolder()) except OSError as e: this_error_str += "Cannot create folder for " + \ @@ -164,12 +161,11 @@ def fq_module(margs): if exit_code == 0: logging.info("QC generated.") logging.info("Generating HTML report...") - plot_filepaths = create_base_quality_plots(fq_output, param_dict, "FASTQ: Basic statistics") - for static in [True, False]: - fq_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st", "read_length_hist", "base_st", "basic_info", "base_quality", - "read_avg_base_quality"], "FASTQ QC", param_dict], plot_filepaths, static=static) - fq_html_gen.generate_st_html() + plot_filepaths = plot(fq_output, param_dict, 'FASTQ') + fq_html_gen = generate_html.ST_HTML_Generator( + [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "base_quality", + "read_avg_base_quality"], "FASTQ QC", param_dict], plot_filepaths, static=False) + fq_html_gen.generate_st_html() logging.info("Completed.") else: @@ -208,15 +204,11 @@ def fa_module(margs): if exit_code == 0: logging.info("QC generated.") logging.info("Generating HTML report...") - from src import fasta_plot - plot_filepaths = fasta_plot.plot(fa_output, param_dict) - - # TODO: Unused 'static' variable results in redundant function call - for static in [True, False]: - fa_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st", "read_length_hist", "base_st", "basic_info"], "FASTA QC", - param_dict], plot_filepaths, static=True) - fa_html_gen.generate_st_html() + plot_filepaths = plot(fa_output, param_dict, 'FASTA') + fa_html_gen = generate_html.ST_HTML_Generator( + [["basic_st", "read_length_bar", "read_length_hist", "base_counts"], "FASTA QC", + param_dict], plot_filepaths, static=True) + fa_html_gen.generate_st_html() else: logging.error("QC did not generate.") @@ -252,14 +244,13 @@ def bam_module(margs): if exit_code == 0: logging.info("QC generated.") logging.info("Generating HTML report...") - from src import bam_plot - plot_filepaths = bam_plot.plot(bam_output, param_dict) + plot_filepaths = plot(bam_output, param_dict, 'BAM') - for static in [True, False]: - bam_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "map_st", "err_st", "read_length_st", "read_length_hist", "base_st", "basic_info", - "base_quality"], "BAM QC", param_dict], plot_filepaths, static=static) - bam_html_gen.generate_st_html() + # TODO: Add read average base quality plot (not currently generated by bam_plot.plot) + bam_html_gen = generate_html.ST_HTML_Generator( + [["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "base_counts", "basic_info", + "base_quality"], "BAM QC", param_dict], plot_filepaths, static=False) + bam_html_gen.generate_st_html() else: logging.error("QC did not generate.") @@ -301,18 +292,17 @@ def seqtxt_module(margs): if exit_code == 0: logging.info("QC generated.") logging.info("Generating HTML report...") - from src import seqtxt_plot - plot_filepaths = seqtxt_plot.plot(seqtxt_output, param_dict) - for static in [True, False]: - if margs.seq == 0: - f5_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st", "read_length_hist", "base_st", "basic_info"], - "sequencing_summary.txt QC", param_dict], plot_filepaths, static=static) - else: - f5_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st", "read_length_hist", "basic_info"], "sequencing_summary.txt QC", - param_dict], plot_filepaths, static=static) - f5_html_gen.generate_st_html() + plot_filepaths = plot(seqtxt_output, param_dict, 'SeqTxt') + + if margs.seq == 0: + seqtxt_html_gen = generate_html.ST_HTML_Generator( + [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "base_quality", "basic_info"], + "sequencing_summary.txt QC", param_dict], plot_filepaths, static=False) + else: + seqtxt_html_gen = generate_html.ST_HTML_Generator( + [["basic_st", "read_length_bar", "read_length_hist", "basic_info"], "sequencing_summary.txt QC", + param_dict], plot_filepaths, static=False) + seqtxt_html_gen.generate_st_html() else: logging.error("QC did not generate.") @@ -348,12 +338,12 @@ def fast5_module(margs): if exit_code == 0: logging.info("QC generated.") logging.info("Generating HTML report...") - plot_filepaths = create_base_quality_plots(fast5_output, param_dict, "FAST5: Basic statistics") - for static in [True, False]: - fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st", "read_length_hist", "base_st", "basic_info", "base_quality", - "read_avg_base_quality"], "FAST5 QC", param_dict], plot_filepaths, static=static) - fast5_html_obj.generate_st_html() + plot_filepaths = plot(fast5_output, param_dict, 'FAST5') + fast5_html_obj = generate_html.ST_HTML_Generator( + [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "basic_info", "base_quality", + "read_avg_base_quality"], "FAST5 QC", param_dict], plot_filepaths, static=False) + fast5_html_obj.generate_st_html() + else: logging.error("QC did not generate.") @@ -390,13 +380,12 @@ def fast5_signal_module(margs): if exit_code == 0: logging.info("QC generated.") logging.info("Generating HTML report...") - from src import fast5_signal_plot - dynamic_plots, plot_filepaths = fast5_signal_plot.plot(fast5_output, param_dict) - - # Generate a dynamic HTML file + plot_filepaths = plot(fast5_output, param_dict, 'FAST5s') fast5_html_obj = generate_html.ST_HTML_Generator( - [[], "FAST5 signal QC", param_dict], plot_filepaths, static=False) - fast5_html_obj.generate_st_html(signal_plots=dynamic_plots) + [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "basic_info", "base_quality", + "read_avg_base_quality", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False) + fast5_html_obj.generate_st_html(signal_plots=True) + else: logging.error("QC did not generate.") diff --git a/src/fast5_signal_plot.py b/src/fast5_signal_plot.py deleted file mode 100644 index 002fdc9..0000000 --- a/src/fast5_signal_plot.py +++ /dev/null @@ -1,151 +0,0 @@ -""" -plot_for_FAST5s.py: -Use the formatted statistics from our C++ module output text files to generate summary plots in image format. -""" - -import os -import logging -import csv -import numpy as np -import plotly.graph_objs as go -from random import sample - -if __package__ == 'src': - from src.plot_utils import * -else: - from plot_utils import * - - -def plot(fast5_output, para_dict): - """ - Update the global variables with HTML strings using the output data. - """ - out_path = para_dict["output_folder"] - - # Set up the global variable with HTML titles - plot_filepaths = getDefaultPlotFilenames() - plot_filepaths["basic_st"] = {} - plot_filepaths["basic_st"]['file'] = "" - plot_filepaths["basic_st"]['title'] = "Summary Table" - plot_filepaths["basic_st"]['description'] = "FAST5: Basic statistics" - - # Get values - read_count = fast5_output.getReadCount() - total_base_count = fast5_output.getTotalBaseCount() - - # Set up the HTML table - table_str = "\n\n\n" - table_str += "\n" - int_str_for_format = "" - table_str += int_str_for_format.format("#Total Reads", read_count) - table_str += int_str_for_format.format("#Total Bases", total_base_count) - table_str += "\n\n
MeasurementStatistics
{}{:,d}
" - plot_filepaths["basic_st"]['detail'] = table_str - - # Randomly sample a small set of reads if it is a large dataset - read_count_max = para_dict["read_count"] - read_sample_size = min(read_count_max, read_count) - unsampled_indices = list(range(0, read_sample_size)) - read_indices = sample(unsampled_indices, read_sample_size) - - # Plot the reads - output_html_plots = {} - for read_index in read_indices: - # Create the figure - fig = go.Figure() - - # Get the read data - nth_read_name = fast5_output.getNthReadName(read_index) - nth_read_data = fast5_output.getNthReadBaseSignals(read_index) - nth_read_means = fast5_output.getNthReadBaseMeans(read_index) - nth_read_stds = fast5_output.getNthReadBaseStds(read_index) - nth_read_medians = fast5_output.getNthReadBaseMedians(read_index) - nth_read_skewness = fast5_output.getNthReadPearsonSkewnessCoeff(read_index) - nth_read_kurtosis = fast5_output.getNthReadKurtosis(read_index) - nth_read_sequence = fast5_output.getNthReadSequence(read_index) - sequence_length = len(nth_read_data) - - # Check if sequence data is available - sequence_available = True if nth_read_sequence else False - - # Set up the output CSVs - csv_qc_filepath = os.path.join(out_path, nth_read_name + '_QC.csv') - qc_file = open(csv_qc_filepath, 'w') - qc_writer = csv.writer(qc_file) - qc_writer.writerow(["Base", "Raw_Signal", "Length", "Mean", "Median", "StdDev", "PearsonSkewnessCoeff", "Kurtosis"]) - - # Loop through the data - first_index = 0 - last_index = sequence_length - start_index = 0 - sequence_list = list(nth_read_sequence) - base_tick_values = [] # Append the last indices of the base signal to use for tick values - for i in range(first_index, last_index): - base_signals = nth_read_data[i] # Get the base's signal - signal_length = len(base_signals) - end_index = start_index + signal_length - base_tick_values.append(end_index) - - # Plot - x = np.arange(start_index, end_index, 1) - fig.add_trace(go.Scatter( - x=x, y=base_signals, - mode='markers', - marker=dict(color='LightSkyBlue', - size=5, - line=dict(color='MediumPurple', width=2)), - opacity=0.5)) - - # Update CSVs - base_value = sequence_list[i] if sequence_available else '' - signal_mean = nth_read_means[i] - signal_median = nth_read_medians[i] - signal_stds = nth_read_stds[i] - signal_skewness = nth_read_skewness[i] - signal_kurtosis = nth_read_kurtosis[i] - raw_row = \ - [base_value, base_signals, signal_length, - signal_mean, signal_median, signal_stds, - signal_skewness, signal_kurtosis] - - qc_writer.writerow(raw_row) - - # Update the index - start_index = end_index - - # Close CSVs - qc_file.close() - - # Update the plot style - font_size = para_dict["fontsize"] - marker_size = para_dict["markersize"] - fig.update_layout( - title=nth_read_name, - yaxis_title="Signal", - showlegend=False, - font=dict(size=font_size) - ) - fig.update_traces(marker={'size': marker_size}) - - if sequence_available: - # Set up X tick labels - x_tick_labels = sequence_list[first_index:last_index] - fig.update_xaxes(title="Base", - tickangle=0, - tickmode='array', - tickvals=base_tick_values, - ticktext=x_tick_labels) - else: - fig.update_xaxes(title="Index") - - # Save image - image_filepath = os.path.join(out_path, "img", nth_read_name + '_BaseSignal.png') - fig.write_image(image_filepath) - save_msg = "Plot image saved to: " + image_filepath - logging.info(save_msg) - - # Append the dynamic HTML object to the output structure - dynamic_html = fig.to_html(full_html=False) - output_html_plots.update({nth_read_name: dynamic_html}) - - return output_html_plots, plot_filepaths diff --git a/src/fasta_plot.py b/src/fasta_plot.py deleted file mode 100644 index 0f506ed..0000000 --- a/src/fasta_plot.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -plot_for_FA.py: -Use the formatted statistics from our C++ module output text files to generate summary plots in image format. -""" - -if __package__ == 'src': - from src.plot_utils import * -else: - from plot_utils import * - - -def create_summary_table(fa_output, plot_filepaths): - plot_filepaths["basic_st"] = {} - plot_filepaths["basic_st"]['file'] = "" - plot_filepaths["basic_st"]['title'] = "Basic statistics" - plot_filepaths["basic_st"]['description'] = "FASTA: Basic statistics" - - table_str = "\n\n\n" - table_str += "\n" - int_str_for_format = "" - double_str_for_format = "" - table_str += int_str_for_format.format("#Total Reads", - fa_output.long_read_info.total_num_reads) - table_str += int_str_for_format.format("#Total Bases", fa_output.long_read_info.total_num_bases) - table_str += int_str_for_format.format("Longest Read Length", fa_output.long_read_info.longest_read_length) - table_str += int_str_for_format.format("N50", fa_output.long_read_info.n50_read_length) - table_str += double_str_for_format.format("GC Content(%)", fa_output.long_read_info.gc_cnt * 100) - table_str += double_str_for_format.format("Mean Read Length", fa_output.long_read_info.mean_read_length) - table_str += int_str_for_format.format("Median Read Length", fa_output.long_read_info.median_read_length) - table_str += "\n\n
MeasurementStatistics
{}{:,d}
{}{:.1f}
" - - plot_filepaths["basic_st"]['detail'] = table_str - - -def plot(fa_output, para_dict): - out_path = para_dict["output_folder"] - plot_filepaths = getDefaultPlotFilenames() - get_image_path = lambda x: os.path.join(out_path, plot_filepaths[x]['file']) - - # Set the default matplotlib font size - setDefaultFontSize(12) - - # Get the font size for plotly plots - font_size = para_dict["fontsize"] - - # Generate plots - create_summary_table(fa_output, plot_filepaths) - - # Save plot images using statistics generated from the C++ module - plot_read_length_stats([fa_output.long_read_info], get_image_path('read_length_st'), subtitles=['Long Reads']) - plot_base_counts([fa_output.long_read_info], get_image_path('base_st'), subtitles=['Long Reads']) - plot_basic_info([fa_output.long_read_info], get_image_path('basic_info'), categories=['Long Reads']) - histogram(fa_output.long_read_info, get_image_path('read_length_hist'), font_size) - - return plot_filepaths diff --git a/src/generate_html.py b/src/generate_html.py index 4453aee..b6358ed 100644 --- a/src/generate_html.py +++ b/src/generate_html.py @@ -2,7 +2,7 @@ generate_html.py: Generate the HTML file from our plot images. """ -import base64 +import logging class ST_HTML_Generator: @@ -241,18 +241,18 @@ def generate_left(self): self.html_writer.write('

Summary

') self.html_writer.write('") @@ -260,39 +260,34 @@ def generate_left(self): def generate_right(self): self.html_writer.write('
') - _imki = 0 - for _imk in self.image_key_list: + key_index = 0 + for plot_key in self.image_key_list: self.html_writer.write('
') self.html_writer.write( - '

' + self.plot_filepaths[_imk]['description'] + '

') - # self.html_writer.write(''+lrst_global.plot_filenames[_imk]['description']+'

') - - if 'dynamic' in self.plot_filepaths[_imk] and self.static == False: - self.html_writer.write(self.plot_filepaths[_imk]['dynamic']) + '

' + self.plot_filepaths[plot_key]['description'] + '

') + # Add the plot or the HTML summary table + if plot_key == "basic_st": + self.html_writer.write(self.plot_filepaths["basic_st"]['detail']) else: - if _imk == "basic_st": - self.html_writer.write(self.plot_filepaths["basic_st"]['detail']) - else: - m_image_file = open( - self.input_para["output_folder"] + '/' + self.plot_filepaths[_imk]['file'], 'rb'); - self.html_writer.write('' + self.plot_filepaths[_imk][
-                                               'description'] + '

') - m_image_file.close() + try: + dynamic_plot = self.plot_filepaths[plot_key]['dynamic'] + self.html_writer.write(dynamic_plot) + + except KeyError: + logging.error("Missing dynamic plot for " + plot_key) self.html_writer.write('
') - _imki += 1 + key_index += 1 self.html_writer.write('
') - self.html_writer.write('

File count = ' + str( + self.html_writer.write('

File count = ' + str( len(self.input_para["input_files"])) + '

') for _af in self.input_para["input_files"]: self.html_writer.write("
" + _af) self.html_writer.write('

') - _imki += 1 + key_index += 1 self.html_writer.write('
') @@ -319,7 +314,7 @@ def generate_left_signal_data(self, read_names): # Add the input files section link self.html_writer.write('
  • ') - self.html_writer.write('Input files') + self.html_writer.write('Input Files') url_index += 1 self.html_writer.write('
  • ') self.html_writer.write("") @@ -343,7 +338,7 @@ def generate_right_signal_data(self, read_names, signal_plot): self.html_writer.write('
    ') # Set the description - description_text = "Basecall signal" + description_text = "ONT Basecall Signal" self.html_writer.write( '

    ' + description_text + '

    ') @@ -371,20 +366,23 @@ def generate_end(self): self.html_writer.write("") self.html_writer.close() - def generate_st_html(self, signal_plots=None): + def generate_st_html(self, signal_plots=False): """ Top-level function for generating the HTML. """ - if signal_plots is None: - # Format base QC - self.generate_header() - self.generate_left() - self.generate_right() - self.generate_end() - else: - # Format signal QC + if signal_plots: self.generate_header() + # Get the signal plots + signal_plots = self.plot_filepaths["ont_signal"]['dynamic'] read_names = signal_plots.keys() self.generate_left_signal_data(read_names) self.generate_right_signal_data(read_names, signal_plots) self.generate_end() + else: + # Format base QC + self.generate_header() + self.generate_left() + self.generate_right() + self.generate_end() + + diff --git a/src/plot_utils.py b/src/plot_utils.py index ca4737b..673c1bc 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -1,72 +1,38 @@ import os -import logging import numpy as np -import itertools +import csv +from random import sample -import matplotlib.pyplot as plt import plotly.graph_objs as go from plotly.subplots import make_subplots -logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR) - - -# Return the default image path -def getDefaultImageFolder(): - return 'img/' - - -# Return the default image suffix -def getDefaultImageSuffix(): - return '.png' - # Return a dictionary of default plot filenames def getDefaultPlotFilenames(): - default_image_path = getDefaultImageFolder() - default_image_suf = getDefaultImageSuffix() - plot_filenames = { # for fq/fa - "read_length_distr": {'file': default_image_path + "read_length_distr" + default_image_suf, - 'title': "Read Length", 'description': "Read Length Distribution"}, # for bam - "map_st": {'file': default_image_path + "map_st" + default_image_suf, 'title': "Map Information", + "read_length_distr": {'title': "Read Length", 'description': "Read Length Distribution"}, # for bam + "read_alignments_bar": {'title': "Map Information", 'description': "Read Mapping Statistics"}, - "err_st": {'file': default_image_path + "err_st" + default_image_suf, - 'title': "Base Alignment and Error Statistics", + "base_alignments_bar": {'title': "Base Alignment and Error Statistics", 'description': "Base Alignment and Error Statistics"}, - "read_length_st": {'file': default_image_path + "read_length_st" + default_image_suf, - 'title': "Read Length Statistics", 'description': "Read Length Statistics"}, - "base_st": {'file': default_image_path + "base_st" + default_image_suf, 'title': "Base Count Statistics", + "read_length_bar": {'title': "Read Length Statistics", 'description': "Read Length Statistics"}, + "base_counts": {'title': "Base Count Statistics", 'description': "Base Count Statistics", 'summary': ""}, - "basic_info": {'file': default_image_path + "basic_info" + default_image_suf, 'title': "Basic Statistics", + "basic_info": {'title': "Basic Statistics", 'description': "Basic Statistics", 'summary': ""}, - "read_length_hist": {'file': default_image_path + "read_length_hist" + default_image_suf, - 'title': "Read Length Histogram", 'description': "Read Length Histogram", 'summary': ""}, + "read_length_hist": {'title': "Read Length Histogram", 'description': "Read Length Histogram", 'summary': ""}, - "base_quality": {'file': default_image_path + "base_quality" + default_image_suf, - 'title': "Base Quality Histogram", 'description': "Base Quality Histogram"}, + "base_quality": {'title': "Base Quality Histogram", 'description': "Base Quality Histogram"}, - "read_avg_base_quality": {'file': default_image_path + "read_avg_base_quality" + default_image_suf, - 'title': "Read Base Quality Histogram", 'description': "Read Base Quality Histogram"}, + "read_avg_base_quality": {'title': "Read Base Quality Histogram", 'description': "Read Base Quality Histogram"}, - "pos_quality": {'file': default_image_path + "pos_quality" + default_image_suf, - 'title': "Base Position Quality", 'description': "Base Position Quality"}, + "pos_quality": {'title': "Base Position Quality", 'description': "Base Position Quality"}, + "ont_signal": {'title': "ONT Signal", 'description': "ONT Signal"}, } return plot_filenames -def setDefaultFontSize(font_size): - """Set the default font size for matplotlib plots.""" - plt.rcParams.update({'font.size': font_size}) - - -def fmt(x): - """Format numbers for plots.""" - format_x = "{:,}".format(round(x)) - - return format_x - - def wrap(s): l = s.split(' ') split = list(zip(*[iter(l)] * 3)) @@ -75,71 +41,161 @@ def wrap(s): return '\n'.join([' '.join(x) for x in split]) -def plot_read_length_stats(data, path, subtitles=None, categories=None): - fig, axes = plt.subplots(len(data), sharey=True, figsize=(8, 6)) +def plot_read_length_stats(output_data, file_type): + # Define the three categories + category = ['N50', 'Mean', 'Median'] + all_traces = [] + + if file_type == 'BAM': + # Create a bar trace for each type of read length statistic + bar_titles = ['All Reads', 'Mapped Reads', 'Unmapped Reads'] + data_objects = [output_data.long_read_info, output_data.mapped_long_read_info, output_data.unmapped_long_read_info] + for i in range(3): + plot_title = bar_titles[i] + data = data_objects[i] + values = [data.n50_read_length, data.mean_read_length, data.median_read_length] + trace = go.Bar(x=category, y=values, name=plot_title) + all_traces.append(trace) + + elif file_type == 'SeqTxt': + # Create a bar trace for each type of read length statistic + bar_titles = ['All Reads', 'Passed Reads', 'Failed Reads'] + data_objects = [output_data.all_long_read_info.long_read_info, output_data.passed_long_read_info.long_read_info, output_data.failed_long_read_info.long_read_info] + for i in range(3): + plot_title = bar_titles[i] + data = data_objects[i] + values = [data.n50_read_length, data.mean_read_length, data.median_read_length] + trace = go.Bar(x=category, y=values, name=plot_title) + all_traces.append(trace) + + else: + # Get the data for all reads + key_list = ['n50_read_length', 'mean_read_length', 'median_read_length'] + + # Create a bar trace + bar_title = 'All Reads' + data = output_data.long_read_info + values = [getattr(data, key_name) for key_name in key_list] + trace = go.Bar(x=category, y=values, name=bar_title) + all_traces.append(trace) + + # Create the layout + layout = go.Layout(title='Read Length Statistics', xaxis=dict(title='Statistics'), yaxis=dict(title='Length (bp)'), barmode='group') + + # Create the figure and add the traces + fig = go.Figure(data=all_traces, layout=layout) + + # Generate the HTML + html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) - numbers_list = [[x.n50_read_length, x.mean_read_length, x.median_read_length] for x in data] + return html_obj - category = ['N50', 'Mean', 'Median'] - category_list = itertools.cycle([category]) - ylabel_list = itertools.cycle(['Length (bp)']) - xlabel_list = itertools.cycle([None]) - subtitle_list = subtitles - bar_plot(fig, numbers_list, category_list, xlabel_list, ylabel_list, subtitle_list, path) +def plot_base_counts(output_data, filetype): + # Define the five categories + category = ['A', 'C', 'G', 'T/U', 'N'] + + # Create a bar trace for each type of data + all_traces = [] + if filetype == 'BAM': + bar_titles = ['All Reads', 'Mapped Reads', 'Unmapped Reads'] + data_objects = [output_data.long_read_info, output_data.mapped_long_read_info, output_data.unmapped_long_read_info] + for i in range(3): + plot_title = bar_titles[i] + data = data_objects[i] + values = [data.total_a_cnt, data.total_c_cnt, data.total_g_cnt, data.total_tu_cnt, data.total_n_cnt] + trace = go.Bar(x=category, y=values, name=plot_title) + all_traces.append(trace) + + elif filetype == 'SeqTxt': + bar_titles = ['All Reads', 'Passed Reads', 'Failed Reads'] + data_objects = [output_data.all_long_read_info.long_read_info, output_data.passed_long_read_info.long_read_info, output_data.failed_long_read_info.long_read_info] + for i in range(3): + plot_title = bar_titles[i] + data = data_objects[i] + values = [data.total_a_cnt, data.total_c_cnt, data.total_g_cnt, data.total_tu_cnt, data.total_n_cnt] + trace = go.Bar(x=category, y=values, name=plot_title) + all_traces.append(trace) + + else: + plot_title = 'All Reads' + data = output_data.long_read_info + values = [data.total_a_cnt, data.total_c_cnt, data.total_g_cnt, data.total_tu_cnt, data.total_n_cnt] + trace = go.Bar(x=category, y=values, name=plot_title) + all_traces.append(trace) + + # Create the layout + layout = go.Layout(title='Base Counts', xaxis=dict(title='Base'), yaxis=dict(title='Counts'), barmode='group') + + # Create the figure and add the traces + fig = go.Figure(data=all_traces, layout=layout) + + # Generate the HTML + html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) + + return html_obj + + +def plot_basic_info(output_data, file_type): + """Plot basic information about the reads.""" + html_obj = '' + if file_type == 'BAM': + + # Create a bar trace for each type of data + bar_titles = ['All Reads', 'Mapped Reads', 'Unmapped Reads'] + data_objects = [output_data.long_read_info, output_data.mapped_long_read_info, output_data.unmapped_long_read_info] -def plot_base_counts(data, path, subtitles=None, categories=None): - fig, axes = plt.subplots(len(data), figsize=(8, 6)) + # Create subplots for each category + fig = make_subplots(rows=2, cols=2, subplot_titles=("Number of Reads", "Number of Bases", "Longest Read", "GC Content"), horizontal_spacing=0.3, vertical_spacing=0.2) - numbers_list = [[x.total_a_cnt, x.total_c_cnt, x.total_g_cnt, x.total_tu_cnt, x.total_n_cnt] for x in data] + # Add traces for each category + key_list = ['total_num_reads', 'total_num_bases', 'longest_read_length', 'gc_cnt'] + for i in range(4): + # Get the data for this category + key_name = key_list[i] - category_list = itertools.cycle([['A', 'C', 'G', 'T/U', 'N']]) - xlabel_list = itertools.cycle([None]) - ylabel_list = itertools.cycle(['Counts']) - subtitle_list = subtitles - bar_plot(fig, numbers_list, category_list, xlabel_list, ylabel_list, subtitle_list, path) - # plot_filepaths['base_st']['summary']='GC Content: {:.2%}'.format(bam_output.mapped_long_read_info.gc_cnt) + # Add the traces for each type of data + data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], key_name)] + # Create the trace + trace = go.Bar(x=data, y=bar_titles, orientation='h') -def plot_basic_info(data, path, subtitles=None, categories=None): - fig, axes = plt.subplots(2, 2, figsize=(8, 6)) - numbers_list = [[x.total_num_reads, x.total_num_bases, x.longest_read_length, x.gc_cnt] for x in data] - numbers_list = zip(*numbers_list) + # Add the trace to the figure + fig.add_trace(trace, row=(i // 2) + 1, col=(i % 2) + 1) + fig.update_layout(showlegend=False) - category_list = itertools.cycle([categories]) - subtitle_list = ['Number of Reads', 'Number of Bases', 'Longest Read', 'GC Content'] - xlabel_list = ['Count', 'Count', 'Length (bp)', '%'] - ylabel_list = itertools.cycle([None]) - bar_plot(fig, numbers_list, category_list, xlabel_list, ylabel_list, subtitle_list, path, orientation='h') + # Generate the HTML + html_obj = fig.to_html(full_html=False, default_height=800, default_width=1200) + elif file_type == 'SeqTxt': -def bar_plot(fig, numbers_list, category_list, xlabel_list, ylabel_list, subtitle_list, path, orientation='v', - print_value=True): - plt.subplots_adjust(hspace=0.5, wspace=0.5) - # plt.ticklabel_format(axis='both',style='sci', scilimits=(0,0)) + # Create a bar trace for each type of data + bar_titles = ['All Reads', 'Passed Reads', 'Failed Reads'] + data_objects = [output_data.all_long_read_info.long_read_info, output_data.passed_long_read_info.long_read_info, output_data.failed_long_read_info.long_read_info] - for ax, numbers, category, xlabel, ylabel, subtitle in zip(fig.axes, numbers_list, category_list, xlabel_list, - ylabel_list, subtitle_list): - # ax.set_major_formatter(matplotlib.ticker.ScalarFormatter()) - ax.set(ylabel=ylabel, xlabel=xlabel) - ax.set_title(subtitle, pad=10) - ax.spines['right'].set_visible(False) - ax.spines['top'].set_visible(False) - ax.tick_params(labelbottom=True) - ax.ticklabel_format(style='sci', scilimits=(-3, 4), axis='both') + # Create subplots for each category + fig = make_subplots(rows=1, cols=3, subplot_titles=("Number of Reads", "Number of Bases", "Longest Read"), horizontal_spacing=0.1) - if orientation == 'h': - ax.barh(category, numbers) - for index, value in enumerate(numbers): - ax.text(value, index, ' %s' % fmt(value) if print_value else '') - plt.tight_layout() + # Add traces for each category + key_list = ['total_num_reads', 'total_num_bases', 'longest_read_length'] + for i in range(3): + # Get the data for this category + key_name = key_list[i] - elif orientation == 'v': - ax.bar(category, numbers) - for index, value in enumerate(numbers): - ax.text(index, value + max(numbers) * 0.02, '%s' % fmt(value) if print_value else '', ha='center') + # Add the traces for each type of data + data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], key_name)] - plt.savefig(path) + # Create the trace + trace = go.Bar(x=data, y=bar_titles, orientation='h') + + # Add the trace to the figure + fig.add_trace(trace, row=1, col=i + 1) + fig.update_layout(showlegend=False) + + # Generate the HTML + html_obj = fig.to_html(full_html=False, default_height=500, default_width=1600) + + return html_obj def histogram(data, path, font_size): @@ -210,12 +266,11 @@ def histogram(data, path, font_size): fig.update_annotations(font_size=annotation_size) html_obj = fig.to_html(full_html=False) - fig.write_image(path, engine="auto") return html_obj -def read_lengths_histogram(data, path, font_size): +def read_lengths_histogram(data, font_size): """Plot the read length histograms.""" annotation_size = 10 # Annotation font size mean, median, n50 = data.mean_read_length, data.median_read_length, data.n50_read_length @@ -281,13 +336,12 @@ def read_lengths_histogram(data, path, font_size): fig.update_layout(font=dict(size=font_size), autosize=True) fig.update_annotations(font_size=annotation_size) - html_obj = fig.to_html(full_html=False) - fig.write_image(path, engine="auto") + html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) return html_obj -def base_quality(data, path, font_size): +def base_quality(data, font_size): """ Save the 'Base quality' plot image. """ @@ -304,12 +358,11 @@ def base_quality(data, path, font_size): fig.update_xaxes(ticks="outside", dtick=10, title_text='Base Quality', title_standoff=0) fig.update_yaxes(ticks="outside", title_text='Number of bases', title_standoff=0) fig.update_layout(font=dict(size=font_size)) # Set font size - fig.write_image(path, engine="auto") - return fig.to_html(full_html=False) + return fig.to_html(full_html=False, default_height=500, default_width=700) -def read_avg_base_quality(data, path, font_size): +def read_avg_base_quality(data, font_size): """ Save the 'Average base quality' plot image. """ @@ -322,35 +375,32 @@ def read_avg_base_quality(data, path, font_size): fig.update_yaxes(ticks="outside", title_text='Number of Reads', title_standoff=0) fig.update_layout(font=dict(size=font_size)) # Set font size - fig.write_image(path, engine="auto") - - return fig.to_html(full_html=False) + return fig.to_html(full_html=False, default_height=500, default_width=700) -def create_statistics_table(module_output, plot_filepaths, table_title="Basic Statistics"): +def create_statistics_table(output_data, plot_filepaths, table_title="Basic Statistics"): plot_filepaths["basic_st"] = {} plot_filepaths["basic_st"]['file'] = "" plot_filepaths["basic_st"]['title'] = "Summary Table" plot_filepaths["basic_st"]['description'] = table_title - table_str = "\n\n\n" table_str += "\n" int_str_for_format = "" double_str_for_format = "" table_str += int_str_for_format.format("#Total Reads", - module_output.long_read_info.total_num_reads) + output_data.long_read_info.total_num_reads) table_str += int_str_for_format.format("#Total Bases", - module_output.long_read_info.total_num_bases) + output_data.long_read_info.total_num_bases) table_str += int_str_for_format.format("Longest Read Length", - module_output.long_read_info.longest_read_length) + output_data.long_read_info.longest_read_length) table_str += int_str_for_format.format("N50", - module_output.long_read_info.n50_read_length) + output_data.long_read_info.n50_read_length) table_str += double_str_for_format.format("GC Content(%)", - module_output.long_read_info.gc_cnt * 100) + output_data.long_read_info.gc_cnt * 100) table_str += double_str_for_format.format("Mean Read Length", - module_output.long_read_info.mean_read_length) + output_data.long_read_info.mean_read_length) table_str += int_str_for_format.format("Median Read Length", - module_output.long_read_info.median_read_length) + output_data.long_read_info.median_read_length) table_str += "\n\n
    MeasurementStatistics
    {}{:,d}
    {}{:.1f}
    " plot_filepaths["basic_st"]['detail'] = table_str @@ -358,40 +408,323 @@ def create_statistics_table(module_output, plot_filepaths, table_title="Basic St return plot_filepaths -def create_base_quality_plots(module_output, para_dict, table_title): - """ - Generate HTML plots for base and base quality (BAM, FASTQ, FAST5). - """ +def plot(output_data, para_dict, file_type): out_path = para_dict["output_folder"] plot_filepaths = getDefaultPlotFilenames() - get_image_path = lambda x: os.path.join(out_path, plot_filepaths[x]['file']) - - # Set the default matplotlib font size - setDefaultFontSize(12) # Get the font size for plotly plots font_size = para_dict["fontsize"] - # Create the statistics table - plot_filepaths = create_statistics_table(module_output, plot_filepaths, table_title) + # Create the summary table + create_summary_table(output_data, plot_filepaths, file_type) - # Create basic plots - basic_data = module_output.long_read_info - plot_read_length_stats([basic_data], get_image_path('read_length_st'), subtitles=['']) - plot_base_counts([basic_data], get_image_path('base_st'), subtitles=['']) - plot_basic_info([basic_data], get_image_path('basic_info'), categories=['Read']) + # Generate plots + plot_filepaths['base_counts']['dynamic'] = plot_base_counts(output_data, file_type) + plot_filepaths['basic_info']['dynamic'] = plot_basic_info(output_data, file_type) # Read length histogram - length_hist_path = get_image_path('read_length_hist') - plot_filepaths['read_length_hist']['dynamic'] = histogram(basic_data, length_hist_path, font_size) + if file_type == 'SeqTxt': + long_read_data = output_data.all_long_read_info.long_read_info + else: + long_read_data = output_data.long_read_info + + if file_type != 'FAST5s': + plot_filepaths['read_length_hist']['dynamic'] = read_lengths_histogram(long_read_data, font_size) - # Base quality histogram - quality_data = module_output.seq_quality_info - quality_hist_path = get_image_path('base_quality') - plot_filepaths['base_quality']['dynamic'] = base_quality(quality_data, quality_hist_path, font_size) + plot_filepaths['read_length_bar']['dynamic'] = plot_read_length_stats(output_data, file_type) - # Read quality histogram - read_quality_dynamic = read_avg_base_quality(quality_data, get_image_path('read_avg_base_quality'), font_size) - plot_filepaths['read_avg_base_quality']['dynamic'] = read_quality_dynamic + if file_type != 'FASTA' and file_type != 'FAST5s': + if file_type == 'SeqTxt': + seq_quality_info = output_data.all_long_read_info.seq_quality_info + else: + seq_quality_info = output_data.seq_quality_info + + # Base quality histogram + plot_filepaths['base_quality']['dynamic'] = base_quality(seq_quality_info, font_size) + + # Read quality histogram + read_quality_dynamic = read_avg_base_quality(seq_quality_info, font_size) + plot_filepaths['read_avg_base_quality']['dynamic'] = read_quality_dynamic + + if file_type == 'BAM': + plot_filepaths['read_alignments_bar']['dynamic'] = plot_alignment_numbers(output_data) + plot_filepaths['base_alignments_bar']['dynamic'] = plot_errors(output_data) + + elif file_type == 'FAST5s': + plot_filepaths['ont_signal']['dynamic'] = plot_signal(output_data, para_dict) return plot_filepaths + + +def plot_signal(output_data, para_dict): + """Plot the ONT FAST5 signal data""" + # Get input parameters + output_dir = para_dict["output_folder"] + font_size = para_dict["fontsize"] + marker_size = para_dict["markersize"] + read_count_max = para_dict["read_count"] + + # Get read and base counts + read_count = output_data.getReadCount() + + # Randomly sample a small set of reads if it is a large dataset + read_sample_size = min(read_count_max, read_count) + unsampled_indices = list(range(0, read_sample_size)) + read_indices = sample(unsampled_indices, read_sample_size) + + # Plot the reads + output_html_plots = {} + for read_index in read_indices: + # Create the figure + fig = go.Figure() + + # Get the read data + nth_read_name = output_data.getNthReadName(read_index) + nth_read_data = output_data.getNthReadBaseSignals(read_index) + nth_read_means = output_data.getNthReadBaseMeans(read_index) + nth_read_stds = output_data.getNthReadBaseStds(read_index) + nth_read_medians = output_data.getNthReadBaseMedians(read_index) + nth_read_skewness = output_data.getNthReadPearsonSkewnessCoeff(read_index) + nth_read_kurtosis = output_data.getNthReadKurtosis(read_index) + nth_read_sequence = output_data.getNthReadSequence(read_index) + sequence_length = len(nth_read_data) + + # Check if sequence data is available + sequence_available = True if nth_read_sequence else False + + # Set up the output CSVs + csv_qc_filepath = os.path.join(output_dir, nth_read_name + '_QC.csv') + qc_file = open(csv_qc_filepath, 'w') + qc_writer = csv.writer(qc_file) + qc_writer.writerow(["Base", "Raw_Signal", "Length", "Mean", "Median", "StdDev", "PearsonSkewnessCoeff", "Kurtosis"]) + + # Loop through the data + first_index = 0 + last_index = sequence_length + start_index = 0 + sequence_list = list(nth_read_sequence) + base_tick_values = [] # Append the last indices of the base signal to use for tick values + for i in range(first_index, last_index): + base_signals = nth_read_data[i] # Get the base's signal + signal_length = len(base_signals) + end_index = start_index + signal_length + base_tick_values.append(end_index) + + # Plot + x = np.arange(start_index, end_index, 1) + fig.add_trace(go.Scatter( + x=x, y=base_signals, + mode='markers', + marker=dict(color='LightSkyBlue', + size=5, + line=dict(color='MediumPurple', width=2)), + opacity=0.5)) + + # Update CSVs + base_value = sequence_list[i] if sequence_available else '' + signal_mean = nth_read_means[i] + signal_median = nth_read_medians[i] + signal_stds = nth_read_stds[i] + signal_skewness = nth_read_skewness[i] + signal_kurtosis = nth_read_kurtosis[i] + raw_row = \ + [base_value, base_signals, signal_length, + signal_mean, signal_median, signal_stds, + signal_skewness, signal_kurtosis] + + qc_writer.writerow(raw_row) + + # Update the index + start_index = end_index + + # Close CSVs + qc_file.close() + + # Update the plot style + fig.update_layout( + title=nth_read_name, + yaxis_title="Signal", + showlegend=False, + font=dict(size=font_size) + ) + fig.update_traces(marker={'size': marker_size}) + + if sequence_available: + # Set up X tick labels + x_tick_labels = sequence_list[first_index:last_index] + fig.update_xaxes(title="Base", + tickangle=0, + tickmode='array', + tickvals=base_tick_values, + ticktext=x_tick_labels) + else: + fig.update_xaxes(title="Index") + + # Append the dynamic HTML object to the output structure + dynamic_html = fig.to_html(full_html=False) + output_html_plots.update({nth_read_name: dynamic_html}) + + return output_html_plots + + +def create_summary_table(output_data, plot_filepaths, file_type): + plot_filepaths["basic_st"] = {} + plot_filepaths["basic_st"]['file'] = "" + plot_filepaths["basic_st"]['title'] = "Summary Table" + + # Decide the file type label + file_type_label = file_type + if file_type == 'FAST5s': + file_type_label = 'FAST5' + + plot_filepaths["basic_st"]['description'] = "{} Basic statistics".format(file_type_label) + + if file_type == 'BAM': + table_str = "\n\n\n" \ + " " + table_str += "\n" + int_str_for_format = " " + double_str_for_format = " " + table_str += int_str_for_format.format("#Total Reads", output_data.mapped_long_read_info.total_num_reads, + output_data.unmapped_long_read_info.total_num_reads, + output_data.long_read_info.total_num_reads) + table_str += int_str_for_format.format("#Total Bases", + output_data.mapped_long_read_info.total_num_bases, + output_data.unmapped_long_read_info.total_num_bases, + output_data.long_read_info.total_num_bases) + table_str += int_str_for_format.format("Longest Read Length", + output_data.mapped_long_read_info.longest_read_length, + output_data.unmapped_long_read_info.longest_read_length, + output_data.long_read_info.longest_read_length) + table_str += int_str_for_format.format("N50", + output_data.mapped_long_read_info.n50_read_length, + output_data.unmapped_long_read_info.n50_read_length, + output_data.long_read_info.n50_read_length) + table_str += double_str_for_format.format("GC Content(%)", + output_data.mapped_long_read_info.gc_cnt * 100, + output_data.unmapped_long_read_info.gc_cnt * 100, + output_data.long_read_info.gc_cnt * 100) + table_str += double_str_for_format.format("Mean Read Length", + output_data.mapped_long_read_info.mean_read_length, + output_data.unmapped_long_read_info.mean_read_length, + output_data.long_read_info.mean_read_length) + table_str += int_str_for_format.format("Median Read Length", + output_data.mapped_long_read_info.median_read_length, + output_data.unmapped_long_read_info.median_read_length, + output_data.long_read_info.median_read_length) + + elif file_type == 'SeqTxt': + table_str = "
    MeasurementMappedUnmappedAll
    {}{:,d}{:," \ + "d}{:,d}
    {}{:.1f}{:.1f}{:.1f}
    \n\n\n" + table_str += "\n" + int_str_for_format = "" + double_str_for_format = "" + table_str += int_str_for_format.format("#Total Reads", + output_data.passed_long_read_info.long_read_info.total_num_reads, + output_data.failed_long_read_info.long_read_info.total_num_reads, + output_data.all_long_read_info.long_read_info.total_num_reads) + table_str += int_str_for_format.format("#Total Bases", + output_data.passed_long_read_info.long_read_info.total_num_bases, + output_data.failed_long_read_info.long_read_info.total_num_bases, + output_data.all_long_read_info.long_read_info.total_num_bases) + table_str += int_str_for_format.format("Longest Read Length", + output_data.passed_long_read_info.long_read_info.longest_read_length, + output_data.failed_long_read_info.long_read_info.longest_read_length, + output_data.all_long_read_info.long_read_info.longest_read_length) + table_str += int_str_for_format.format("N50", + output_data.passed_long_read_info.long_read_info.n50_read_length, + output_data.failed_long_read_info.long_read_info.n50_read_length, + output_data.all_long_read_info.long_read_info.n50_read_length) + table_str += double_str_for_format.format("Mean Read Length", + output_data.passed_long_read_info.long_read_info.mean_read_length, + output_data.failed_long_read_info.long_read_info.mean_read_length, + output_data.all_long_read_info.long_read_info.mean_read_length) + table_str += int_str_for_format.format("Median Read Length", + output_data.passed_long_read_info.long_read_info.median_read_length, + output_data.failed_long_read_info.long_read_info.median_read_length, + output_data.all_long_read_info.long_read_info.median_read_length) + + elif file_type == 'FAST5s': + # Get values + read_count = output_data.getReadCount() + total_base_count = output_data.getTotalBaseCount() + + # Set up the HTML table + table_str = "
    MeasurementPassedFailedAll
    {}{:,d}{:,d}{:,d}
    {}{:.1f}{:.1f}{:.1f}
    \n\n\n" + table_str += "\n" + int_str_for_format = "" + table_str += int_str_for_format.format("#Total Reads", read_count) + table_str += int_str_for_format.format("#Total Bases", total_base_count) + + else: + table_str = "
    MeasurementStatistics
    {}{:,d}
    \n\n\n" + table_str += "\n" + int_str_for_format = "" + double_str_for_format = "" + table_str += int_str_for_format.format("#Total Reads", + output_data.long_read_info.total_num_reads) + table_str += int_str_for_format.format("#Total Bases", + output_data.long_read_info.total_num_bases) + table_str += int_str_for_format.format("Longest Read Length", + output_data.long_read_info.longest_read_length) + table_str += int_str_for_format.format("N50", + output_data.long_read_info.n50_read_length) + table_str += double_str_for_format.format("GC Content(%)", + output_data.long_read_info.gc_cnt * 100) + table_str += double_str_for_format.format("Mean Read Length", + output_data.long_read_info.mean_read_length) + table_str += int_str_for_format.format("Median Read Length", + output_data.long_read_info.median_read_length) + + table_str += "\n\n
    MeasurementStatistics
    {}{:,d}
    {}{:.1f}
    " + plot_filepaths["basic_st"]['detail'] = table_str + + +def plot_alignment_numbers(data): + category = ['Primary Alignments', 'Supplementary Alignments', 'Secondary Alignments', + 'Reads with Supplementary Alignments', 'Reads with Secondary Alignments', + 'Reads with Secondary and Supplementary Alignments', 'Forward Alignments', 'Reverse Alignments'] + category = [wrap(x) for x in category] + + # Create a horizontally aligned bar plot trace from the data using plotly + trace = go.Bar(x=[data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment, + data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment, + data.num_reads_with_both_secondary_supplementary_alignment, data.forward_alignment, + data.reverse_alignment], y=category, orientation='h') + + # Create the layout for the plot + layout = go.Layout(title=go.layout.Title(text=""), + xaxis=go.layout.XAxis(title=go.layout.xaxis.Title(text="Counts")), + yaxis=go.layout.YAxis(title=go.layout.yaxis.Title(text=""))) + + # Create the figure object + fig = go.Figure(data=[trace], layout=layout) + + # Generate the HTML object for the plot + html_obj = fig.to_html(full_html=False, default_height=500, default_width=1000) + + return html_obj + + +def plot_errors(output_data): + category = ['Matched Bases', 'Mismatched Bases', 'Inserted Bases', 'Deleted Bases', 'Clipped Bases'] + category = [wrap(x) for x in category] + + # Create a horizontally aligned bar plot trace from the data using plotly + trace = go.Bar(x=[output_data.num_matched_bases, output_data.num_mismatched_bases, output_data.num_ins_bases, + output_data.num_del_bases, output_data.num_clip_bases], y=category, orientation='h') + + # Create the layout for the plot + layout = go.Layout(title=go.layout.Title(text=""), + xaxis=go.layout.XAxis(title=go.layout.xaxis.Title(text="Counts")), + yaxis=go.layout.YAxis(title=go.layout.yaxis.Title(text=""))) + + # Create the figure object + fig = go.Figure(data=[trace], layout=layout) + + # Generate the HTML object for the plot + html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) + + return html_obj + diff --git a/src/seqtxt_module.cpp b/src/seqtxt_module.cpp index 93b01b2..ab57ca9 100644 --- a/src/seqtxt_module.cpp +++ b/src/seqtxt_module.cpp @@ -221,7 +221,7 @@ int SeqTxt_Module::generateStatistics( Output_SeqTxt& t_output_SeqTxt_info){ t_output_SeqTxt_info.global_sum(); auto relapse_end_time = std::chrono::high_resolution_clock::now(); - std::cout<<"Total time(Elapsed): "<{:,d}{:,d}{:,d}" - double_str_for_format = "{}{:.1f}{:.1f}{:.1f}" - table_str += int_str_for_format.format("#Total Reads", - seqtxt_output.passed_long_read_info.long_read_info.total_num_reads, - seqtxt_output.failed_long_read_info.long_read_info.total_num_reads, - seqtxt_output.all_long_read_info.long_read_info.total_num_reads) - table_str += int_str_for_format.format("#Total Bases", - seqtxt_output.passed_long_read_info.long_read_info.total_num_bases, - seqtxt_output.failed_long_read_info.long_read_info.total_num_bases, - seqtxt_output.all_long_read_info.long_read_info.total_num_bases) - table_str += int_str_for_format.format("Longest Read Length", - seqtxt_output.passed_long_read_info.long_read_info.longest_read_length, - seqtxt_output.failed_long_read_info.long_read_info.longest_read_length, - seqtxt_output.all_long_read_info.long_read_info.longest_read_length) - table_str += int_str_for_format.format("N50", - seqtxt_output.passed_long_read_info.long_read_info.n50_read_length, - seqtxt_output.failed_long_read_info.long_read_info.n50_read_length, - seqtxt_output.all_long_read_info.long_read_info.n50_read_length) - table_str += double_str_for_format.format("Mean Read Length", - seqtxt_output.passed_long_read_info.long_read_info.mean_read_length, - seqtxt_output.failed_long_read_info.long_read_info.mean_read_length, - seqtxt_output.all_long_read_info.long_read_info.mean_read_length) - table_str += int_str_for_format.format("Median Read Length", - seqtxt_output.passed_long_read_info.long_read_info.median_read_length, - seqtxt_output.failed_long_read_info.long_read_info.median_read_length, - seqtxt_output.all_long_read_info.long_read_info.median_read_length) - table_str += "\n\n" - - plot_filepaths["basic_st"]['detail'] = table_str - - -def plot(seqtxt_output, para_dict): - out_path = para_dict["output_folder"] - plot_filepaths = getDefaultPlotFilenames() - get_image_path = lambda x: os.path.join(out_path, plot_filepaths[x]['file']) - - # Set the default matplotlib font size - setDefaultFontSize(12) - - # Get the font size for plotly plots - font_size = para_dict["fontsize"] - - # Create table - create_summary_table(seqtxt_output, plot_filepaths) - - # Generate plots - plot_read_length_stats( - [seqtxt_output.all_long_read_info.long_read_info, seqtxt_output.passed_long_read_info.long_read_info, - seqtxt_output.failed_long_read_info.long_read_info], get_image_path('read_length_st'), - subtitles=['All Reads', 'Passed Reads', 'Failed Reads']) - plot_base_counts( - [seqtxt_output.all_long_read_info.long_read_info, seqtxt_output.passed_long_read_info.long_read_info, - seqtxt_output.failed_long_read_info.long_read_info], get_image_path('base_st'), - subtitles=['All Reads', 'Passed Reads', 'Failed Reads']) - plot_basic_info( - [seqtxt_output.all_long_read_info.long_read_info, seqtxt_output.passed_long_read_info.long_read_info, - seqtxt_output.failed_long_read_info.long_read_info], get_image_path('basic_info'), - categories=['All Reads', 'Passed Reads', 'Failed Reads']) - histogram(seqtxt_output.all_long_read_info.long_read_info, get_image_path('read_length_hist'), font_size) - - return plot_filepaths