diff --git a/Dockerfile b/Dockerfile index 64772ec..a300109 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,6 +8,10 @@ WORKDIR /longreadsum # Install build tools RUN apt-get update && apt-get install build-essential -y +# Install VBZ compression +RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz +RUN tar -xf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz + # Create the environment RUN conda env create -f environment.yml @@ -22,6 +26,11 @@ RUN which python # Build LongReadSum RUN make +# Set up the HDF5 plugin path +#RUN export HDF5_PLUGIN_PATH=/longreadsum/ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/lib/plugin/ +ENV HDF5_PLUGIN_PATH="/longreadsum/ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/lib/plugin/" +RUN echo "The HDF5_PLUGIN_PATH is $HDF5_PLUGIN_PATH" + # The code to run when container is started: WORKDIR / ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "lrst_py39", "python", "longreadsum"] diff --git a/README.md b/README.md index 71f55e6..0b8367f 100644 --- a/README.md +++ b/README.md @@ -30,9 +30,10 @@ docker pull genomicslab/longreadsum On Unix/Linux: ``` -docker run -v local/inputs/:/inputs/ -it genomicslab/longreadsum bam -i /inputs/input.bam +docker run -v C:/Users/.../DataDirectory:/mnt/ -it genomicslab/longreadsum bam -i /mnt/input.bam -o /mnt/output ``` -Note that the `-v` command is required for Docker to find the input file. In the above BAM file example, the local directory `local/inputs/` containing the input file is mapped to a directory `/inputs/` in the Docker container. Thus, the input file is specified as `/inputs/input.bam` +Note that the `-v` command is required for Docker to find the input file. Use a directory under `C:/Users/` to ensure volume files are mounted correctly. In the above example, the local directory `C:/Users/.../DataDirectory` containing the input file `input.bam` is mapped to a directory `/mnt/` in the Docker container. Thus, the input file and output directory arguments are relative to the `/mnt/` directory, but the output files will also be saved locally in `C:/Users/.../DataDirectory` under the specified subdirectory `output`. + # Installation using Anaconda First install [Anaconda](https://www.anaconda.com/). Then follow the instructions below to install LongReadSum and its dependencies: @@ -47,6 +48,23 @@ conda activate lrst_py39 make ``` + +If you are using FAST5 files with VBZ compression, you will need to download and install the VBZ plugin corresponding to your architecture: +https://github.com/nanoporetech/vbz_compression/releases + +For example: + +``` +wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz +tar -xf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz +``` + +Finally, add the plugin to your path: +``` +export HDF5_PLUGIN_PATH=/full/path/to/ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/lib/plugin +``` + + ## Running Activate the conda environment: diff --git a/environment.yml b/environment.yml index 3d87e49..8bfb8f1 100644 --- a/environment.yml +++ b/environment.yml @@ -9,7 +9,6 @@ dependencies: - hdf5 - htslib - samtools - - minimap2 - swig - matplotlib - plotly=4.14 diff --git a/include/FAST5_module.h b/include/FAST5_module.h index a827a9a..c6bdf8a 100644 --- a/include/FAST5_module.h +++ b/include/FAST5_module.h @@ -4,6 +4,7 @@ #include "InputStructure.h" #include "OutputStructures.h" + int generateQCForFAST5(Input_Para &_input_data, Output_FAST5 &output_data); #endif diff --git a/lib/hdf5-1_12_1/COPYING b/lib/hdf5-1_12_1/COPYING deleted file mode 100644 index 9d32232..0000000 --- a/lib/hdf5-1_12_1/COPYING +++ /dev/null @@ -1,106 +0,0 @@ -Copyright Notice and License Terms for -HDF5 (Hierarchical Data Format 5) Software Library and Utilities ------------------------------------------------------------------------------ - -HDF5 (Hierarchical Data Format 5) Software Library and Utilities -Copyright 2006 by The HDF Group. - -NCSA HDF5 (Hierarchical Data Format 5) Software Library and Utilities -Copyright 1998-2006 by The Board of Trustees of the University of Illinois. - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted for any purpose (including commercial purposes) -provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions, and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions, and the following disclaimer in the documentation - and/or materials provided with the distribution. - -3. Neither the name of The HDF Group, the name of the University, nor the - name of any Contributor may be used to endorse or promote products derived - from this software without specific prior written permission from - The HDF Group, the University, or the Contributor, respectively. - -DISCLAIMER: -THIS SOFTWARE IS PROVIDED BY THE HDF GROUP AND THE CONTRIBUTORS -"AS IS" WITH NO WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED. IN NO -EVENT SHALL THE HDF GROUP OR THE CONTRIBUTORS BE LIABLE FOR ANY DAMAGES -SUFFERED BY THE USERS ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN IF -ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -You are under no obligation whatsoever to provide any bug fixes, patches, or -upgrades to the features, functionality or performance of the source code -("Enhancements") to anyone; however, if you choose to make your Enhancements -available either publicly, or directly to The HDF Group, without imposing a -separate written license agreement for such Enhancements, then you hereby -grant the following license: a non-exclusive, royalty-free perpetual license -to install, use, modify, prepare derivative works, incorporate into other -computer software, distribute, and sublicense such enhancements or derivative -works thereof, in binary and source code form. - ------------------------------------------------------------------------------ ------------------------------------------------------------------------------ - -Limited portions of HDF5 were developed by Lawrence Berkeley National -Laboratory (LBNL). LBNL's Copyright Notice and Licensing Terms can be -found here: COPYING_LBNL_HDF5 file in this directory or at -http://support.hdfgroup.org/ftp/HDF5/releases/COPYING_LBNL_HDF5. - ------------------------------------------------------------------------------ ------------------------------------------------------------------------------ - -Contributors: National Center for Supercomputing Applications (NCSA) at -the University of Illinois, Fortner Software, Unidata Program Center -(netCDF), The Independent JPEG Group (JPEG), Jean-loup Gailly and Mark Adler -(gzip), and Digital Equipment Corporation (DEC). - ------------------------------------------------------------------------------ - -Portions of HDF5 were developed with support from the Lawrence Berkeley -National Laboratory (LBNL) and the United States Department of Energy -under Prime Contract No. DE-AC02-05CH11231. - ------------------------------------------------------------------------------ - -Portions of HDF5 were developed with support from Lawrence Livermore -National Laboratory and the United States Department of Energy under -Prime Contract No. DE-AC52-07NA27344. - ------------------------------------------------------------------------------ - -Portions of HDF5 were developed with support from the University of -California, Lawrence Livermore National Laboratory (UC LLNL). -The following statement applies to those portions of the product and must -be retained in any redistribution of source code, binaries, documentation, -and/or accompanying materials: - - This work was partially produced at the University of California, - Lawrence Livermore National Laboratory (UC LLNL) under contract - no. W-7405-ENG-48 (Contract 48) between the U.S. Department of Energy - (DOE) and The Regents of the University of California (University) - for the operation of UC LLNL. - - DISCLAIMER: - THIS WORK WAS PREPARED AS AN ACCOUNT OF WORK SPONSORED BY AN AGENCY OF - THE UNITED STATES GOVERNMENT. NEITHER THE UNITED STATES GOVERNMENT NOR - THE UNIVERSITY OF CALIFORNIA NOR ANY OF THEIR EMPLOYEES, MAKES ANY - WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY OR RESPONSIBILITY - FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, - APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE - WOULD NOT INFRINGE PRIVATELY- OWNED RIGHTS. REFERENCE HEREIN TO ANY - SPECIFIC COMMERCIAL PRODUCTS, PROCESS, OR SERVICE BY TRADE NAME, - TRADEMARK, MANUFACTURER, OR OTHERWISE, DOES NOT NECESSARILY CONSTITUTE - OR IMPLY ITS ENDORSEMENT, RECOMMENDATION, OR FAVORING BY THE UNITED - STATES GOVERNMENT OR THE UNIVERSITY OF CALIFORNIA. THE VIEWS AND - OPINIONS OF AUTHORS EXPRESSED HEREIN DO NOT NECESSARILY STATE OR REFLECT - THOSE OF THE UNITED STATES GOVERNMENT OR THE UNIVERSITY OF CALIFORNIA, - AND SHALL NOT BE USED FOR ADVERTISING OR PRODUCT ENDORSEMENT PURPOSES. - ------------------------------------------------------------------------------ - - diff --git a/lib/hdf5-1_12_1/libhdf5_cpp.so b/lib/hdf5-1_12_1/libhdf5_cpp.so deleted file mode 100644 index 43d602f..0000000 Binary files a/lib/hdf5-1_12_1/libhdf5_cpp.so and /dev/null differ diff --git a/src/FAST5_module.cpp b/src/FAST5_module.cpp index c2500a1..87758a1 100644 --- a/src/FAST5_module.cpp +++ b/src/FAST5_module.cpp @@ -27,11 +27,10 @@ Class for calling FAST5 statistics modules. using namespace H5; - +// Split a string into tokens std::vector splitString(const std::string& str) { std::vector tokens; - std::stringstream ss(str); std::string token; while (std::getline(ss, token, '\n')) { @@ -45,175 +44,145 @@ std::vector splitString(const std::string& str) // Read the FASTQ dataset from the file std::vector getFastq(H5::H5File f5, std::string basecall_group) { - // Attempt to access the FASTQ dataset - H5::DataSet dataset; - dataset = f5.openDataSet("/Analyses/"+basecall_group+"/BaseCalled_template/Fastq"); - H5::DataType datatype = dataset.getDataType(); - - // Read output - std::string data; - dataset.read(data, datatype); - std::vector tokens = splitString(data); + std::vector sequence; // Output header and sequence + H5::Exception::dontPrint(); // Disable error printing + try { + // Attempt to access the FASTQ dataset + H5::DataSet dataset; + dataset = f5.openDataSet("/Analyses/"+basecall_group+"/BaseCalled_template/Fastq"); + H5::DataType datatype = dataset.getDataType(); + + // Read output + std::string data; + dataset.read(data, datatype); + sequence = splitString(data); + } catch (FileIException &error) { + } - return tokens; + return sequence; } - -// Add base and read QC to the output data structure adn the output details file -static int writeBaseQCDetails(const char *input_file, Output_FAST5 &output_data, FILE *read_details_fp) +// Read alignment data from the file to get start and end map positions +std::vector getMapPositions(H5::H5File f5) { - int exit_code = 0; - const char * read_name; - char baseq; - double gc_content_pct; - std::string basecall_group = "Basecall_1D_000"; - - // Access output QC and base quality QC structures - Basic_Seq_Statistics &long_read_info = output_data.long_read_info; - Basic_Seq_Quality_Statistics &seq_quality_info = output_data.seq_quality_info; - - // Run QC on the HDF5 file + std::vector map_positions; // Start and end map positions H5::Exception::dontPrint(); // Disable error printing try { - // Open the file - H5::H5File f5 = H5::H5File(input_file, H5F_ACC_RDONLY); - - // Get the FASTQ dataset - std::vector fq = getFastq(f5, basecall_group); - - // Access the read name - std::string header_str = fq[0]; - std::istringstream iss_header( header_str ); - std::string read_name_str; - std::getline( iss_header, read_name_str, ' ' ); - read_name = read_name_str.c_str(); - - // Access the sequence data - std::string sequence_data_str = fq[1]; - - // Update the total number of bases - int base_count = sequence_data_str.length(); - long_read_info.total_num_bases += base_count; - - // Store the read length - long_read_info.read_lengths.push_back(base_count); - - // Access base quality data - char value; - std::vector base_quality_values; - std::string base_quality_str = fq[3]; - std::istringstream iss( base_quality_str ); - while (iss >> value) { - int base_quality_value = value - '!'; // '!' symbol represent 0-quality score - base_quality_values.push_back(base_quality_value); - } - - // Update the base quality and GC content information - int gc_count = 0; - double read_mean_base_qual = 0; - char current_base; - for (int i = 0; i < base_count; i++) - { - current_base = sequence_data_str[i]; - if (current_base == 'A' || current_base == 'a') - { - long_read_info.total_a_cnt += 1; - } - else if (current_base == 'G' || current_base == 'g') - { - long_read_info.total_g_cnt += 1; - gc_count += 1; - } - else if (current_base == 'C' || current_base == 'c') - { - long_read_info.total_c_cnt += 1; - gc_count += 1; - } - else if (current_base == 'T' || current_base == 't' || current_base == 'U' || current_base == 'u') - { - long_read_info.total_tu_cnt += 1; - } - baseq = base_quality_values[i]; // Get the base quality - seq_quality_info.base_quality_distribution[baseq] += 1; - read_mean_base_qual += baseq; - } - - // Calculate percent guanine & cytosine - gc_content_pct = 100.0 *( (double)gc_count / (double)base_count ); + // Get the alignment start position + H5::Group align_group_obj = f5.openGroup("/Analyses/RawGenomeCorrected_000/BaseCalled_template/Alignment"); + H5::Attribute start_map_obj = align_group_obj.openAttribute("mapped_start"); + H5::DataType start_map_datatype= start_map_obj.getDataType(); + int start_map_buffer [1]; + start_map_obj.read(start_map_datatype, start_map_buffer); + int start_map = start_map_buffer[0]; + map_positions.push_back(start_map); + std::cout << "Start map = " << start_map << std::endl; + + // Get the alignment end position + H5::Attribute end_map_obj = align_group_obj.openAttribute("mapped_end"); + H5::DataType end_map_datatype= end_map_obj.getDataType(); + int end_map_buffer [1]; + end_map_obj.read(end_map_datatype, end_map_buffer); + int end_map = end_map_buffer[0]; + map_positions.push_back(end_map); + std::cout << "End map = " << end_map << std::endl; - // Look into this section - long_read_info.read_gc_content_count[(int)(gc_content_pct + 0.5)] += 1; - read_mean_base_qual /= (double) base_count; - seq_quality_info.read_average_base_quality_distribution[(uint)(read_mean_base_qual + 0.5)] += 1; - fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name, base_count, gc_content_pct, read_mean_base_qual); - - // Update the total number of reads (1 per FAST5 file) - output_data.long_read_info.total_num_reads += 1; } catch (FileIException &error) { - // If the dataset is missing, continue and ignore this file - if (error.getFuncName() == "H5File::openDataSet") { - std::cout << "No FASTQ sequence dataset found for file: " << input_file << std::endl; - } else { - std::cerr << "Error accessing the FAST5 file: " << input_file << std::endl; - error.printErrorStack(); - } - exit_code = 2; - }; + } catch (AttributeIException &error) { + } - return exit_code; + return map_positions; } - -// Add read signal QC to the output data structure and the output details file -static int writeSignalQCDetails(const char *input_file, Output_FAST5 &output_data) +// Read alignment data from the file to get the mapped chromosome +std::string getChromosome(H5::H5File f5) { - int exit_code = 0; - std::string basecall_group = "Basecall_1D_000"; - -// // Open the CSV files -// std::ofstream raw_csv; -// raw_csv.open(signal_raw_csv); -// std::ofstream qc_csv; -// qc_csv.open(signal_qc_csv); - - // Run QC on the HDF5 file - //H5::Exception::dontPrint(); // Disable error printing + std::string mapped_chrom; // Start and end map positions + std::string error_msg("No mapping found."); // Message when no basecalling is found. + H5::Exception::dontPrint(); // Disable error printing try { - // Open the file - H5::H5File f5 = H5::H5File(input_file, H5F_ACC_RDONLY); + // Get the alignment start position + std::cout << "Opening mapping group..." << std::endl; + H5::Group align_group_obj = f5.openGroup("/Analyses/RawGenomeCorrected_000/BaseCalled_template/Alignment"); + H5::Attribute chrom_obj = align_group_obj.openAttribute("mapped_chrom"); + H5::DataType chrom_datatype= chrom_obj.getDataType(); + std::string chrom_buffer; + chrom_obj.read(chrom_datatype, chrom_buffer); + std::cout << "Chromosome = " << chrom_buffer << std::endl; + mapped_chrom = chrom_buffer; + } catch (FileIException &error) { + std::cout << error_msg << std::endl; + } catch (AttributeIException &error) { + std::cout << error_msg << std::endl; + } - // Get the read name - std::string signal_group_str = "/Raw/Reads"; - H5::Group signal_group = f5.openGroup(signal_group_str); - std::string read_name; - read_name = signal_group.getObjnameByIdx(0); + return mapped_chrom; +} - // Get the sequence string - std::vector fq = getFastq(f5, basecall_group); - std::string sequence_data_str = fq[1]; +// Get the read name for a single-read FAST5 file +std::string getFileReadName(H5::H5File f5) { + std::string read_name; + H5::Group signal_group = f5.openGroup("/Raw/Reads"); + read_name = signal_group.getObjnameByIdx(0); - // Format the read signal dataset name - std::ostringstream ss; - ss << signal_group_str << "/" << read_name << "/Signal"; - std::string signal_dataset_str = ss.str(); + return read_name; +} - // Get the signal dataset - H5::DataSet signal_ds = f5.openDataSet(signal_dataset_str); - H5::DataType mdatatype= signal_ds.getDataType(); - H5::DataSpace dataspace = signal_ds.getSpace(); - hsize_t dims[2]; - dataspace.getSimpleExtentDims(dims, NULL); // rank = 1 - int data_count = dims[0]; +// Format read base signal data from the specified signal group +Base_Signals getReadBaseSignalData(H5::H5File f5, std::string read_name, bool single_read) +{ + // Get the read name + std::string sequence_data_str(""); + std::vector> basecall_signals; // Holds signals across bases + std::string mapped_chrom; + std::vector map_positions; + + // Access signal data + std::string signal_group_str; + std::string signal_dataset_str; + if (single_read) { + signal_group_str = "/Raw/Reads"; + signal_dataset_str = signal_group_str + "/" + read_name + "/Signal"; + } else { + signal_group_str = "/" + read_name + "/Raw"; + signal_dataset_str = signal_group_str + "/Signal"; + } - // Store the signals in an array - uint16_t f5signals_u16 [data_count]; - signal_ds.read(f5signals_u16, mdatatype); + //std::cout << "Opening signal group " << signal_group_str << std::endl; + H5::Group signal_group = f5.openGroup(signal_group_str); + + // Get the signal dataset + //std::cout << "Opening dataset " << signal_dataset_str << std::endl; + H5::DataSet signal_ds = f5.openDataSet(signal_dataset_str); + H5::DataType mdatatype= signal_ds.getDataType(); + H5::DataSpace dataspace = signal_ds.getSpace(); + hsize_t dims[2]; + dataspace.getSimpleExtentDims(dims, NULL); // rank = 1 + int data_count = dims[0]; + + // Store the signals in an array + int16_t f5signals_16 [data_count]; + signal_ds.read(f5signals_16, mdatatype); + + // Cast signals to int + int f5signals [data_count]; + for (int i = 0; i < data_count; i++) { f5signals[i] = (int) f5signals_16[i]; }; + + // Get the sequence string if available + std::string basecall_group("Basecall_1D_000"); + std::vector fq = getFastq(f5, basecall_group); + if (fq.empty()){ + // Return the raw signal only + int n = sizeof(f5signals) / sizeof(f5signals[0]); + std::vector all_signals(f5signals, f5signals + n); + basecall_signals.push_back(all_signals); - // Cast signals to int - int f5signals [data_count]; - std::copy(f5signals_u16, f5signals_u16 + data_count, f5signals); + } else { + // Access signal basecalling information + sequence_data_str = fq[1]; // Get the block stride (window length) attribute + std::cout << "Opening basecall group..." << std::endl; H5::Group group_obj = f5.openGroup("/Analyses/Basecall_1D_000/Summary/basecall_1d_template"); H5::Attribute block_stride_obj = group_obj.openAttribute("block_stride"); H5::DataType bs_datatype= block_stride_obj.getDataType(); @@ -226,9 +195,9 @@ static int writeSignalQCDetails(const char *input_file, Output_FAST5 &output_dat H5::DataType sl_datatype= seq_length_obj.getDataType(); int sl_buffer [1]; seq_length_obj.read(sl_datatype, sl_buffer); - //int sequence_length = sl_buffer[0]; // Get the raw signal basecall start position + std::cout << "Opening segmentation group..." << std::endl; H5::Group segm_group_obj = f5.openGroup("/Analyses/Segmentation_000/Summary/segmentation"); H5::Attribute start_attr_obj = segm_group_obj.openAttribute("first_sample_template"); H5::DataType st_datatype= start_attr_obj.getDataType(); @@ -245,19 +214,21 @@ static int writeSignalQCDetails(const char *input_file, Output_FAST5 &output_dat move_dataspace.getSimpleExtentDims(move_dims, NULL); // rank = 1 int move_data_count = move_dims[0]; + // Get alignment information + map_positions = getMapPositions(f5); + mapped_chrom = getChromosome(f5); + // Read the boolean array uint8_t move_bool [move_data_count]; move_dataset_obj.read(move_bool, move_datatype, move_dataspace); // Segment the raw signal by the window length std::vector called_base_signal; // Holds the current base's signal - std::vector> basecall_signals; // Holds signals across bases int basecall_index = 0; int base_start_index = start_index; int base_count = 0; for (int i = 0; i < move_data_count; i++) { - // Grab the signal int end_index = base_start_index + block_stride_value; std::vector block_signal(block_stride_value); @@ -284,32 +255,206 @@ static int writeSignalQCDetails(const char *input_file, Output_FAST5 &output_dat base_start_index += block_stride_value; basecall_index ++; } + } + + // Set up the read information header for the output report + std::string read_info = read_name; + if (!mapped_chrom.empty()) { + read_info += " " + mapped_chrom; + if (!map_positions.empty()) { + std::string map_start = std::to_string(map_positions[0]); + read_info += " [" + map_start; + std::string map_end = std::to_string(map_positions[1]); + read_info += ", " + map_end + "]"; + } + } + + // Append the basecall signals to the output structure + Base_Signals basecall_obj(read_info, sequence_data_str, basecall_signals); - // Append the basecall signals to the output structure - Base_Signals basecall_obj(read_name, sequence_data_str, basecall_signals); - output_data.addReadBaseSignals(basecall_obj); + return basecall_obj; +} + +// Add base and read QC to the output data structure adn the output details file +static int writeBaseQCDetails(const char *input_file, Output_FAST5 &output_data, FILE *read_details_fp) +{ + int exit_code = 0; + const char * read_name; + char baseq; + double gc_content_pct; + std::string basecall_group = "Basecall_1D_000"; + + // Access output QC and base quality QC structures + Basic_Seq_Statistics &long_read_info = output_data.long_read_info; + Basic_Seq_Quality_Statistics &seq_quality_info = output_data.seq_quality_info; + + // Run QC on the HDF5 file + H5::Exception::dontPrint(); // Disable error printing + try { + // Open the file + H5::H5File f5 = H5::H5File(input_file, H5F_ACC_RDONLY); + + // Get the FASTQ dataset + std::vector fq = getFastq(f5, basecall_group); + if (fq.empty()){ + std::cerr << "No sequence data found. Signal QC may be generated using the 'f5s' option." << std::endl; + exit_code = 2; + } else { + // Access the read name + std::string header_str = fq[0]; + std::istringstream iss_header( header_str ); + std::string read_name_str; + std::getline( iss_header, read_name_str, ' ' ); + read_name = read_name_str.c_str(); + + // Access the sequence data + std::string sequence_data_str = fq[1]; + + // Update the total number of bases + int base_count = sequence_data_str.length(); + long_read_info.total_num_bases += base_count; + + // Store the read length + long_read_info.read_lengths.push_back(base_count); + + // Access base quality data + char value; + std::vector base_quality_values; + std::string base_quality_str = fq[3]; + std::istringstream iss( base_quality_str ); + while (iss >> value) { + int base_quality_value = value - '!'; // '!' symbol represent 0-quality score + base_quality_values.push_back(base_quality_value); + } + + // Update the base quality and GC content information + int gc_count = 0; + double read_mean_base_qual = 0; + char current_base; + for (int i = 0; i < base_count; i++) + { + current_base = sequence_data_str[i]; + if (current_base == 'A' || current_base == 'a') + { + long_read_info.total_a_cnt += 1; + } + else if (current_base == 'G' || current_base == 'g') + { + long_read_info.total_g_cnt += 1; + gc_count += 1; + } + else if (current_base == 'C' || current_base == 'c') + { + long_read_info.total_c_cnt += 1; + gc_count += 1; + } + else if (current_base == 'T' || current_base == 't' || current_base == 'U' || current_base == 'u') + { + long_read_info.total_tu_cnt += 1; + } + baseq = base_quality_values[i]; // Get the base quality + seq_quality_info.base_quality_distribution[baseq] += 1; + read_mean_base_qual += baseq; + } + + // Calculate percent guanine & cytosine + gc_content_pct = 100.0 *( (double)gc_count / (double)base_count ); + + // Look into this section + long_read_info.read_gc_content_count[(int)(gc_content_pct + 0.5)] += 1; + read_mean_base_qual /= (double) base_count; + seq_quality_info.read_average_base_quality_distribution[(uint)(read_mean_base_qual + 0.5)] += 1; + fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name, base_count, gc_content_pct, read_mean_base_qual); + + // Update the total number of reads (1 per FAST5 file) + output_data.long_read_info.total_num_reads += 1; + } + } catch (FileIException &error) { + // If the dataset is missing, continue and ignore this file + if (error.getFuncName() == "H5File::openDataSet") { + std::cout << "No FASTQ sequence dataset found for file: " << input_file << std::endl; + } else { + std::cerr << "Error accessing the FAST5 file: " << input_file << std::endl; + error.printErrorStack(); + } + exit_code = 2; + }; + + return exit_code; +} + + +// Add read signal QC to the output data structure and the output details file +static int writeSignalQCDetails(const char *input_file, Output_FAST5 &output_data) +{ + int exit_code = 0; + +// // Open the CSV files +// std::ofstream raw_csv; +// raw_csv.open(signal_raw_csv); +// std::ofstream qc_csv; +// qc_csv.open(signal_qc_csv); + + // Run QC on the HDF5 file + //H5::Exception::dontPrint(); // Disable error printing + try { + // Open the file + H5::H5File f5 = H5::H5File(input_file, H5F_ACC_RDONLY); + + // Check if it is a multi-read FAST5 file + std::string signal_group_str; + std::string read_name; + if (f5.nameExists("/Raw")) { + std::cout << "Single read FAST5" << std::endl; + + // Append the basecall signals to the output structure + signal_group_str = "/Raw/Reads"; + read_name = getFileReadName(f5); + std::cout << read_name << std::endl; + Base_Signals basecall_obj = getReadBaseSignalData(f5, read_name, true); + output_data.addReadBaseSignals(basecall_obj); + } else { + std::cout << "Multi-read FAST5" << std::endl; + + // Loop through each read + H5::Group root_group = f5.openGroup("/"); + size_t num_objs = root_group.getNumObjs(); + for (size_t i=0; i < num_objs; i++) { + read_name = root_group.getObjnameByIdx(i); + //std::cout << read_name << std::endl; + + // Append the basecall signals to the output structure + //signal_group_str = "/" + read_name + "/Raw"; + Base_Signals basecall_obj = getReadBaseSignalData(f5, read_name, false); + output_data.addReadBaseSignals(basecall_obj); + } + } // catch failure caused by the H5File operations } catch (FileIException &error) { + std::cerr << "FileIException" << std::endl; error.printErrorStack(); exit_code = 2; } // catch failure caused by the DataSet operations catch (DataSetIException &error) { + std::cerr << "DataSetIException" << std::endl; error.printErrorStack(); exit_code = 2; } // catch failure caused by the DataSpace operations catch (DataSpaceIException &error) { + std::cerr << "DataSpaceIException" << std::endl; error.printErrorStack(); exit_code = 2; } // catch failure caused by the Attribute operations catch (AttributeIException &error) { + std::cerr << "AttributeIException" << std::endl; error.printErrorStack(); exit_code = 2; } diff --git a/src/cli.py b/src/cli.py index ee90051..710a572 100644 --- a/src/cli.py +++ b/src/cli.py @@ -5,10 +5,11 @@ """ import os +import logging import sys + from glob import glob import argparse -import logging from argparse import RawTextHelpFormatter from src import generate_html from src import lrst_global @@ -18,6 +19,7 @@ import faulthandler faulthandler.enable() +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) def get_log_level(log_l): @@ -39,35 +41,40 @@ def get_log_level(log_l): return logging.ERROR -def get_common_param(margs, para_dict): +def get_common_param(margs): """ - Format a dict object para_dict to contain our input files/output folder parameters + Format a dict object param_dict to contain our input files/output folder parameters Output a string containing all parse argument errors. Inputs: margs: Command line input arguments (dict). """ - # TODO: The para_dict input is always empty in this function. Thus, it can be defined in this function. + param_dict = {} this_error_str = "" if (margs.input == None or margs.input == "") and (margs.inputs == None or margs.inputs == "") and (margs.inputPattern == None or margs.inputPattern == ""): this_error_str += "No input file(s) are provided. \n" else: # Group parameters into an array - para_dict["input_files"] = [] + param_dict["input_files"] = [] if not (margs.input == None or margs.input == ""): input_filepath = margs.input.name - para_dict["input_files"].append(input_filepath) + param_dict["input_files"].append(input_filepath) if not (margs.inputs == None or margs.inputs == ""): file_list = [file_str.strip() for file_str in margs.inputs.split(',')] - para_dict["input_files"].extend(file_list) + param_dict["input_files"].extend(file_list) if not (margs.inputPattern == None or margs.inputPattern == ""): pat_split = margs.inputPattern.split("*") - para_dict["input_files"].extend( + param_dict["input_files"].extend( glob(os.path.join("*".join(pat_split[:-1]), "*"+pat_split[-1]))) - if len(para_dict["input_files"]) == 0: + + # Number of reads to sample + read_count = margs.readCount + param_dict["read_count"] = read_count + + if len(param_dict["input_files"]) == 0: this_error_str += "No input file(s) can be found. \n" else: - for input_filepath in para_dict["input_files"]: + for input_filepath in param_dict["input_files"]: if not os.path.isfile(input_filepath): this_error_str += "Cannot find the input file: " + input_filepath + "\n" @@ -75,7 +82,7 @@ def get_common_param(margs, para_dict): this_error_str += "No output file is provided. \n" else: output_dir = margs.outputfolder - para_dict["output_folder"] = output_dir + param_dict["output_folder"] = output_dir try: if not os.path.isdir(output_dir): os.makedirs(output_dir) @@ -85,299 +92,292 @@ def get_common_param(margs, para_dict): except OSError as e: this_error_str += "Cannot create folder for " + \ - para_dict["output_folder"]+" \n" - para_dict["out_prefix"] = margs.outprefix + param_dict["output_folder"]+" \n" + param_dict["out_prefix"] = margs.outprefix if (margs.log == None or margs.log == ""): this_error_str += "No log file is provided. \n" else: - para_dict["log_file"] = margs.log - para_dict["log_level"] = margs.Log_level + param_dict["log_file"] = margs.log + param_dict["log_level"] = margs.Log_level logging.basicConfig(filename=margs.log, level=get_log_level(margs.Log_level), filemode='w', format="%(levelname)s: %(message)s") - para_dict["downsample_percentage"] = margs.downsample_percentage + param_dict["downsample_percentage"] = margs.downsample_percentage - para_dict["threads"] = margs.thread + param_dict["threads"] = margs.thread - para_dict["random_seed"] = margs.seed + param_dict["random_seed"] = margs.seed - para_dict["detail"] = margs.detail + param_dict["detail"] = margs.detail # Plot style parameters - para_dict["fontsize"] = margs.fontsize - para_dict["markersize"] = margs.markersize + param_dict["fontsize"] = margs.fontsize + param_dict["markersize"] = margs.markersize - return this_error_str + return param_dict, this_error_str def fq_module(margs): - para_dict = {} - errorStr = lrst_global.originalError - - errorStr += get_common_param(margs, para_dict) - - if not errorStr == lrst_global.originalError: - print(errorStr) - print("#############################################\n") - # parser.print_help() + """ + Run the FASTQ filetype module. + """ + # Format a dict object to contain our input files/output folder parameters + param_dict, error_msg = get_common_param(margs) + if not error_msg == "": + logging.error(error_msg) parser.parse_args(['fq', '--help']) sys.exit(1001) else: - logging.info('Input file(s) are ' + ';'.join(para_dict["input_files"])) - para_dict["out_prefix"] += "fq_"; + logging.info('Input file(s) are ' + ';'.join(param_dict["input_files"])) + param_dict["out_prefix"] += "fq_"; # Import the SWIG Python wrapper for our C++ module input_para = lrst.Input_Para() - input_para.threads = para_dict["threads"] - input_para.rdm_seed = para_dict["random_seed"] - input_para.downsample_percentage = para_dict["downsample_percentage"] + input_para.threads = param_dict["threads"] + input_para.rdm_seed = param_dict["random_seed"] + input_para.downsample_percentage = param_dict["downsample_percentage"] input_para.other_flags = 0 input_para.user_defined_fastq_base_qual_offset = margs.udqual; - input_para.output_folder = str(para_dict["output_folder"]) - input_para.out_prefix = str(para_dict["out_prefix"]) + input_para.output_folder = str(param_dict["output_folder"]) + input_para.out_prefix = str(param_dict["out_prefix"]) - for _ipf in para_dict["input_files"]: + for _ipf in param_dict["input_files"]: input_para.add_input_file(str(_ipf)) fq_output = lrst.Output_FQ() exit_code = lrst.callFASTQModule(input_para, fq_output) if exit_code == 0: - create_base_quality_plots(fq_output, para_dict, "FASTQ: Basic statistics") + create_base_quality_plots(fq_output, param_dict, "FASTQ: Basic statistics") for static in [True, False]: fq_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st","read_length_hist", "base_st", "basic_info", "base_quality", "read_avg_base_quality"], "FASTQ QC", para_dict], static=static) + [["basic_st", "read_length_st","read_length_hist", "base_st", "basic_info", "base_quality", "read_avg_base_quality"], "FASTQ QC", param_dict], static=static) fq_html_gen.generate_st_html() - print("Call FQ-module done!") + logging.info("Completed.") + else: + logging.error("Completed with errors.") def fa_module(margs): """ Run the FASTA filetype module. """ - para_dict = {} - errorStr = lrst_global.originalError - # Format a dict object to contain our input files/output folder parameters - errorStr += get_common_param(margs, para_dict) # TODO: para_dict is always an empty dict in these calls + param_dict, error_msg = get_common_param(margs) - # If original error string has not been appended to by get_common_params(), this means that there were no parse - # errors. TODO: originalError can be an empty string in lrst_global.py - # TODO: Define these global variables at the top of this file (or module function) instead - if not errorStr == lrst_global.originalError: + if not error_msg == "": # If there are parse errors, display the filetype-specific help instructions - print(errorStr) - print("#############################################\n") - # parser.print_help() + logging.error(error_msg) parser.parse_args(['fa', '--help']) sys.exit(1002) else: # If there are no parse errors, run the filetype-specific module - logging.info('Input file(s) are ' + ';'.join(para_dict["input_files"])) - para_dict["out_prefix"] += "fa_" + logging.info('Input file(s) are ' + ';'.join(param_dict["input_files"])) + param_dict["out_prefix"] += "fa_" input_para = lrst.Input_Para() - input_para.threads = para_dict["threads"] - input_para.rdm_seed = para_dict["random_seed"] - input_para.downsample_percentage = para_dict["downsample_percentage"] + input_para.threads = param_dict["threads"] + input_para.rdm_seed = param_dict["random_seed"] + input_para.downsample_percentage = param_dict["downsample_percentage"] input_para.other_flags = 0 - input_para.output_folder = str(para_dict["output_folder"]) - input_para.out_prefix = str(para_dict["out_prefix"]) + input_para.output_folder = str(param_dict["output_folder"]) + input_para.out_prefix = str(param_dict["out_prefix"]) - for _ipf in para_dict["input_files"]: + for _ipf in param_dict["input_files"]: input_para.add_input_file(str(_ipf)) fa_output = lrst.Output_FA() exit_code = lrst.callFASTAModule(input_para, fa_output) if exit_code == 0: + logging.info("QC generated.") + logging.info("Generating output files...") from src import plot_for_FA - plot_for_FA.fa_plot(fa_output, para_dict) + plot_for_FA.fa_plot(fa_output, param_dict) # TODO: Unused 'static' variable results in redundant function call for static in [True, False]: fa_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st","read_length_hist", "base_st", "basic_info"], "FASTA QC", para_dict], static=True) + [["basic_st", "read_length_st","read_length_hist", "base_st", "basic_info"], "FASTA QC", param_dict], static=True) fa_html_gen.generate_st_html() - print("Call FA-module done!") + else: + logging.error("QC did not generate.") + + logging.info("Done.") def bam_module(margs): """ Run the BAM filetype module. """ - para_dict = {} - errorStr = lrst_global.originalError - errorStr += get_common_param(margs, para_dict) - - if not errorStr == lrst_global.originalError: - print(errorStr) - print("#############################################\n") - # parser.print_help() + param_dict, error_msg = get_common_param(margs) + + if not error_msg == "": + logging.error(error_msg) parser.parse_args(['bam', '--help']) sys.exit(1003) else: - logging.info('Input file(s) are ' + ';'.join(para_dict["input_files"])) - para_dict["out_prefix"] += "bam_"; + logging.info('Input file(s) are ' + ';'.join(param_dict["input_files"])) + param_dict["out_prefix"] += "bam_"; input_para = lrst.Input_Para() - input_para.threads = para_dict["threads"] - input_para.rdm_seed = para_dict["random_seed"] - input_para.downsample_percentage = para_dict["downsample_percentage"] - input_para.other_flags = (1 if para_dict["detail"]>0 else 0) ; - input_para.output_folder = str(para_dict["output_folder"]) - input_para.out_prefix = str(para_dict["out_prefix"]) - - for _ipf in para_dict["input_files"]: + input_para.threads = param_dict["threads"] + input_para.rdm_seed = param_dict["random_seed"] + input_para.downsample_percentage = param_dict["downsample_percentage"] + input_para.other_flags = (1 if param_dict["detail"]>0 else 0) ; + input_para.output_folder = str(param_dict["output_folder"]) + input_para.out_prefix = str(param_dict["out_prefix"]) + + for _ipf in param_dict["input_files"]: input_para.add_input_file(str(_ipf)) bam_output = lrst.Output_BAM() exit_code = lrst.callBAMModule(input_para, bam_output) if exit_code == 0: + logging.info("QC generated.") + logging.info("Generating output files...") from src import plot_for_BAM - plot_for_BAM.bam_plot(bam_output, para_dict) + plot_for_BAM.bam_plot(bam_output, param_dict) for static in [True, False]: bam_html_gen = generate_html.ST_HTML_Generator( - [["basic_st","map_st", "err_st", "read_length_st", "read_length_hist", "base_st", "basic_info", "base_quality"], "BAM QC", para_dict], static=static) + [["basic_st","map_st", "err_st", "read_length_st", "read_length_hist", "base_st", "basic_info", "base_quality"], "BAM QC", param_dict], static=static) bam_html_gen.generate_st_html() + else: + logging.error("QC did not generate.") - print("Call BAM-module done!") + logging.info("Done.") def seqtxt_module(margs): """ Run the sequencing_summary.txt filetype module. """ - para_dict = {} - errorStr = lrst_global.originalError - errorStr += get_common_param(margs, para_dict) - - if not errorStr == lrst_global.originalError: - print(errorStr) - print("#############################################\n") - # parser.print_help() + param_dict, error_msg = get_common_param(margs) + + if not error_msg == "": + logging.error(error_msg) parser.parse_args(['seqtxt', '--help']) sys.exit(1004) else: - logging.info('Input file(s) are ' + ';'.join(para_dict["input_files"])) - para_dict["out_prefix"] += "seqtxt_"; + logging.info('Input file(s) are ' + ';'.join(param_dict["input_files"])) + param_dict["out_prefix"] += "seqtxt_"; input_para = lrst.Input_Para() - input_para.threads = para_dict["threads"] - input_para.rdm_seed = para_dict["random_seed"] - input_para.downsample_percentage = para_dict["downsample_percentage"] - - # TODO: 'seq' flag is 1 by default, which in the argument description states that only the summary text will be - # output (no HTML). + input_para.threads = param_dict["threads"] + input_para.rdm_seed = param_dict["random_seed"] + input_para.downsample_percentage = param_dict["downsample_percentage"] input_para.other_flags = margs.seq # Default = 1 input_para.other_flags = (input_para.other_flags << 4) - input_para.other_flags += (1 if para_dict["detail"]>0 else 0) + input_para.other_flags += (1 if param_dict["detail"]>0 else 0) input_para.other_flags = (input_para.other_flags << 4) input_para.other_flags += int(margs.sum_type) - input_para.output_folder = str(para_dict["output_folder"]) - input_para.out_prefix = str(para_dict["out_prefix"]) + input_para.output_folder = str(param_dict["output_folder"]) + input_para.out_prefix = str(param_dict["out_prefix"]) - for _ipf in para_dict["input_files"]: + for _ipf in param_dict["input_files"]: input_para.add_input_file(str(_ipf)) seqtxt_output = lrst.Output_SeqTxt() exit_code = lrst.callSeqTxtModule(input_para, seqtxt_output) if exit_code == 0: - print("Generating output files...") + logging.info("QC generated.") + logging.info("Generating output files...") from src import plot_for_SeqTxt - plot_for_SeqTxt.plot(seqtxt_output, para_dict) + plot_for_SeqTxt.plot(seqtxt_output, param_dict) for static in [True, False]: if margs.seq==0: - f5_html_gen = generate_html.ST_HTML_Generator([["basic_st", "read_length_st","read_length_hist","base_st","basic_info"], "sequencing_summary.txt QC", para_dict], static=static); + f5_html_gen = generate_html.ST_HTML_Generator([["basic_st", "read_length_st","read_length_hist","base_st","basic_info"], "sequencing_summary.txt QC", param_dict], static=static); else: f5_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st","read_length_hist", "basic_info"], "sequencing_summary.txt QC", para_dict], static=static) + [["basic_st", "read_length_st","read_length_hist", "basic_info"], "sequencing_summary.txt QC", param_dict], static=static) f5_html_gen.generate_st_html() - print("Done.") + else: + logging.error("QC did not generate.") + + logging.info("Done.") def fast5_module(margs): """ Run the FAST5 filetype module. """ - para_dict = {} - errorStr = lrst_global.originalError - errorStr += get_common_param(margs, para_dict) - - if not errorStr == lrst_global.originalError: - print(errorStr) - print("#############################################\n") - # parser.print_help() + param_dict, error_msg = get_common_param(margs) + + if not error_msg == "": + logging.error(error_msg) parser.parse_args(['f5', '--help']) sys.exit(1004) else: - logging.info('Input file(s) are ' + ';'.join(para_dict["input_files"])) - para_dict["out_prefix"] += "f5_" + logging.info('Input file(s) are ' + ';'.join(param_dict["input_files"])) + param_dict["out_prefix"] += "f5_" input_para = lrst.Input_Para() - input_para.threads = para_dict["threads"] - input_para.rdm_seed = para_dict["random_seed"] - input_para.downsample_percentage = para_dict["downsample_percentage"] - input_para.output_folder = str(para_dict["output_folder"]) - input_para.out_prefix = str(para_dict["out_prefix"]) + input_para.threads = param_dict["threads"] + input_para.rdm_seed = param_dict["random_seed"] + input_para.downsample_percentage = param_dict["downsample_percentage"] + input_para.output_folder = str(param_dict["output_folder"]) + input_para.out_prefix = str(param_dict["out_prefix"]) input_para.other_flags = 0 # 0 for normal QC, 1 for signal statistics output - for _ipf in para_dict["input_files"]: + for _ipf in param_dict["input_files"]: input_para.add_input_file(str(_ipf)) fast5_output = lrst.Output_FAST5() exit_code = lrst.callFAST5Module(input_para, fast5_output) if exit_code == 0: - print("Generating output files...") - create_base_quality_plots(fast5_output, para_dict, "FAST5: Basic statistics") + logging.info("QC generated.") + logging.info("Generating output files...") + create_base_quality_plots(fast5_output, param_dict, "FAST5: Basic statistics") for static in [True, False]: fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st","read_length_hist", "base_st", "basic_info", "base_quality", "read_avg_base_quality"], "FAST5 QC", para_dict], static=static) + [["basic_st", "read_length_st","read_length_hist", "base_st", "basic_info", "base_quality", "read_avg_base_quality"], "FAST5 QC", param_dict], static=static) fast5_html_obj.generate_st_html() - print("Done.") + else: + logging.error("QC did not generate.") + + logging.info("Done.") def fast5_signal_module(margs): """ Run the FAST5 filetype module with signal statistics output. """ - para_dict = {} - errorStr = lrst_global.originalError - errorStr += get_common_param(margs, para_dict) - - if not errorStr == lrst_global.originalError: - print(errorStr) - print("#############################################\n") - # parser.print_help() + param_dict, error_msg = get_common_param(margs) + + if not error_msg == "": + logging.error(error_msg) parser.parse_args(['f5s', '--help']) sys.exit(1004) else: - logging.info('Input file(s) are ' + ';'.join(para_dict["input_files"])) - para_dict["out_prefix"] += "f5s_" + logging.info('Input file(s) are ' + ';'.join(param_dict["input_files"])) + param_dict["out_prefix"] += "f5s_" input_para = lrst.Input_Para() - input_para.threads = para_dict["threads"] - input_para.rdm_seed = para_dict["random_seed"] - input_para.downsample_percentage = para_dict["downsample_percentage"] - input_para.output_folder = str(para_dict["output_folder"]) - input_para.out_prefix = str(para_dict["out_prefix"]) + input_para.threads = param_dict["threads"] + input_para.rdm_seed = param_dict["random_seed"] + input_para.downsample_percentage = param_dict["downsample_percentage"] + input_para.output_folder = str(param_dict["output_folder"]) + input_para.out_prefix = str(param_dict["out_prefix"]) input_para.other_flags = 1 # 0 for normal QC, 1 for signal statistics output - for _ipf in para_dict["input_files"]: + for _ipf in param_dict["input_files"]: input_para.add_input_file(str(_ipf)) fast5_output = lrst.Output_FAST5() exit_code = lrst.callFAST5Module(input_para, fast5_output) if exit_code == 0: - print("Generating output files...") + logging.info("QC generated.") + logging.info("Generating output files...") from src import plot_for_FAST5s - dynamic_plots = plot_for_FAST5s.plot(fast5_output, para_dict) + dynamic_plots = plot_for_FAST5s.plot(fast5_output, param_dict) # Generate a dynamic HTML file fast5_html_obj = generate_html.ST_HTML_Generator( - [[], "FAST5 signal QC", para_dict], static=False) + [[], "FAST5 signal QC", param_dict], static=False) fast5_html_obj.generate_st_html(signal_plots=dynamic_plots) + else: + logging.error("QC did not generate.") - print("Done.") + logging.info("Done.") # ===== @@ -419,6 +419,11 @@ def fast5_signal_module(margs): common_grp_param.add_argument("--markersize", type=int, default=10, help="Marker size for plots. Default: 10") +# Number of reads to sample +common_grp_param.add_argument( + "-R", "--readCount", nargs="+", type=int, default=8, + help="Set the number of reads to randomly sample from the file. Default: 8.") + # Misc. parameters input_files_group.add_argument("-p", "--downsample_percentage", type=float, default=1.0, help="The percentage of downsampling for quick run. Default: 1.0 without downsampling") @@ -473,7 +478,7 @@ def fast5_signal_module(margs): parents=[parent_parser], help="FAST5 file input with signal statistics output", description="For example:\n" - "python %(prog)s -i input.fast5 -o /output_directory/", + "python %(prog)s -R 5 10 -i input.fast5 -o /output_directory/", formatter_class=RawTextHelpFormatter) fast5_signal_parser.set_defaults(func=fast5_signal_module) @@ -504,11 +509,11 @@ def fast5_signal_module(margs): # ===== def main(): if sys.version_info[0] < 2: - print(lrst_global.prg_name + + logging.info(lrst_global.prg_name + " could not be run with lower version than python 2.7.") else: if sys.version_info[1] < 6: - print(lrst_global.prg_name+" could be run with python 2.7.") + logging.info(lrst_global.prg_name+" could be run with python 2.7.") else: if len(sys.argv) < 2: parser.print_help() diff --git a/src/generate_html.py b/src/generate_html.py index a19983b..86cbc86 100644 --- a/src/generate_html.py +++ b/src/generate_html.py @@ -30,7 +30,6 @@ def generate_header(self): self.html_writer.write("") self.html_writer.write(self.header_info) self.html_writer.write("") - self.html_writer.write('''''') @@ -245,11 +247,11 @@ def generate_left(self): '' + lrst_global.plot_filenames[_imk]['title'] + '') _imki += 1; self.html_writer.write('') - if True: # self.more_input_files: - self.html_writer.write('
  • ') - self.html_writer.write('List of input files') - _imki += 1; - self.html_writer.write('
  • ') + + self.html_writer.write('
  • ') + self.html_writer.write('List of input files') + _imki += 1; + self.html_writer.write('
  • ') self.html_writer.write("") self.html_writer.write('') @@ -298,6 +300,7 @@ def generate_left_signal_data(self, read_names): self.html_writer.write('

    Summary

    ') self.html_writer.write('
      ') + # Add the summary table section link url_index = 0 self.html_writer.write('
    • ') @@ -338,7 +341,7 @@ def generate_right_signal_data(self, read_names, signal_plot): self.html_writer.write('
      ') # Set the description - description_text = "Base signal statistics for " + read_name + description_text = "Basecall signal" self.html_writer.write( '

      ' + description_text + '

      ') diff --git a/src/lrst_global.py b/src/lrst_global.py index e2acffa..1990077 100644 --- a/src/lrst_global.py +++ b/src/lrst_global.py @@ -6,8 +6,6 @@ prg_name = "LongReadSum" -originalError = '' - LOG_ALL = 0; LOG_DEBUG = 1; LOG_INFO = 2; diff --git a/src/plot_for_FAST5s.py b/src/plot_for_FAST5s.py index 44a4056..b5e16fb 100644 --- a/src/plot_for_FAST5s.py +++ b/src/plot_for_FAST5s.py @@ -6,9 +6,11 @@ from src import lrst_global import os +import logging import csv import numpy as np import plotly.graph_objs as go +from random import sample def plot(fast5_output, para_dict): @@ -36,9 +38,20 @@ def plot(fast5_output, para_dict): table_str += "\n\n" lrst_global.plot_filenames["basic_st"]['detail'] = table_str + # Randomly sample a small set of reads if it is a large dataset + read_count_max = para_dict["read_count"] + 1 + read_indices = [] + if read_count_max <= read_count: + # No random sampling required + read_indices = list(range(1, read_count_max)) + else: + # Randomly sample from the set + unsampled_indices = list(range(1, read_count)) + read_indices = sample(unsampled_indices, read_count_max) + # Plot the reads output_html_plots = {} - for read_index in range(read_count): + for read_index in read_indices: # Create the figure fig = go.Figure() @@ -51,7 +64,10 @@ def plot(fast5_output, para_dict): nth_read_skewness = fast5_output.getNthReadPearsonSkewnessCoeff(read_index) nth_read_kurtosis = fast5_output.getNthReadKurtosis(read_index) nth_read_sequence = fast5_output.getNthReadSequence(read_index) - sequence_length = len(nth_read_sequence) + sequence_length = len(nth_read_data) + + # Check if sequence data is available + sequence_available = True if nth_read_sequence else False # Set up the output CSVs csv_qc_filepath = os.path.join(out_path, nth_read_name + '_QC.csv') @@ -60,14 +76,15 @@ def plot(fast5_output, para_dict): qc_writer.writerow(["Base", "Raw_Signal", "Length", "Mean", "Median", "StdDev", "PearsonSkewnessCoeff", "Kurtosis"]) # Loop through the data + first_index = 0 + last_index = sequence_length start_index = 0 sequence_list = list(nth_read_sequence) base_tick_values = [] # Append the last indices of the base signal to use for tick values - for i in range(sequence_length): - base_signals = nth_read_data[i] - - window_size = len(base_signals) - end_index = start_index + window_size + for i in range(first_index, last_index): + base_signals = nth_read_data[i] # Get the base's signal + signal_length = len(base_signals) + end_index = start_index + signal_length base_tick_values.append(end_index) # Plot @@ -81,14 +98,14 @@ def plot(fast5_output, para_dict): opacity=0.5)) # Update CSVs - base_value = sequence_list[i] + base_value = sequence_list[i] if sequence_available else '' signal_mean = nth_read_means[i] signal_median = nth_read_medians[i] signal_stds = nth_read_stds[i] signal_skewness = nth_read_skewness[i] signal_kurtosis = nth_read_kurtosis[i] raw_row = \ - [base_value, base_signals, window_size, + [base_value, base_signals, signal_length, signal_mean, signal_median, signal_stds, signal_skewness, signal_kurtosis] @@ -105,25 +122,31 @@ def plot(fast5_output, para_dict): marker_size = para_dict["markersize"] fig.update_layout( title=nth_read_name, - xaxis_title="Base", yaxis_title="Signal", showlegend=False, font=dict(size=font_size) ) fig.update_traces(marker={'size': marker_size}) - fig.update_xaxes(tickangle=45, - tickmode='array', - tickvals=base_tick_values, - ticktext=sequence_list) + + if sequence_available: + # Set up X tick labels + x_tick_labels = sequence_list[first_index:last_index] + fig.update_xaxes(title="Base", + tickangle=0, + tickmode='array', + tickvals=base_tick_values, + ticktext=x_tick_labels) + else: + fig.update_xaxes(title="Index") # Save image - image_filepath = os.path.join(out_path, nth_read_name + '_BaseSignal.png') - print("saving plot to ", image_filepath, "...") + image_filepath = os.path.join(out_path, "img", nth_read_name + '_BaseSignal.png') fig.write_image(image_filepath) + save_msg = "Plot image saved to: " + image_filepath + logging.info(save_msg) # Append the dynamic HTML object to the output structure dynamic_html = fig.to_html(full_html=False) output_html_plots.update({nth_read_name: dynamic_html}) - print("Plot generated.") return output_html_plots diff --git a/src/utils.py b/src/utils.py index e400dc5..8171aea 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,3 +1,4 @@ +import logging import os, itertools from src import lrst_global