diff --git a/Makefile b/Makefile index e658b0c..271d6ba 100644 --- a/Makefile +++ b/Makefile @@ -3,10 +3,8 @@ SRC_DIR := $(CURDIR)/src LIB_DIR := $(CURDIR)/lib all: - # Generate the SWIG python <-> C++ wrappers (Both ways) - # The Python wrapper is stored in the main directory, while the C++ wrapper code is stored in /src + # Generate the SWIG Python/C++ wrappers swig -c++ -python -outdir $(LIB_DIR) -I$(INCL_DIR) -o $(SRC_DIR)/lrst_wrap.cpp $(SRC_DIR)/lrst.i - # Compile the C++ shared libraries - # The C++ wrapper binary _lrst is generated in the /src directory + # Compile the C++ shared libraries into lib/ python setup.py build_ext --build-lib $(LIB_DIR) diff --git a/src/__main__.py b/__main__.py similarity index 100% rename from src/__main__.py rename to __main__.py diff --git a/conda/build.sh b/conda/build.sh index 292cd88..f61b8ec 100644 --- a/conda/build.sh +++ b/conda/build.sh @@ -1,5 +1,7 @@ #!/bin/bash +# Build the C++ library + # Add the library path to the LD_LIBRARY_PATH export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PREFIX}/lib @@ -13,11 +15,11 @@ $PYTHON setup.py -I"${PREFIX}"/include -L"${PREFIX}"/lib install cp "${SRC_DIR}"/longreadsum "${PREFIX}"/bin # Create the src directory -mkdir -p "${PREFIX}"/src +#mkdir -p "${PREFIX}"/lib -# Copy the lib files to the src directory -cp -r "${SRC_DIR}"/src/*.py "${PREFIX}"/src +# Copy the lib files to the build directory +cp -r "${SRC_DIR}"/src/*.py "${PREFIX}"/lib # Copy the SWIG generated library to the lib directory -cp -r "${SRC_DIR}"/lib/*.py "${PREFIX}"/src +cp -r "${SRC_DIR}"/lib/*.py "${PREFIX}"/lib cp -r "${SRC_DIR}"/lib/*.so "${PREFIX}"/lib diff --git a/environment.yml b/environment.yml index 90c394f..236fa4e 100644 --- a/environment.yml +++ b/environment.yml @@ -6,17 +6,15 @@ channels: - defaults dependencies: - python - - setuptools - - conda-build - hdf5 - htslib - - samtools - swig - matplotlib - plotly=4.14 - pytest - - pytest-dependency - pip + - pip: + - kaleido # conda env create --file=environment.yml -# conda activate lrst_py36 +# conda activate lrst_py39 diff --git a/include/BAM_module.h b/include/BAM_module.h deleted file mode 100644 index d7dc4a5..0000000 --- a/include/BAM_module.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef BAM_MODULE_H_ -#define BAM_MODULE_H_ - -#include -#include -#include - -#include "InputStructure.h" -#include "OutputStructures.h" -#include "BamReader.h" - - -class BAM_Thread_data { - public: - int _thread_id; - std::vector record_list; // Vector of individual BAM file records - Input_Para m_input_op; - std::map t_secondary_alignment; - std::map t_supplementary_alignment; - - Output_BAM t_output_bam_; - - BAM_Thread_data(Input_Para& ref_input_op, int p_thread_id, int p_batch_size); - ~BAM_Thread_data(); -}; - -class BAM_Module{ -private: - static size_t read_i_bam; - -public: - static std::mutex myMutex_readBam; - static std::mutex myMutex_output; - static size_t batch_size_of_record; - - std::map secondary_alignment; - std::map supplementary_alignment; - - Input_Para m_input_op; - - BamReader* _bam_reader_ptr; - std::vector m_threads; - - - int exit_code; // Exit code for error handling - - static void BAM_do_thread(BamReader* ref_bam_reader_ptr, Input_Para& ref_input_op, int thread_id, BAM_Thread_data& ref_thread_data, Output_BAM& ref_output, std::map& ref_secondary_alignment, std::map& ref_supplementary_alignment); - - int calculateStatistics( Output_BAM& t_output_bam_info); - - BAM_Module(Input_Para& _m_input); - ~BAM_Module(); -}; - - -#endif - diff --git a/include/BamReader.h b/include/BamReader.h deleted file mode 100644 index 01025cf..0000000 --- a/include/BamReader.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef BAMREADER_H_ -#define BAMREADER_H_ - -#include -#include - -#include -#include -#include - -#include - -#include "ComStruct.h" - -#define BAM_UN_OPEN 1 -#define BAM_FAILED 2 -#define BAM_OPEN 3 -#define BAM_CLOSE 4 - -#define DNASEQ 1 -#define RNASEQ 2 - - -// Type representing data for a single record in the BAM file -typedef struct Bam1Record{ - int num_mismatch; - uint64_t qry_start_pos; - uint64_t qry_end_pos; - uint64_t qry_start_pos_rel; - uint64_t qry_seq_len; - uint64_t _len_original_read; - uc8string qry_qual; - std::string qry_name; - std::string qry_seq; - - uint16_t pri_sec_sup; - uint16_t map_flag; - uint8_t map_qual; - uint16_t map_strand; - std::vector cigar_len; - std::vector cigar_type; - - std::vector map_detail; - std::vector map_pos_detail; - - uint64_t ref_start_pos; - uint64_t ref_end_pos; - uint64_t left_clip; - uint64_t right_clip; - std::string map_chr; -} Bam1Record; - -class BamReadOption{ - bool m_w_qry_seq; - bool m_w_qry_qual; - - //bool m_w_map_detail; - bool m_w_pos_map_detail; - //char * m_w_ref_seq; - //uint64_t m_w_ref_seq_len; - - bool m_w_unmap; - bool m_w_supplementary; - bool m_w_secondary; - - bool m_w_specifiedRegion; - uint64_t min_ovlp_len; - - uint64_t min_read_len; - - uint8_t map_quality_thr; - -public: - std::map > repdict; - -private: - std::map >::iterator chr_dit; - std::map::iterator pos_it; - bool ovlp_region; - uint64_t ovlp_max_start; - uint64_t ovlp_min_end; - - std::ostringstream _c_oss_chr; - std::ostringstream _c_oss_pos; - std::string _c_line; - -public: - uint8_t data_type; - uint8_t get_map_quality_thr(); - int set_map_quality_thr(uint8_t p_map_quality_thr); - int set_w_qry_seq(bool mqryseq); - int set_w_map_detail(bool m_p_det); //, bool m_c_det, char * ref_seq, int m_refseq_len); - int set_w_qry_qual(bool mqryqual); - bool get_w_qry_qual(); - bool get_w_qry_seq(); - //bool get_w_map_detail(); - bool get_w_pos_map_detail(); - //char get_ref_char_at_pos(uint64_t m_pos); - uint64_t get_min_read_len(){return min_read_len;} - - int set_min_read_len(uint64_t _min_read_len){ min_read_len = _min_read_len; return 0; } - int set_w_unmap(bool m_unmap); - int set_w_supplementary(bool m_sup); - int set_w_secondary(bool m_sec); - int set_w_specifiedRegion(bool spR, uint64_t mol); - int set_w_specifiedRegion(bool spR); - - int set_repdict(std::string mrepfile); - int set_repdict1(const std::string & m_str, std::string m_delimiter=WhiteSpace); - //std::map > get_repdict(){ return repdict; } - - bool get_w_unmap(); - bool get_w_supplementary(); - bool get_w_secondary(); - bool get_w_specifiedRegion(); - uint64_t get_min_ovlp_len(); - - bool check_unmap(uint8_t curflag); - bool check_supplementary(uint8_t curflag); - bool check_secondary(uint8_t curflag); - int check_specifiedRegion(const char * chrn, uint64_t refStartPos, uint64_t refEdnPos); - void check_specifiedRegion_multi(const char * chrn, uint64_t refStartPos, uint64_t refEndPos, std::vector& c_rep_regions, std::map > & p_rep_dict); - // not safe - std::map >::iterator get_chr_it(); - std::map::iterator get_pos_it(); - bool check_min_qry_len(uint64_t _qry_len); - - BamReadOption(); - ~BamReadOption(); - - std::string toString(); -}; - -class BamReader{ - void init(); - void destroy(); - - int bam_status; - - char in_bam_file[CHAR_SIZE]; - samFile * in_bam; - bam_hdr_t * hdr; - bam1_t * bam_one_alignment; - bam1_core_t * bam_1alignment_core; - - int _read1RecordFromBam_(); - - uint64_t ref_go_pos; - uint64_t qry_go_pos; - uint64_t qry_go_pos_rel; - - void _set_map_pos_detail(uint64_t ref_go_pos, uint64_t qry_go_pos, int mlen, Bam1Record & br, uint8_t m_op, bool ref_add, bool qry_add, const uint16_t m_map_strand, const uint64_t m_len_original_read); - //void _set_map_detail(uint64_t ref_go_pos, uint64_t qry_go_pos, int mlen, Bam1Record & br, uint8_t m_op, bool ref_add, bool qry_add, const uint16_t m_map_strand, const uint64_t m_len_read); - - int t_num_records; - - std::string _bam_file_str; - -public: - static const char m_cigar_str[]; - - std::mutex myMutex_readBam; - - BamReadOption bamReadOp; - - BamReader(const char * bamfile); - BamReader(); - int openBam(const char * bamfile); - ~BamReader(); - bool check_bam_status(); - - Bam1Record current_bam_record; - - int read1RecordFromBam(); - int readBam(std::vector &br_list, int num_records=-1, bool br_clear=true); - int resetBam(const char * bamfile); - //int resetBam(); - std::string get_bam_file(); - int reset_Bam1Record(); - int reset_Bam1Record(Bam1Record & br); - - static std::string Bam1Record_toString(Bam1Record & br); - std::string Bam1Record_toString(); - static std::string Basic_Bam1Record_toString(Bam1Record & br); - static std::string Min_Bam1Record_toString(Bam1Record & br); -}; - - -#endif - - diff --git a/include/ComStruct.h b/include/ComStruct.h index 279cdce..1851ff6 100644 --- a/include/ComStruct.h +++ b/include/ComStruct.h @@ -4,7 +4,7 @@ #include #include #include - +#include ///////////////////////////////// #define MAX_F5EVENT_SIZE 3000000 diff --git a/include/OutputStructures.h b/include/OutputStructures.h deleted file mode 100644 index e481cee..0000000 --- a/include/OutputStructures.h +++ /dev/null @@ -1,237 +0,0 @@ -#ifndef OUTPUTSTRUCTURES_H -#define OUTPUTSTRUCTURES_H - -/* -OutputStructures.h -Define the output structures for each module. -*/ - -#include -#include - -#define MAX_READ_LENGTH 10485760 -#define MAX_MAP_QUALITY 256 -#define MAX_BASE_QUALITY 256 -#define MAX_READ_QUALITY 256 -#define MAX_SIGNAL_VALUE 5000 -#define PERCENTAGE_ARRAY_SIZE 101 -#define ZeroDefault 0 -#define MoneDefault -1 - - -// Base class for storing error output. -class Output_Info -{ -public: - int error_flag; - std::string error_str; - Output_Info(); -}; - - -// Base class for storing basic QC data -class Basic_Seq_Statistics -{ -public: - int64_t total_num_reads = ZeroDefault; // total number of long reads - int64_t total_num_bases = ZeroDefault; // total number of bases - - uint64_t longest_read_length = ZeroDefault; // the length of longest reads - int64_t n50_read_length = MoneDefault; // N50 - int64_t n95_read_length = MoneDefault; // N95 - int64_t n05_read_length = MoneDefault; // N05; - double mean_read_length = MoneDefault; // mean of read length - std::vector nx_read_length; // deprecated - std::vector NXX_read_length; // NXX_read_length[50] means N50 read length - int64_t median_read_length = MoneDefault; // median of read length - - int64_t total_a_cnt = ZeroDefault; // A content - int64_t total_c_cnt = ZeroDefault; // C content - int64_t total_g_cnt = ZeroDefault; // G content - int64_t total_tu_cnt = ZeroDefault; // T content for DNA, or U content for RNA - int64_t total_n_cnt = ZeroDefault; // N content - double gc_cnt = ZeroDefault; // GC ratio - - std::vector read_length_count; - - std::vector read_gc_content_count; - - // - std::vector read_lengths; // Length of reads - - void reset(); - void add(Basic_Seq_Statistics &t_seq_st); - void global_sum(); - void global_sum_no_gc(); - void resize(); - - Basic_Seq_Statistics(); - Basic_Seq_Statistics(const Basic_Seq_Statistics &_bss); - ~Basic_Seq_Statistics(); -}; - - -// Base class for storing base quality data -class Basic_Seq_Quality_Statistics -{ -public: - std::vector base_quality_distribution; - std::vector read_average_base_quality_distribution; - int min_base_quality = MoneDefault; // minimum base quality; - int max_base_quality = MoneDefault; // maximum base quality; - std::vector pos_quality_distribution; - std::vector pos_quality_distribution_dev; - std::vector pos_quality_distribution_count; - int64_t max_length = ZeroDefault; - - std::vector read_quality_distribution; - int min_read_quality = MoneDefault; // the minimum mapping quality - int max_read_quality = MoneDefault; // the maximum mapping quality - - void reset(); - void add(Basic_Seq_Quality_Statistics &t_qual_st); - void global_sum(); - - Basic_Seq_Quality_Statistics(); - Basic_Seq_Quality_Statistics(const Basic_Seq_Quality_Statistics &_bsqs); - ~Basic_Seq_Quality_Statistics(); -}; - -// FASTA output -class Output_FA : public Output_Info -{ -public: - Basic_Seq_Statistics long_read_info; -}; - -// FASTQ output -class Output_FQ : public Output_FA -{ -public: - Basic_Seq_Quality_Statistics seq_quality_info; -}; - - -// BAM output -class Output_BAM : public Output_FQ -{ -public: - int64_t num_primary_alignment = ZeroDefault; // the number of primary alignment/ - int64_t num_secondary_alignment = ZeroDefault; // the number of secondary alignment - int64_t num_reads_with_secondary_alignment = ZeroDefault; // the number of long reads with the secondary alignment: one read might have multiple seconard alignment - int64_t num_supplementary_alignment = ZeroDefault; // the number of supplementary alignment - int64_t num_reads_with_supplementary_alignment = ZeroDefault; // the number of long reads with secondary alignment; - int64_t num_reads_with_both_secondary_supplementary_alignment = ZeroDefault; // the number of long reads with both secondary and supplementary alignment. - - int64_t forward_alignment = ZeroDefault; - int64_t reverse_alignment = ZeroDefault; - - std::vector map_quality_distribution; - int min_map_quality = MoneDefault; // the minimum mapping quality - int max_map_quality = MoneDefault; // the maximum mapping quality - - // Similar to Output_FA: below are for mapped. - int64_t num_matched_bases = ZeroDefault; // the number of matched bases with = - int64_t num_mismatched_bases = ZeroDefault; // the number of mismatched bases X - int64_t num_ins_bases = ZeroDefault; // the number of inserted bases; - int64_t num_del_bases = ZeroDefault; // the number of deleted bases; - int64_t num_clip_bases = ZeroDefault; // the number of soft-clipped bases; - - // The number of columns can be calculated by summing over the lengths of M/I/D CIGAR operators - int64_t num_columns = ZeroDefault; // the number of columns - double percent_identity = ZeroDefault; // Percent identity = (num columns - NM) / num columns - - std::vector accuracy_per_read; - - Basic_Seq_Statistics mapped_long_read_info; - Basic_Seq_Statistics unmapped_long_read_info; - - Basic_Seq_Quality_Statistics mapped_seq_quality_info; - Basic_Seq_Quality_Statistics unmapped_seq_quality_info; - - void reset(); - void add(Output_BAM &t_output_bam); - void global_sum(); - - Output_BAM(); - ~Output_BAM(); -}; - - -// sequencing_summary.txt output -class Basic_SeqTxt_Statistics -{ -public: - Basic_Seq_Statistics long_read_info; - Basic_Seq_Quality_Statistics seq_quality_info; - - std::vector signal_range; - int min_signal = MoneDefault; // minimum signals; - int max_signal = MoneDefault; // maximum signals; - - void reset(); - void add(Basic_SeqTxt_Statistics &output_data); - void global_sum(); - - Basic_SeqTxt_Statistics(); - ~Basic_SeqTxt_Statistics(); -}; - -class Output_SeqTxt : public Output_Info -{ -public: - Basic_SeqTxt_Statistics all_long_read_info; - Basic_SeqTxt_Statistics passed_long_read_info; - Basic_SeqTxt_Statistics failed_long_read_info; - - void reset(); - void add(Output_SeqTxt &output_data); - void global_sum(); -}; - -// Base class for storing a read's base signal data -class Base_Signals -{ -public: - std::string read_name; - int base_count; - std::string sequence_data_str; // Sequence of bases - std::vector> basecall_signals; // 2D vector of base signals - - // Methods - int getBaseCount(); - std::string getReadName(); - std::string getSequenceString(); - std::vector> getDataVector(); - Base_Signals(std::string read_name, std::string sequence_data_str, std::vector> basecall_signals); -}; - -// FAST5 output -class Output_FAST5 : public Output_FA -{ -public: - // Base quality section - Basic_Seq_Quality_Statistics seq_quality_info; - - // Signal data section - int read_count; - int base_count; - std::vector read_base_signals; - - // Methods - int getReadCount(); - int getTotalBaseCount(); - std::string getNthReadName(int read_index); - std::string getNthReadSequence(int read_index); - void addReadBaseSignals(Base_Signals values); - std::vector> getNthReadBaseSignals(int read_index); - std::vector getNthReadBaseMeans(int read_index); - std::vector getNthReadBaseStds(int read_index); - std::vector getNthReadBaseMedians(int read_index); - std::vector getNthReadPearsonSkewnessCoeff(int read_index); - std::vector getNthReadKurtosis(int read_index); - - Output_FAST5(); -}; - -#endif diff --git a/include/bam_module.h b/include/bam_module.h new file mode 100644 index 0000000..a2634d3 --- /dev/null +++ b/include/bam_module.h @@ -0,0 +1,23 @@ +#ifndef BAM_MODULE_H_ +#define BAM_MODULE_H_ + +#include +#include +#include + +#include "input_parameters.h" +#include "output_data.h" +#include "hts_reader.h" + +class BAM_Module{ +public: + static const size_t records_per_thread = 3000; + size_t file_index = 0; + std::map secondary_alignment; + std::map supplementary_alignment; + + int calculateStatistics(Input_Para& input_params, Output_BAM& final_output); + static void batchStatistics(HTSReader& reader, int batch_size, Input_Para& input_params, Output_BAM& ref_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex); +}; + +#endif diff --git a/include/BasicStatistics.h b/include/basic_statistics.h similarity index 100% rename from include/BasicStatistics.h rename to include/basic_statistics.h diff --git a/include/FAST5_module.h b/include/fast5_module.h similarity index 69% rename from include/FAST5_module.h rename to include/fast5_module.h index c6bdf8a..4a85a33 100644 --- a/include/FAST5_module.h +++ b/include/fast5_module.h @@ -1,8 +1,8 @@ #ifndef FAST5_MODULE_H_ #define FAST5_MODULE_H_ -#include "InputStructure.h" -#include "OutputStructures.h" +#include "input_parameters.h" +#include "output_data.h" int generateQCForFAST5(Input_Para &_input_data, Output_FAST5 &output_data); diff --git a/include/FASTA_module.h b/include/fasta_module.h similarity index 68% rename from include/FASTA_module.h rename to include/fasta_module.h index 95e61a1..571d57a 100644 --- a/include/FASTA_module.h +++ b/include/fasta_module.h @@ -1,8 +1,8 @@ #ifndef FASTA_MODULE_H_ #define FASTA_MODULE_H_ -#include "InputStructure.h" -#include "OutputStructures.h" +#include "input_parameters.h" +#include "output_data.h" int qc_fasta_files(Input_Para &_input_data, Output_FA &py_output_fa); diff --git a/include/FASTQ_module.h b/include/fastq_module.h similarity index 68% rename from include/FASTQ_module.h rename to include/fastq_module.h index 82d1eec..afa76e1 100644 --- a/include/FASTQ_module.h +++ b/include/fastq_module.h @@ -1,8 +1,8 @@ #ifndef FASTQ_MODULE_H_ #define FASTQ_MODULE_H_ -#include "InputStructure.h" -#include "OutputStructures.h" +#include "input_parameters.h" +#include "output_data.h" int qc_fastq_files(Input_Para &_input_data, Output_FQ &py_output_fq); diff --git a/include/hts_reader.h b/include/hts_reader.h new file mode 100644 index 0000000..be5cb86 --- /dev/null +++ b/include/hts_reader.h @@ -0,0 +1,53 @@ +#ifndef HTSREADER_H_ +#define HTSREADER_H_ + +#include +#include + +#include +#include +#include +#include +#include + +#include "ComStruct.h" +#include "output_data.h" + +#define BAM_UN_OPEN 1 +#define BAM_FAILED 2 +#define BAM_OPEN 3 +#define BAM_CLOSE 4 + +#define DNASEQ 1 +#define RNASEQ 2 + +// Class representing an HTSlib BAM file +class HTSReader { + public: + htsFile* bam_file; // open the BAM file for reading + bam_hdr_t* header; // read the BAM header + bam1_t* record; + int record_count = 0; + + // Bool for whether the reading is complete + bool reading_complete = false; + + // Update read and base counts + int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics* basic_qc, std::vector& base_quality_distribution); + + // Read the next batch of records from the BAM file + int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex); + + // Return if the reader has finished reading the BAM file + bool hasNextRecord(); + + // Return the number of records in the BAM file using the BAM index + int64_t getNumRecords(const std::string &bam_file_name); + + HTSReader(const std::string &bam_file_name); + ~HTSReader(); +}; + +#endif + + diff --git a/include/InputStructure.h b/include/input_parameters.h similarity index 83% rename from include/InputStructure.h rename to include/input_parameters.h index 7050c2b..e0ab772 100644 --- a/include/InputStructure.h +++ b/include/input_parameters.h @@ -1,10 +1,10 @@ /* -Python_to_CXX_class.h: +input_parameters.h: Define the Python bindings from our C++ modules */ -#ifndef PYTHON_TO_CXX_CLASS_H -#define PYTHON_TO_CXX_CLASS_H +#ifndef INPUT_PARAMETERS_H +#define INPUT_PARAMETERS_H #include #define MAX_INPUT_FILES 2048 @@ -12,7 +12,7 @@ Define the Python bindings from our C++ modules /* Input_Para: -Python-invoked C++ class +Define the input parameters for the program */ class Input_Para{ public: diff --git a/include/ModuleCaller.h b/include/module_caller.h similarity index 78% rename from include/ModuleCaller.h rename to include/module_caller.h index f9697f4..4d3efb8 100644 --- a/include/ModuleCaller.h +++ b/include/module_caller.h @@ -1,5 +1,9 @@ -#include "InputStructure.h" -#include "OutputStructures.h" +/* module_caller.h + * Module caller header file + */ + +#include "input_parameters.h" +#include "output_data.h" int callBAMModule( Input_Para& _input_data, Output_BAM& py_output_bam ); diff --git a/include/output_data.h b/include/output_data.h new file mode 100644 index 0000000..7268317 --- /dev/null +++ b/include/output_data.h @@ -0,0 +1,246 @@ +#ifndef OUTPUTSTRUCTURES_H +#define OUTPUTSTRUCTURES_H + +/* +OutputStructures.h +Define the output structures for each module. +*/ + +#include +#include +#include +#include + +#include "input_parameters.h" + +#define MAX_READ_LENGTH 10485760 +#define MAX_MAP_QUALITY 256 +#define MAX_BASE_QUALITY 256 +#define MAX_READ_QUALITY 256 +#define MAX_SIGNAL_VALUE 5000 +#define PERCENTAGE_ARRAY_SIZE 101 +#define ZeroDefault 0 +#define MoneDefault -1 + + +// Base class for storing error output. +class Output_Info +{ +public: + int error_flag; + std::string error_str; + Output_Info(); +}; + + +// Base class for storing basic QC data +class Basic_Seq_Statistics +{ +public: + int total_num_reads = ZeroDefault; // total number of long reads + uint64_t total_num_bases = ZeroDefault; // total number of bases + int longest_read_length = ZeroDefault; // the length of longest reads + int n50_read_length = MoneDefault; // N50 + int n95_read_length = MoneDefault; // N95 + int n05_read_length = MoneDefault; // N05; + double mean_read_length = MoneDefault; // mean of read length + std::vector NXX_read_length; // NXX_read_length[50] means N50 read length + int median_read_length = MoneDefault; // median of read length + + uint64_t total_a_cnt = ZeroDefault; // A content + uint64_t total_c_cnt = ZeroDefault; // C content + uint64_t total_g_cnt = ZeroDefault; // G content + uint64_t total_tu_cnt = ZeroDefault; // T content for DNA, or U content for RNA + uint64_t total_n_cnt = ZeroDefault; // N content + double gc_cnt = ZeroDefault; // GC ratio + std::vector read_gc_content_count; // GC content distribution + std::vector read_length_count; // Read length distribution (TODO: Replace usages with read_lengths) + std::vector read_lengths; // Length of reads + + void reset(); + void add(Basic_Seq_Statistics &t_seq_st); + void global_sum(); + void global_sum_no_gc(); + void calculate_NXX_scores(); // Calculate N50, N95, N05, etc. + void resize(); + + Basic_Seq_Statistics(); + Basic_Seq_Statistics(const Basic_Seq_Statistics &_bss); + ~Basic_Seq_Statistics(); +}; + + +// Base class for storing base quality data +class Basic_Seq_Quality_Statistics +{ +public: + std::vector base_quality_distribution; + std::vector read_average_base_quality_distribution; + int min_base_quality = MoneDefault; // minimum base quality; + int max_base_quality = MoneDefault; // maximum base quality; + std::vector pos_quality_distribution; + std::vector pos_quality_distribution_dev; + std::vector pos_quality_distribution_count; + int max_length = ZeroDefault; + + std::vector read_quality_distribution; + int min_read_quality = MoneDefault; // the minimum mapping quality + int max_read_quality = MoneDefault; // the maximum mapping quality + + void reset(); + void add(Basic_Seq_Quality_Statistics &t_qual_st); + void global_sum(); + + Basic_Seq_Quality_Statistics(); + Basic_Seq_Quality_Statistics(const Basic_Seq_Quality_Statistics &_bsqs); + ~Basic_Seq_Quality_Statistics(); +}; + +// FASTA output +class Output_FA : public Output_Info +{ +public: + Basic_Seq_Statistics long_read_info; +}; + +// FASTQ output +class Output_FQ : public Output_FA +{ +public: + Basic_Seq_Quality_Statistics seq_quality_info; +}; + + +// BAM output +class Output_BAM : public Output_FQ +{ +public: + uint64_t num_primary_alignment = ZeroDefault; // the number of primary alignment/ + uint64_t num_secondary_alignment = ZeroDefault; // the number of secondary alignment + uint64_t num_reads_with_secondary_alignment = ZeroDefault; // the number of long reads with the secondary alignment: one read might have multiple seconard alignment + uint64_t num_supplementary_alignment = ZeroDefault; // the number of supplementary alignment + uint64_t num_reads_with_supplementary_alignment = ZeroDefault; // the number of long reads with secondary alignment; + uint64_t num_reads_with_both_secondary_supplementary_alignment = ZeroDefault; // the number of long reads with both secondary and supplementary alignment. + uint64_t forward_alignment = ZeroDefault; // Total number of forward alignments + uint64_t reverse_alignment = ZeroDefault; // Total number of reverse alignments + + // Map of reads with supplementary alignments + std::map reads_with_supplementary; + + // Map of reads with secondary alignments + std::map reads_with_secondary; + + std::vector map_quality_distribution; + int min_map_quality = MoneDefault; // the minimum mapping quality + int max_map_quality = MoneDefault; // the maximum mapping quality + + // Similar to Output_FA: below are for mapped. + uint64_t num_matched_bases = ZeroDefault; // the number of matched bases with = + uint64_t num_mismatched_bases = ZeroDefault; // the number of mismatched bases X + uint64_t num_ins_bases = ZeroDefault; // the number of inserted bases; + uint64_t num_del_bases = ZeroDefault; // the number of deleted bases; + uint64_t num_clip_bases = ZeroDefault; // the number of soft-clipped bases; + + // The number of columns can be calculated by summing over the lengths of M/I/D CIGAR operators + int num_columns = ZeroDefault; // the number of columns + double percent_identity = ZeroDefault; // Percent identity = (num columns - NM) / num columns + + std::vector accuracy_per_read; + + Basic_Seq_Statistics mapped_long_read_info; + Basic_Seq_Statistics unmapped_long_read_info; + + Basic_Seq_Quality_Statistics mapped_seq_quality_info; + Basic_Seq_Quality_Statistics unmapped_seq_quality_info; + + // Add a batch of records to the output + void add(Output_BAM &t_output_bam); + + // Calculate QC across all records + void global_sum(); + + // Save the output to a summary text file + void save_summary(std::string &output_file, Input_Para ¶ms, Output_BAM &output_data); + + Output_BAM(); + ~Output_BAM(); +}; + + +// sequencing_summary.txt output +class Basic_SeqTxt_Statistics +{ +public: + Basic_Seq_Statistics long_read_info; + Basic_Seq_Quality_Statistics seq_quality_info; + + std::vector signal_range; + int min_signal = MoneDefault; // minimum signals; + int max_signal = MoneDefault; // maximum signals; + + void reset(); + void add(Basic_SeqTxt_Statistics &output_data); + void global_sum(); + + Basic_SeqTxt_Statistics(); + ~Basic_SeqTxt_Statistics(); +}; + +class Output_SeqTxt : public Output_Info +{ +public: + Basic_SeqTxt_Statistics all_long_read_info; + Basic_SeqTxt_Statistics passed_long_read_info; + Basic_SeqTxt_Statistics failed_long_read_info; + + void reset(); + void add(Output_SeqTxt &output_data); + void global_sum(); +}; + +// Base class for storing a read's base signal data +class Base_Signals +{ +public: + std::string read_name; + int base_count; + std::string sequence_data_str; // Sequence of bases + std::vector> basecall_signals; // 2D vector of base signals + + // Methods + int getBaseCount(); + std::string getReadName(); + std::string getSequenceString(); + std::vector> getDataVector(); + Base_Signals(std::string read_name, std::string sequence_data_str, std::vector> basecall_signals); +}; + +// FAST5 output +class Output_FAST5 : public Output_FA +{ +public: + // Base quality section + Basic_Seq_Quality_Statistics seq_quality_info; + + // Signal data section + int read_count; + int base_count; + std::vector read_base_signals; + + // Methods + int getReadCount(); + int getTotalBaseCount(); + std::string getNthReadName(int read_index); + std::string getNthReadSequence(int read_index); + void addReadBaseSignals(Base_Signals values); + std::vector> getNthReadBaseSignals(int read_index); + std::vector getNthReadBaseMeans(int read_index); + std::vector getNthReadBaseStds(int read_index); + std::vector getNthReadBaseMedians(int read_index); + std::vector getNthReadPearsonSkewnessCoeff(int read_index); + std::vector getNthReadKurtosis(int read_index); + + Output_FAST5(); +}; + +#endif diff --git a/include/SeqTxt_module.h b/include/seqtxt_module.h similarity index 97% rename from include/SeqTxt_module.h rename to include/seqtxt_module.h index 2134255..1035598 100644 --- a/include/SeqTxt_module.h +++ b/include/seqtxt_module.h @@ -1,8 +1,8 @@ #ifndef SeqTxt_MODULE_H_ #define SeqTxt_MODULE_H_ -#include "InputStructure.h" -#include "OutputStructures.h" +#include "input_parameters.h" +#include "output_data.h" #include #include diff --git a/longreadsum b/longreadsum deleted file mode 100644 index d02f7cf..0000000 --- a/longreadsum +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# Add the library path to the LD_LIBRARY_PATH -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PREFIX}/lib - -# Show all files in src/ -echo "Files in src/" -ls -l "${PREFIX}"/src - -# Run the entry point.py file -python "${PREFIX}"/src/entry_point.py "$@" diff --git a/setup.py b/setup.py index 7b7bb7f..fbd61f6 100644 --- a/setup.py +++ b/setup.py @@ -17,24 +17,13 @@ project_src_files.extend(glob.glob(project_dir + '*.cpp')) project_headers = glob.glob('include/*.h') -print("Project dir: ") -print(project_dir) - -print("working dir: ") -print(os.getcwd()) - -# Print project source files -print("Project source files:") -for src_file in project_src_files: - print("\t" + src_file) - # Set up the extension include_dirs = ['include'] lrst_mod = Extension("_lrst", sources=project_src_files, language='c++', extra_compile_args=['-std=c++11'], - libraries=["rt", "pthread", "z", "dl", "m", "hts", "hdf5_cpp", "hdf5", "hdf5_hl_cpp", "hdf5_hl"], + libraries=["hts", "hdf5_cpp", "hdf5", "hdf5_hl_cpp", "hdf5_hl"], include_dirs=include_dirs, depends=project_headers, ) diff --git a/src/BAM_module.cpp b/src/BAM_module.cpp deleted file mode 100644 index bac8b7d..0000000 --- a/src/BAM_module.cpp +++ /dev/null @@ -1,329 +0,0 @@ -/* -BAM_module.cpp: -Class for generating BAM file statistics. Records are accessed using multi-threading. -*/ - -#include - -#include "BAM_module.h" -#include "ComFunction.h" - -BAM_Thread_data::BAM_Thread_data(Input_Para& ref_input_op, int p_thread_id, int p_batch_size=1000){ - this->record_list.reserve(p_batch_size+1); - _thread_id = p_thread_id; - m_input_op = ref_input_op; -} - -BAM_Thread_data::~BAM_Thread_data(){ -} - -size_t BAM_Module::batch_size_of_record=3000; // Number of records for each thread -std::mutex BAM_Module::myMutex_readBam; // Mutex allows you to compile data across threads -std::mutex BAM_Module::myMutex_output; -size_t BAM_Module::read_i_bam = 0; - -BAM_Module::BAM_Module(Input_Para& _m_input){ - m_input_op = _m_input; - - exit_code = 0; - read_i_bam = 0; - - _bam_reader_ptr = NULL; - if ( read_i_bam >= _m_input.num_input_files ){ - std::cerr << "No input BAM are find. #input_files="<< _m_input.num_input_files <check_bam_status()){ - std::cerr << "Cannot open bam file="<< _m_input.input_files[read_i_bam] <bamReadOp.set_w_supplementary(true); - _bam_reader_ptr->bamReadOp.set_w_secondary(true); - _bam_reader_ptr->bamReadOp.set_min_read_len(1); - _bam_reader_ptr->bamReadOp.set_w_qry_seq(true); - _bam_reader_ptr->bamReadOp.set_w_unmap(true); - _bam_reader_ptr->bamReadOp.set_w_qry_qual(true); - _bam_reader_ptr->bamReadOp.set_w_map_detail(false); - read_i_bam = 1; - } - } -} - -BAM_Module::~BAM_Module(){ - if (_bam_reader_ptr!=NULL){ - delete _bam_reader_ptr; - } - _bam_reader_ptr = NULL; -} - -int BAM_Module::calculateStatistics( Output_BAM& t_output_bam_info){ - auto relapse_start_time = std::chrono::high_resolution_clock::now(); - - if (exit_code==0){ - m_threads.reserve(m_input_op.threads+3); - - t_output_bam_info.unmapped_long_read_info.resize(); - t_output_bam_info.mapped_long_read_info.resize(); - t_output_bam_info.long_read_info.resize(); - - int _i_t=0; - BAM_Thread_data** thread_data_vector = new BAM_Thread_data*[m_input_op.threads]; - try{ - - for (_i_t=0; _i_t::iterator sec_it=secondary_alignment.begin(),sup_it=supplementary_alignment.begin(); (sec_it!=secondary_alignment.end()) && ( sup_it!=supplementary_alignment.end()); ){ - if ( *sec_it < *sup_it ){ ++sec_it; } - else if ( *sec_it > *sup_it ){ ++sup_it; } - else{ t_output_bam_info.num_reads_with_both_secondary_supplementary_alignment += 1; ++sec_it; ++sup_it; } - } - - // Calculate NXX scores and GC content - t_output_bam_info.global_sum(); - - auto relapse_end_time = std::chrono::high_resolution_clock::now(); - std::cout<<"Total time(Elapsed): "<: "<< (exit_code==0?"Completed successfully":"Failed") <& ref_secondary_alignment, std::map& ref_supplementary_alignment){ - std::vector::iterator record_data; - uint64_t match_this_read; - double accuracy_perc_this_read; - uint64_t _t_a, _t_c, _t_g, _t_tu; - double _t_gc_per; - while (true){ - ref_thread_data.record_list.clear(); - myMutex_readBam.lock(); - - // Read the record into the thread pointer - ref_bam_reader_ptr->readBam( ref_thread_data.record_list, BAM_Module::batch_size_of_record ); - if (ref_thread_data.record_list.size() == 0 && !(read_i_bam < ref_input_op.num_input_files) ){ - myMutex_readBam.unlock(); - break; - } - if ( ref_thread_data.record_list.size() < batch_size_of_record ){ - if ( read_i_bam < ref_input_op.num_input_files ){ - std::cout<< "INFO: Open bam file="<< ref_input_op.input_files[read_i_bam] <resetBam( ref_input_op.input_files[read_i_bam].c_str() ); - if ( ref_thread_data.record_list.size() == 0 ){ - ref_bam_reader_ptr->readBam( ref_thread_data.record_list, BAM_Module::batch_size_of_record ); - } - read_i_bam++; - } - } - myMutex_readBam.unlock(); - - if (ref_thread_data.record_list.size() == 0 ) { continue; } - - // Count individual bases (mapped and unmapped) in BAM file records - ref_thread_data.t_secondary_alignment.clear(); - ref_thread_data.t_supplementary_alignment.clear(); - ref_thread_data.t_output_bam_.reset(); - ref_thread_data.t_output_bam_.unmapped_long_read_info.resize(); - ref_thread_data.t_output_bam_.mapped_long_read_info.resize(); - ref_thread_data.t_output_bam_.long_read_info.resize(); - for(record_data=ref_thread_data.record_list.begin(); record_data!=ref_thread_data.record_list.end(); record_data++){ - Basic_Seq_Statistics* output_qc = NULL; // Output base QC data for either mapped or unmapped reads - Basic_Seq_Quality_Statistics* output_quality_qc = NULL; // Output base quality QC data - - // Set the number of mismatches using the NM tag - ref_thread_data.t_output_bam_.num_mismatched_bases = record_data->num_mismatch; - - // The map flag indicates whether this record refers to unmapped or mapped data. - if ( (record_data->map_flag)& BAM_FUNMAP ) { - // Unmapped data - output_qc = &( ref_thread_data.t_output_bam_.unmapped_long_read_info); - output_quality_qc = &(ref_thread_data.t_output_bam_.unmapped_seq_quality_info); - - // Update the read lengths - int read_length = record_data->qry_seq_len; - output_qc->read_lengths.push_back(read_length); - - }else if ( (record_data->map_flag)& BAM_FSUPPLEMENTARY ) { - // Supplementary alignments (Only reads are counted for these) - ref_thread_data.t_output_bam_.num_supplementary_alignment += 1; - ref_thread_data.t_supplementary_alignment[ record_data->qry_name ] = true; - - }else if ( (record_data->map_flag)& BAM_FSECONDARY ) { - // Secondary alignments (Only reads are counted for these) - ref_thread_data.t_output_bam_.num_secondary_alignment += 1; - ref_thread_data.t_secondary_alignment[ record_data->qry_name ] = true; - - } else if (! ((record_data->map_flag & BAM_FSECONDARY) || (record_data->map_flag & BAM_FSUPPLEMENTARY)) ) { - // To get primary alignments, remove supplementary (Bit #12) and non-primary alignments (Bit #9) - ref_thread_data.t_output_bam_.num_primary_alignment += 1; - output_qc = &( ref_thread_data.t_output_bam_.mapped_long_read_info); - output_quality_qc = &(ref_thread_data.t_output_bam_.mapped_seq_quality_info); - - // Update the read lengths - int read_length = record_data->qry_seq_len; - output_qc->read_lengths.push_back(read_length); - } - - if ( !( (record_data->map_flag)& BAM_FUNMAP ) ){ - if (record_data->map_strand==0){ ref_thread_data.t_output_bam_.forward_alignment += 1; } - else { ref_thread_data.t_output_bam_.reverse_alignment += 1; } - } - - if ( record_data->map_qual < ref_thread_data.t_output_bam_.min_map_quality || ref_thread_data.t_output_bam_.min_map_quality==MoneDefault){ - ref_thread_data.t_output_bam_.min_map_quality = record_data->map_qual; } - if ( record_data->map_qual > ref_thread_data.t_output_bam_.max_map_quality ){ ref_thread_data.t_output_bam_.max_map_quality = record_data->map_qual; } - ref_thread_data.t_output_bam_.map_quality_distribution[ int(record_data->map_qual) ] += 1; - - if (output_qc!= NULL && !( (record_data->map_flag)& BAM_FSECONDARY ) && !((record_data->map_flag)& BAM_FSUPPLEMENTARY ) ){ - output_qc->total_num_reads += 1; - output_qc->total_num_bases += record_data->qry_seq_len; - - if( record_data->qry_seq_len > output_qc->longest_read_length){ output_qc->longest_read_length = record_data->qry_seq_len; } - if ( record_data->qry_seq_len < MAX_READ_LENGTH ){ - output_qc->read_length_count[record_data->qry_seq_len] += 1; - } - - _t_a=0; - _t_c=0; - _t_g=0; - _t_tu=0; - _t_gc_per = 0; - - for (size_t _si=0; _siqry_seq_len; _si++){ - int _base_qual_int = int(record_data->qry_qual[_si]); - if ( _base_qual_int > MAX_BASE_QUALITY || _base_qual_int <0){ - std::cerr << "Thread "<< thread_id << ": The base quality is not in [0,"<< MAX_BASE_QUALITY <<"]: " << int(record_data->qry_qual[_si]) << " at " << _si << " for " << record_data->qry_name <base_quality_distribution[_base_qual_int ] += 1; - if ( output_quality_qc->min_base_quality==MoneDefault || _base_qual_int < output_quality_qc->min_base_quality){ - output_quality_qc->min_base_quality = _base_qual_int; } - if ( _base_qual_int > output_quality_qc->max_base_quality) { output_quality_qc->max_base_quality = _base_qual_int; } - } - - switch( record_data->qry_seq[ _si ] ){ - case 'A': case 'a': - output_qc->total_a_cnt += 1; _t_a += 1; break; - case 'C': case 'c': - output_qc->total_c_cnt += 1; _t_c += 1; break; - case 'G': case 'g': - output_qc->total_g_cnt += 1; _t_g += 1; break; - case 'T': case 't': case 'U': case 'u': - output_qc->total_tu_cnt += 1; _t_tu += 1; break; - case 'N': case 'n': - output_qc->total_n_cnt += 1; break; - default: - std::cerr<<"Thread "<< thread_id << ": Unknown type of nucletides: "<< record_data->qry_seq[ _si ] << " for " << record_data->qry_name <qry_seq_len>0?record_data->qry_seq_len:-1)/100.0 ); - if ( record_data->qry_seq_len>0 ){ - if ( int(_t_gc_per+0.5) < PERCENTAGE_ARRAY_SIZE){ - output_qc->read_gc_content_count[ int(_t_gc_per+0.5) ] += 1; - }else{ - std::cerr<<"GC (%) content("<<_t_gc_per<<") is larger than total("<qry_name <map_flag)& BAM_FUNMAP ) || ((record_data->map_flag)&BAM_FSECONDARY) ) ){ - match_this_read = 0; - accuracy_perc_this_read = 0; - for(size_t _ci=0; _cicigar_type.size(); _ci++){ - switch (record_data->cigar_type[_ci]){ - case BAM_CEQUAL: - case BAM_CMATCH: // M - match_this_read += record_data->cigar_len[_ci]; - ref_thread_data.t_output_bam_.num_matched_bases += record_data->cigar_len[_ci]; - - // Update the number of columns for the percent identity computation - ref_thread_data.t_output_bam_.num_columns += record_data->cigar_len[_ci]; - break; - case BAM_CINS: // I - ref_thread_data.t_output_bam_.num_ins_bases += record_data->cigar_len[_ci]; - - // Update the number of columns for the percent identity computation - ref_thread_data.t_output_bam_.num_columns += record_data->cigar_len[_ci]; - break; - case BAM_CDEL: // D - ref_thread_data.t_output_bam_.num_del_bases += record_data->cigar_len[_ci]; - - // Update the number of columns for the percent identity computation - ref_thread_data.t_output_bam_.num_columns += record_data->cigar_len[_ci]; - break; - case BAM_CREF_SKIP: // R - break; - case BAM_CSOFT_CLIP: // S - ref_thread_data.t_output_bam_.num_clip_bases += record_data->cigar_len[_ci]; - break; - case BAM_CHARD_CLIP: // H - ref_thread_data.t_output_bam_.num_clip_bases += record_data->cigar_len[_ci]; - break; - case BAM_CPAD: // P - break; - case BAM_CDIFF: // X - // Below is replaced with the NM tag in BAMReader.cpp - //ref_thread_data.t_output_bam_.num_mismatched_bases += record_data->cigar_len[_ci]; - break; - default: - std::cerr<<"Thread "<< thread_id << ": Unknown cigar "<< record_data->cigar_type[_ci] << record_data->cigar_len[_ci] << " for " << record_data->qry_name <num_mismatch; - ref_thread_data.t_output_bam_.percent_identity = ((num_columns_ - num_mismatch_) / num_columns_) * 100.0; - //std::cout << "Percent identity = " << ref_thread_data.t_output_bam_.percent_identity << std::endl; - - accuracy_perc_this_read = int( (double(match_this_read)/record_data->qry_seq_len)*100+0.5); - if ( accuracy_perc_this_read <0 || accuracy_perc_this_read>=PERCENTAGE_ARRAY_SIZE) { - std::cout<<"ERROR!!! #matched("<qry_seq_len <<") for "<< record_data->qry_name < -#include -#include -#include - -#include "BamReader.h" -#include "ComFunction.h" - -int BamReadOption::set_repdict1(const std::string & input_pattern, std::string str_delimiters){ - RepeatRegion repeat_region; // Where the repeat region will be stored - _c_oss_chr.str(""); - _c_oss_chr.clear(); - _c_oss_pos.str(""); - _c_oss_pos.clear(); - - if (input_pattern.size()<3){ - return 0; - } - - std::vector region_splitstr = readRepeatDataFromString(input_pattern, str_delimiters); - // TODO: The following tests should occur within the function - if (region_splitstr.size()<4){ - std::cout<< "\tWarning!!! Repeat pattern format is not correct! <" << input_pattern << ">" < new_rr_dict; - new_rr_dict[_c_oss_pos.str()] = repeat_region; - repdict[_c_oss_chr.str()] = new_rr_dict; - }else{ - pos_it = chr_dit->second.find(_c_oss_pos.str()); - if (pos_it==chr_dit->second.end()){ - (chr_dit->second)[_c_oss_pos.str()] = repeat_region; - }else{ - std::cout<<"\t Warning!!! Repeat info already in the dict: "<<_c_oss_chr.str()<<":"<<_c_oss_pos.str()<< " ??? "<first << ":" << pos_it->first<0){ - std::ifstream infile(mrepfile); - while (std::getline(infile, _c_line)){ - set_repdict1(_c_line); - } - infile.close(); - } - return 0; -} - -std::string BamReadOption::toString(){ - std::ostringstream oss_tostr; - - oss_tostr <<"\t" << (m_w_qry_seq?"With query sequence":"Without query sequence")<<"\n"; - oss_tostr <<"\t" << (m_w_qry_qual?"With query sequence quality":"Without query sequence quality")<<"\n"; - oss_tostr <<"\t" << (m_w_pos_map_detail?"With map_pos detail":"Without map_pos detail")<<"\n"; - oss_tostr <<"\t" << (m_w_unmap?"With unmapped reads":"Without unmapped reads")<<"\n"; - oss_tostr <<"\t" << (m_w_supplementary?"With supplementary map":"Without supplementary map")<<"\n"; - oss_tostr <<"\t" << (m_w_secondary?"With seconday map":"Without seconday map")<<"\n"; - if (m_w_specifiedRegion){ - oss_tostr <<"\t" << "Specify a region of interest and require "<second.begin(); pos_it!=chr_dit->second.end(); pos_it++){ - oss_tostr <<"\t\t" << (chr_dit->first) << ":"<<(pos_it->first) << ":" << pos_it->second.len_repeat_unit << "\n"; - } - } - }else{ - oss_tostr <<"\t" << "Get all alignment rather than for specific regions" <<"\n"; - } - - return oss_tostr.str(); -} - -int BamReadOption::set_w_qry_seq(bool mqryseq){ m_w_qry_seq = mqryseq; return 0; } - -int BamReadOption::set_w_map_detail(bool m_p_det){ - m_w_pos_map_detail = m_p_det; - return 0; -} - -int BamReadOption::set_w_qry_qual(bool mqryqual){ m_w_qry_qual=mqryqual; return 0; } -bool BamReadOption::get_w_qry_qual(){ return m_w_qry_qual; } - -bool BamReadOption::get_w_qry_seq(){ return m_w_qry_seq; } - -bool BamReadOption::get_w_pos_map_detail(){ return m_w_pos_map_detail; } - -BamReadOption::BamReadOption(){ - data_type = DNASEQ; - set_w_qry_seq(false); - set_w_map_detail(true); //, false, NULL, 0); - set_w_unmap(false); - set_w_supplementary(true); - set_w_secondary(true); - set_w_specifiedRegion(false); - set_w_qry_qual(false); - set_min_read_len(50); - set_map_quality_thr(0); -} - -BamReadOption::~BamReadOption(){ set_w_map_detail(true); }//, false, NULL, 0); } - -int BamReadOption::set_w_unmap(bool m_unmap){ m_w_unmap = m_unmap; return 0; } -int BamReadOption::set_w_supplementary(bool m_sup){ m_w_supplementary = m_sup; return 0; } -int BamReadOption::set_w_secondary(bool m_sec){ m_w_secondary = m_sec; return 0; } -int BamReadOption::set_w_specifiedRegion(bool spR){ return set_w_specifiedRegion(spR, 10); } -int BamReadOption::set_w_specifiedRegion(bool spR, uint64_t mol){ - m_w_specifiedRegion = spR; - min_ovlp_len = mol; - if (mol<1 && m_w_specifiedRegion){ - std::cout<<"Warning!!! the length for overlaping (" << mol << ") is too small."<< std::endl; - } - return 0; -} -uint8_t BamReadOption::get_map_quality_thr(){ - return map_quality_thr; -} -int BamReadOption::set_map_quality_thr(uint8_t p_map_quality_thr){ - map_quality_thr = p_map_quality_thr; - return 0; -} - -bool BamReadOption::get_w_unmap(){ return m_w_unmap; } -bool BamReadOption::get_w_supplementary(){ return m_w_supplementary; } -bool BamReadOption::get_w_secondary(){ return m_w_secondary; } -bool BamReadOption::get_w_specifiedRegion(){ return m_w_specifiedRegion; } -uint64_t BamReadOption::get_min_ovlp_len(){ return min_ovlp_len; } - -bool BamReadOption::check_unmap(uint8_t curflag){ - if ((!m_w_unmap) && (curflag & BAM_FUNMAP)) { return false; } - else {return true;} -} -bool BamReadOption::check_supplementary(uint8_t curflag){ - if ((!m_w_supplementary) && (curflag & BAM_FSUPPLEMENTARY)) { return false; } - else {return true;} -} -bool BamReadOption::check_secondary(uint8_t curflag){ - if ((!m_w_secondary) && (curflag & BAM_FSECONDARY)) { return false; } - else {return true;} -} - -bool BamReadOption::check_min_qry_len(uint64_t _qry_len){ - if (_qry_len > min_read_len){ return true; } - else { return false; } -} - -inline uint64_t get_c_min_ovlp_len(uint64_t endp1, uint64_t startp1, uint64_t endp2, uint64_t startp2, uint64_t p_min_ovlp_len){ - uint64_t d1 = (endp1 > startp1+5) ? (endp1 - startp1) : 5; - uint64_t d2 = (endp2 > startp2+5) ? (endp2 - startp2) : 5; - uint64_t mmin = d1>d2 ? d2 : d1; - mmin = mmin > p_min_ovlp_len ? p_min_ovlp_len : mmin; - if (mmin < 5) { return 5; } - else { return mmin; } -} - -void BamReadOption::check_specifiedRegion_multi(const char * repeat_size, uint64_t refStartPos, uint64_t refEndPos, std::vector& all_repeat_regions, std::map > & p_rep_dict){ - chr_dit = p_rep_dict.find(repeat_size); - if (chr_dit!=p_rep_dict.end()){ - for (pos_it=chr_dit->second.begin(); pos_it!=chr_dit->second.end(); pos_it++){ - ovlp_max_start = pos_it->second.start_pos > refStartPos ? pos_it->second.start_pos : refStartPos; - ovlp_min_end = pos_it->second.end_pos < refEndPos ? pos_it->second.end_pos : refEndPos; - if (ovlp_min_end >= ovlp_max_start + get_c_min_ovlp_len(refEndPos, refStartPos, pos_it->second.end_pos, pos_it->second.start_pos, min_ovlp_len)){ - RepeatRegion repeat_region; - if (isExpectedLength(repeat_region.repeat_size, repeat_size, CHAR_SIZE)){ - repeat_region.start_pos = pos_it->second.start_pos; - repeat_region.end_pos = pos_it->second.end_pos; - repeat_region.len_repeat_unit = pos_it->second.len_repeat_unit; - all_repeat_regions.push_back(repeat_region); - } - } - } - } -} -int BamReadOption::check_specifiedRegion(const char * repeat_size, uint64_t refStartPos, uint64_t refEndPos){ - if (m_w_specifiedRegion){ - chr_dit = repdict.find(repeat_size); - if (chr_dit==repdict.end()){ - return -1; - } - - ovlp_region = false; - for (pos_it=chr_dit->second.begin(); pos_it!=chr_dit->second.end(); pos_it++){ - ovlp_max_start = pos_it->second.start_pos > refStartPos ? pos_it->second.start_pos : refStartPos; - ovlp_min_end = pos_it->second.end_pos < refEndPos ? pos_it->second.end_pos : refEndPos; - if (ovlp_min_end >= ovlp_max_start + get_c_min_ovlp_len(refEndPos, refStartPos, pos_it->second.end_pos, pos_it->second.start_pos, min_ovlp_len)){ - ovlp_region = true; - break; - } - } - if (ovlp_region){ return 2; } - else { return -2;} - }else{ - return 1; - } -} - -std::map >::iterator BamReadOption::get_chr_it(){ - return chr_dit; -} -std::map::iterator BamReadOption::get_pos_it(){ - return pos_it; -} - - -const char BamReader::m_cigar_str[] = "MIDNSHP=XB"; - -int BamReader::openBam(const char * bamfile){ - if (!isExpectedLength(in_bam_file, (char*)bamfile, CHAR_SIZE)) { return BAM_FAILED;} - - _bam_file_str = std::string(bamfile); - - bam_one_alignment = bam_init1(); - bam_1alignment_core = &bam_one_alignment->core; - - in_bam = sam_open(bamfile, "rb"); - if (NULL == in_bam){ - std::cerr<< "Cannot open SAM file ("<current_bam_record = br1; - //br_list.clear(); - return openBam(bamfile); -} - -std::string BamReader::get_bam_file(){ - return _bam_file_str; -} - -BamReader::~BamReader(){ - destroy(); -} -void BamReader::destroy(){ - if (bam_one_alignment!=NULL){ - bam_destroy1(bam_one_alignment); - } - if (hdr!=NULL){ - bam_hdr_destroy(hdr); - } - if (in_bam!=NULL){ - sam_close(in_bam); - } - bam_status = BAM_CLOSE; - init(); -} - -bool BamReader::check_bam_status(){ - if (bam_status==BAM_OPEN){ - return true; - }else{ - return false; - } -} - -int BamReader::read1RecordFromBam(){ - if (!check_bam_status()){ - std::cout<< "No bam opened or Open bam failed." << std::endl; - return 1; - } - - int failed = 2; - while (sam_read1(in_bam, hdr, bam_one_alignment)>=0){ - if (_read1RecordFromBam_()==0){ - failed = 0; - break; - } - } - - return failed; -} - -int BamReader::reset_Bam1Record(){ - return reset_Bam1Record(this->current_bam_record); -} -int BamReader::reset_Bam1Record(Bam1Record & br){ - br.cigar_len.clear(); - br.cigar_type.clear(); - if (br.cigar_len.capacity()<1000){ br.cigar_len.reserve(1000); } - if (br.cigar_type.capacity()<1000){ br.cigar_type.reserve(1000); } - - br.map_detail.clear(); - br.map_pos_detail.clear(); - if (br.map_pos_detail.capacity()<50000){ br.map_pos_detail.reserve(50000); } - - br.qry_qual.clear(); - br.qry_name.clear(); - br.qry_seq.clear(); - - br.map_chr.clear(); - - br.qry_seq.clear(); - br.qry_qual.clear(); - - br._len_original_read = 0; - - return 0; -} - -void BamReader::_set_map_pos_detail(uint64_t ref_go_pos, uint64_t qry_go_pos, int mlen, Bam1Record & br, uint8_t m_op, bool ref_add, bool qry_add, const uint16_t m_map_strand, const uint64_t m_len_original_read){ - //std::cout<<" in _set_map_pos_detail "<=m_len_original_read){ - std::cout<< "Error!!! mbp.qry_pos("<< mbp.qry_pos << ") >= m_len_original_read(" <current_bam_record); - - Bam1Record & br = this->current_bam_record; - - // Get the number of mismatched bases using the MD tag - uint8_t *nmTag = bam_aux_get(bam_one_alignment, "NM"); - if (nmTag != NULL) { - br.num_mismatch = bam_aux2i(nmTag); - } else { - br.num_mismatch = -1; - } - - br.qry_name = bam_get_qname(bam_one_alignment); - br.map_flag = bam_1alignment_core->flag; - if (!bamReadOp.check_unmap(br.map_flag)){ - return 1; - } - if (!bamReadOp.check_supplementary(br.map_flag)){ - return 2; - } - if (!bamReadOp.check_secondary(br.map_flag)){ - return 3; - } - br.map_qual = bam_1alignment_core->qual; - if ( br.map_qual < bamReadOp.get_map_quality_thr()){ - return 505; - } - - br.pri_sec_sup = 0; - if (br.map_flag & BAM_FSUPPLEMENTARY) { br.pri_sec_sup = 1000; } - if (br.map_flag & BAM_FSECONDARY) { br.pri_sec_sup = 100; } - br.ref_start_pos = bam_1alignment_core->pos; - br.ref_end_pos = bam_endpos(bam_one_alignment); - - br.map_strand = 0; // forward; - if (bam_is_rev(bam_one_alignment) ) { br.map_strand = 1; } - if ( !( br.map_flag & BAM_FUNMAP ) ){ - br.map_chr = hdr->target_name[bam_1alignment_core->tid]; - }else{ br.map_chr = ""; } - - br.qry_seq_len = bam_1alignment_core->l_qseq; - if ( br.map_flag & BAM_FSECONDARY ){ - size_t cal_len = bam_cigar2qlen(bam_1alignment_core->n_cigar, bam_get_cigar(bam_one_alignment)); - if ( cal_len > br.qry_seq_len){ br.qry_seq_len = cal_len; } - } - if (!bamReadOp.check_min_qry_len(br.qry_seq_len)){ - return 4; - } - if ( (bamReadOp.get_w_qry_seq() || bamReadOp.get_w_pos_map_detail()) && (!( br.map_flag & BAM_FSECONDARY )) ){ - uint8_t * seq_int = bam_get_seq(bam_one_alignment); - for (uint64_t sqii=0; sqii < br.qry_seq_len; sqii++){ - br.qry_seq.push_back(seq_nt16_str[bam_seqi(seq_int, sqii)]); - } - } - - if (bamReadOp.get_w_qry_qual()){ - if (!( br.map_flag & BAM_FSECONDARY )) { br.qry_qual.append(bam_get_qual(bam_one_alignment)); } - } - - if(bamReadOp.get_w_specifiedRegion() && bamReadOp.check_specifiedRegion(br.map_chr.c_str(), br.ref_start_pos, br.ref_end_pos) < 0) { - return 5; - } - // get qry information; - uint32_t* m_cigar = bam_get_cigar(bam_one_alignment); - - int m_op, m_next_op, m_pre_op; - int m_len; - if (bamReadOp.get_w_pos_map_detail()){; // || bamReadOp.get_w_map_detail()){; - }else{ - for (uint32_t m_i_cigar=0; m_i_cigarn_cigar; ++m_i_cigar){ - m_op = bam_cigar_op(m_cigar[m_i_cigar]); br.cigar_type.push_back(m_op); - m_len = bam_cigar_oplen(m_cigar[m_i_cigar]); br.cigar_len.push_back(m_len); - } - return 0; - } - - br.left_clip = 0; - br.right_clip = 0; - ref_go_pos = br.ref_start_pos; - qry_go_pos = 0; qry_go_pos_rel = 0; - bool first_non_indel_clip = false; - br._len_original_read = 0; - uint64_t _len_read = 0; - uint64_t _len_align = 0; - for (uint32_t m_i_cigar=0; m_i_cigarn_cigar; ++m_i_cigar){ - m_op = bam_cigar_op(m_cigar[m_i_cigar]); //br.cigar_type.push_back(m_op); - if (m_op==BAM_CDEL || m_op==BAM_CREF_SKIP || m_op==BAM_CPAD || m_op==BAM_CBACK){ - continue; - } - br._len_original_read += bam_cigar_oplen(m_cigar[m_i_cigar]); //br.cigar_len.push_back(m_len); - if (m_op!=BAM_CHARD_CLIP){ - _len_read += bam_cigar_oplen(m_cigar[m_i_cigar]); //br.cigar_len.push_back(m_len); - } - if ((m_op!=BAM_CHARD_CLIP) && (m_op!=BAM_CSOFT_CLIP)){ - _len_align += bam_cigar_oplen(m_cigar[m_i_cigar]); - } - } - - if (bamReadOp.get_min_read_len()>_len_align){return 6;} - - for (uint32_t m_i_cigar=0; m_i_cigarn_cigar; ++m_i_cigar){ - m_op = bam_cigar_op(m_cigar[m_i_cigar]); br.cigar_type.push_back(m_op); - m_len = bam_cigar_oplen(m_cigar[m_i_cigar]); br.cigar_len.push_back(m_len); - - if (m_i_cigar==0){ - if (m_op==BAM_CSOFT_CLIP || m_op==BAM_CHARD_CLIP){ - if (br.map_strand==0) { br.left_clip = m_len; } - else{ br.right_clip = m_len;} - if (m_i_cigar+1n_cigar){ - m_next_op = bam_cigar_op(m_cigar[m_i_cigar+1]); - if (m_next_op!=BAM_CEQUAL && m_next_op!=BAM_CMATCH){ - std::cout<<"Warning-non-match! The second aligned bases are not matached "<< br.qry_name <<" in " << _bam_file_str <<" " << m_i_cigar+1<<":"<< m_next_op << "/" << bam_cigar_oplen(m_cigar[m_i_cigar+1]) <n_cigar-1){ - if (m_op==BAM_CSOFT_CLIP || m_op==BAM_CHARD_CLIP){ - if (br.map_strand==0) { br.right_clip = m_len; } - else{ br.left_clip = m_len; } - if (m_i_cigar>0){ - m_pre_op = bam_cigar_op(m_cigar[m_i_cigar-1]); - if (m_pre_op !=BAM_CEQUAL && m_pre_op!=BAM_CMATCH){ - std::cout<<"Warning-non-match! The last second aligned bases are not matached "<< br.qry_name<<" in " << _bam_file_str <<" " << m_i_cigar-1<<":"<< m_pre_op << "/" << bam_cigar_oplen(m_cigar[m_i_cigar-1]) <0){ // to include br.qry_end_pos - br.qry_end_pos = br.qry_end_pos - 1; - }else{ - return -1; - } - return 0; -} - -int BamReader::readBam(std::vector &br_list, int num_records, bool br_clear){ - if (!check_bam_status()){ - std::cout<< "No bam opened or Open bam failed." << std::endl; - return 1; - } - if ( br_clear){ - br_list.clear(); - } - int sam_ret; - t_num_records = 0; - while (sam_read1(in_bam, hdr, bam_one_alignment)>=0){ - if ((sam_ret=_read1RecordFromBam_())==0){ - br_list.push_back(this->current_bam_record); - t_num_records += 1; - if (num_records>0 && t_num_records>=num_records){ - break; - } - } - } - - return 0; -} - -std::string BamReader::Min_Bam1Record_toString(Bam1Record & br){ - std::ostringstream oss_tostr; - oss_tostr << br.qry_name << ":" << br.qry_start_pos<<"-" << br.qry_end_pos<<"/"<topn?topn:br.cigar_len.size()); cgi++){ - oss_tostr << br.cigar_type[cgi]<<":"<::iterator mbp1_it; - oss_tostr <<"Map_pos=" << br.map_pos_detail.size() << "/"<< (br.map_pos_detail.size() - br.left_clip - br.right_clip) << "\n"; - for (mbp1_it=br.map_pos_detail.begin(); mbp1_it!=br.map_pos_detail.end(); mbp1_it++){ - if (topi>=topn){break;} - oss_tostr << "\t" <ref_pos<<"<-->"<qry_pos<<" <---"<map_type; - topi++; - } - oss_tostr << "\n"; - - return oss_tostr.str(); -} - -std::string BamReader::Bam1Record_toString(){ - return Bam1Record_toString(this->current_bam_record); -} - - - - diff --git a/src/bam_module.cpp b/src/bam_module.cpp new file mode 100644 index 0000000..a366b81 --- /dev/null +++ b/src/bam_module.cpp @@ -0,0 +1,109 @@ +/* +BAM_module.cpp: +Class for generating BAM file statistics. Records are accessed using multi-threading. +*/ + +#include +#include +#include +#include + +#include "bam_module.h" +#include "ComFunction.h" + + +int BAM_Module::calculateStatistics(Input_Para& input_params, Output_BAM& final_output){ + int exit_code = 0; + auto relapse_start_time = std::chrono::high_resolution_clock::now(); + + // Create mutexes + std::mutex bam_mutex; + std::mutex output_mutex; + std::mutex cout_mutex; + + // Loop through the input files + int file_count = (int) input_params.num_input_files; + //std::cout << "Number of input files = " << file_count << std::endl; + for (int i=0; i < file_count; i++){ + this->file_index = i; + + // Create a BAM reader + std::string filepath(input_params.input_files[this->file_index]); + HTSReader reader(filepath); + std::cout<<"Processing file: "<< filepath << std::endl; + + // Get the number of threads + int thread_count = input_params.threads; + + // Get the number of records in the file using the BAM index + std::cout << "Getting number of records..." << std::endl; + int num_records = reader.getNumRecords(filepath); + std::cout << "Number of records = " << num_records << std::endl; + + // Determine the batch size if the thread count is greater than 1 + int batch_size = 0; + if (thread_count > 1) { + // Determine the number of records per thread + batch_size = (int) ceil((double)num_records / (double)thread_count); + std::cout << "Batch size (records per thread) = " << batch_size << std::endl; + } else { + // Set the batch size to the number of records + batch_size = num_records; + } + + // Calculate statistics in batches + while (reader.hasNextRecord()){ + std::cout << "Generating " << thread_count << " thread(s)..." << std::endl; + std::vector thread_vector; + for (int thread_index=0; thread_index{:,d}{:," \ + "d}{:,d} " + double_str_for_format = "{}{:.1f}{:.1f}{:.1f} " + table_str += int_str_for_format.format("#Total Reads", bam_output.mapped_long_read_info.total_num_reads, + bam_output.unmapped_long_read_info.total_num_reads, + bam_output.long_read_info.total_num_reads) + table_str += int_str_for_format.format("#Total Bases", + bam_output.mapped_long_read_info.total_num_bases, + bam_output.unmapped_long_read_info.total_num_bases, + bam_output.long_read_info.total_num_bases) + table_str += int_str_for_format.format("Longest Read Length", + bam_output.mapped_long_read_info.longest_read_length, + bam_output.unmapped_long_read_info.longest_read_length, + bam_output.long_read_info.longest_read_length) + table_str += int_str_for_format.format("N50", + bam_output.mapped_long_read_info.n50_read_length, + bam_output.unmapped_long_read_info.n50_read_length, + bam_output.long_read_info.n50_read_length) + table_str += double_str_for_format.format("GC Content(%)", + bam_output.mapped_long_read_info.gc_cnt * 100, + bam_output.unmapped_long_read_info.gc_cnt * 100, + bam_output.long_read_info.gc_cnt * 100) + table_str += double_str_for_format.format("Mean Read Length", + bam_output.mapped_long_read_info.mean_read_length, + bam_output.unmapped_long_read_info.mean_read_length, + bam_output.long_read_info.mean_read_length) + table_str += int_str_for_format.format("Median Read Length", + bam_output.mapped_long_read_info.median_read_length, + bam_output.unmapped_long_read_info.median_read_length, + bam_output.long_read_info.median_read_length) + table_str += "\n\n" + + plot_filepaths["basic_st"]['detail'] = table_str + + +def plot(bam_output, para_dict): + out_path = para_dict["output_folder"] + plot_filepaths = getDefaultPlotFilenames() + get_image_path = lambda x: os.path.join(out_path, plot_filepaths[x]['file']) + + # Set the default matplotlib font size + setDefaultFontSize(12) + + # Get the font size for plotly plots + font_size = para_dict["fontsize"] + + # Create the summary table + create_summary_table(bam_output, plot_filepaths) + + # Generate plots + plot_alignment_numbers(bam_output, get_image_path('map_st')) + plot_errors(bam_output, get_image_path('err_st')) + + plot_read_length_stats( + [bam_output.long_read_info, bam_output.mapped_long_read_info, bam_output.unmapped_long_read_info], + get_image_path('read_length_st'), subtitles=['All Reads', 'Mapped Reads', 'Unmapped Reads']) + plot_base_counts([bam_output.long_read_info, bam_output.mapped_long_read_info, bam_output.unmapped_long_read_info], + get_image_path('base_st'), subtitles=['All Reads', 'Mapped Reads', 'Unmapped Reads']) + plot_basic_info([bam_output.long_read_info, bam_output.mapped_long_read_info, bam_output.unmapped_long_read_info], + get_image_path('basic_info'), categories=['All Reads', 'Mapped Reads', 'Unmapped Reads']) + + plot_filepaths['read_length_hist']['dynamic'] = read_lengths_histogram(bam_output.long_read_info, + get_image_path('read_length_hist'), + font_size) + plot_filepaths['base_quality']['dynamic'] = base_quality(bam_output.seq_quality_info, + get_image_path('base_quality'), font_size) + + return plot_filepaths diff --git a/src/BasicStatistics.cpp b/src/basic_statistics.cpp similarity index 98% rename from src/BasicStatistics.cpp rename to src/basic_statistics.cpp index 84ce6d0..53e0eb9 100644 --- a/src/BasicStatistics.cpp +++ b/src/basic_statistics.cpp @@ -9,7 +9,7 @@ Utility functions for basic statistics #include // std::foreach #include // sqrt, pow -#include "BasicStatistics.h" +#include "basic_statistics.h" // Compute basic statistics double computeMean(std::vector values) diff --git a/src/cli.py b/src/cli.py index 2c58a99..e3ceb6b 100644 --- a/src/cli.py +++ b/src/cli.py @@ -9,30 +9,35 @@ import argparse from argparse import RawTextHelpFormatter -import lrst -import generate_html -from plot_utils import * +# import lrst + +# Print the package name +# print("Package name: " + __name__) +if __name__ == 'src.cli': + # print("Running locally.") + from lib import lrst # For debugging + from src import generate_html + from src.plot_utils import * +else: + # print("Running from installed package.") + import lrst + import generate_html + from plot_utils import * logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +prg_name = "LongReadSum" -def get_log_level(log_l): - if log_l == lrst_global.LOG_ALL: - return logging.ALL - elif log_l == lrst_global.LOG_DEBUG: - return logging.DEBUG - elif log_l == lrst_global.LOG_INFO: - return logging.INFO - elif log_l == lrst_global.LOG_WARN: - return logging.WARN - elif log_l == lrst_global.LOG_ERROR: - return logging.ERROR - elif log_l == lrst_global.LOG_FATAL: - return logging.FATAL - elif log_l == lrst_global.LOG_OFF: - return logging.OFF - else: - return logging.ERROR +# Return the log level type from the code (Default: ERROR) +def get_log_level(level_code): + switch = { + 1: logging.DEBUG, + 2: logging.INFO, + 3: logging.WARN, + 4: logging.ERROR, + 5: logging.FATAL, + } + return switch.get(level_code, logging.ERROR) def get_common_param(margs): @@ -43,9 +48,11 @@ def get_common_param(margs): margs: Command line input arguments (dict). """ param_dict = {} + param_dict["prg_name"] = prg_name this_error_str = "" - if (margs.input == None or margs.input == "") and (margs.inputs == None or margs.inputs == "") and (margs.inputPattern == None or margs.inputPattern == ""): + if (margs.input == None or margs.input == "") and (margs.inputs == None or margs.inputs == "") and ( + margs.inputPattern == None or margs.inputPattern == ""): this_error_str += "No input file(s) are provided. \n" else: # Group parameters into an array @@ -59,7 +66,7 @@ def get_common_param(margs): if not (margs.inputPattern == None or margs.inputPattern == ""): pat_split = margs.inputPattern.split("*") param_dict["input_files"].extend( - glob(os.path.join("*".join(pat_split[:-1]), "*"+pat_split[-1]))) + glob(os.path.join("*".join(pat_split[:-1]), "*" + pat_split[-1]))) # Number of reads to sample read_count = margs.readCount @@ -80,13 +87,13 @@ def get_common_param(margs): try: if not os.path.isdir(output_dir): os.makedirs(output_dir) - if not os.path.isdir(output_dir + '/'+lrst_global.default_image_path): + if not os.path.isdir(output_dir + '/' + getDefaultImageFolder()): os.makedirs(output_dir + '/' + - lrst_global.default_image_path) + getDefaultImageFolder()) except OSError as e: this_error_str += "Cannot create folder for " + \ - param_dict["output_folder"]+" \n" + param_dict["output_folder"] + " \n" param_dict["out_prefix"] = margs.outprefix if (margs.log == None or margs.log == ""): @@ -133,7 +140,7 @@ def fq_module(margs): input_para.downsample_percentage = param_dict["downsample_percentage"] input_para.other_flags = 0 - input_para.user_defined_fastq_base_qual_offset = margs.udqual; + input_para.user_defined_fastq_base_qual_offset = margs.udqual; input_para.output_folder = str(param_dict["output_folder"]) input_para.out_prefix = str(param_dict["out_prefix"]) @@ -144,10 +151,11 @@ def fq_module(margs): fq_output = lrst.Output_FQ() exit_code = lrst.callFASTQModule(input_para, fq_output) if exit_code == 0: - create_base_quality_plots(fq_output, param_dict, "FASTQ: Basic statistics") + plot_filepaths = create_base_quality_plots(fq_output, param_dict, "FASTQ: Basic statistics") for static in [True, False]: fq_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st","read_length_hist", "base_st", "basic_info", "base_quality", "read_avg_base_quality"], "FASTQ QC", param_dict], static=static) + [["basic_st", "read_length_st", "read_length_hist", "base_st", "basic_info", "base_quality", + "read_avg_base_quality"], "FASTQ QC", param_dict], plot_filepaths, static=static) fq_html_gen.generate_st_html() logging.info("Completed.") @@ -187,13 +195,14 @@ def fa_module(margs): if exit_code == 0: logging.info("QC generated.") logging.info("Generating output files...") - from src import plot_for_FA - plot_for_FA.fa_plot(fa_output, param_dict) + from src import fasta_plot + plot_filepaths = fasta_plot.plot(fa_output, param_dict) # TODO: Unused 'static' variable results in redundant function call for static in [True, False]: fa_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st","read_length_hist", "base_st", "basic_info"], "FASTA QC", param_dict], static=True) + [["basic_st", "read_length_st", "read_length_hist", "base_st", "basic_info"], "FASTA QC", + param_dict], plot_filepaths, static=True) fa_html_gen.generate_st_html() else: @@ -219,24 +228,23 @@ def bam_module(margs): input_para.threads = param_dict["threads"] input_para.rdm_seed = param_dict["random_seed"] input_para.downsample_percentage = param_dict["downsample_percentage"] - input_para.other_flags = (1 if param_dict["detail"]>0 else 0) ; + input_para.other_flags = (1 if param_dict["detail"] > 0 else 0); input_para.output_folder = str(param_dict["output_folder"]) input_para.out_prefix = str(param_dict["out_prefix"]) - for _ipf in param_dict["input_files"]: input_para.add_input_file(str(_ipf)) bam_output = lrst.Output_BAM() exit_code = lrst.callBAMModule(input_para, bam_output) if exit_code == 0: - logging.info("QC generated.") logging.info("Generating output files...") - from src import plot_for_BAM - plot_for_BAM.bam_plot(bam_output, param_dict) + from src import bam_plot + plot_filepaths = bam_plot.plot(bam_output, param_dict) for static in [True, False]: bam_html_gen = generate_html.ST_HTML_Generator( - [["basic_st","map_st", "err_st", "read_length_st", "read_length_hist", "base_st", "basic_info", "base_quality"], "BAM QC", param_dict], static=static) + [["basic_st", "map_st", "err_st", "read_length_st", "read_length_hist", "base_st", "basic_info", + "base_quality"], "BAM QC", param_dict], plot_filepaths, static=static) bam_html_gen.generate_st_html() else: @@ -244,6 +252,7 @@ def bam_module(margs): logging.info("Done.") + def seqtxt_module(margs): """ Run the sequencing_summary.txt filetype module. @@ -263,7 +272,7 @@ def seqtxt_module(margs): input_para.downsample_percentage = param_dict["downsample_percentage"] input_para.other_flags = margs.seq # Default = 1 input_para.other_flags = (input_para.other_flags << 4) - input_para.other_flags += (1 if param_dict["detail"]>0 else 0) + input_para.other_flags += (1 if param_dict["detail"] > 0 else 0) input_para.other_flags = (input_para.other_flags << 4) input_para.other_flags += int(margs.sum_type) @@ -278,20 +287,24 @@ def seqtxt_module(margs): if exit_code == 0: logging.info("QC generated.") logging.info("Generating output files...") - from src import plot_for_SeqTxt - plot_for_SeqTxt.plot(seqtxt_output, param_dict) + from src import seqtxt_plot + plot_filepaths = seqtxt_plot.plot(seqtxt_output, param_dict) for static in [True, False]: - if margs.seq==0: - f5_html_gen = generate_html.ST_HTML_Generator([["basic_st", "read_length_st","read_length_hist","base_st","basic_info"], "sequencing_summary.txt QC", param_dict], static=static); + if margs.seq == 0: + f5_html_gen = generate_html.ST_HTML_Generator( + [["basic_st", "read_length_st", "read_length_hist", "base_st", "basic_info"], + "sequencing_summary.txt QC", param_dict], plot_filepaths, static=static) else: f5_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st","read_length_hist", "basic_info"], "sequencing_summary.txt QC", param_dict], static=static) + [["basic_st", "read_length_st", "read_length_hist", "basic_info"], "sequencing_summary.txt QC", + param_dict], plot_filepaths, static=static) f5_html_gen.generate_st_html() else: logging.error("QC did not generate.") logging.info("Done.") + def fast5_module(margs): """ Run the FAST5 filetype module. @@ -321,16 +334,18 @@ def fast5_module(margs): if exit_code == 0: logging.info("QC generated.") logging.info("Generating output files...") - create_base_quality_plots(fast5_output, param_dict, "FAST5: Basic statistics") + plot_filepaths = create_base_quality_plots(fast5_output, param_dict, "FAST5: Basic statistics") for static in [True, False]: fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_st","read_length_hist", "base_st", "basic_info", "base_quality", "read_avg_base_quality"], "FAST5 QC", param_dict], static=static) + [["basic_st", "read_length_st", "read_length_hist", "base_st", "basic_info", "base_quality", + "read_avg_base_quality"], "FAST5 QC", param_dict], plot_filepaths, static=static) fast5_html_obj.generate_st_html() else: logging.error("QC did not generate.") logging.info("Done.") + def fast5_signal_module(margs): """ Run the FAST5 filetype module with signal statistics output. @@ -361,12 +376,12 @@ def fast5_signal_module(margs): if exit_code == 0: logging.info("QC generated.") logging.info("Generating output files...") - from src import plot_for_FAST5s - dynamic_plots = plot_for_FAST5s.plot(fast5_output, param_dict) + from src import fast5_signal_plot + dynamic_plots, plot_filepaths = fast5_signal_plot.plot(fast5_output, param_dict) # Generate a dynamic HTML file fast5_html_obj = generate_html.ST_HTML_Generator( - [[], "FAST5 signal QC", param_dict], static=False) + [[], "FAST5 signal QC", param_dict], plot_filepaths, static=False) fast5_html_obj.generate_st_html(signal_plots=dynamic_plots) else: logging.error("QC did not generate.") @@ -400,7 +415,7 @@ def fast5_signal_module(margs): input_files_group.add_argument( "-I", "--inputs", type=str, default=None, - help="Multiple comma-separated input filepaths",) + help="Multiple comma-separated input filepaths", ) input_files_group.add_argument( "-P", "--inputPattern", type=str, default=None, @@ -424,11 +439,11 @@ def fast5_signal_module(margs): common_grp_param.add_argument( "-g", "--log", type=str, default="log_output.log", help="Log file") -common_grp_param.add_argument("-G", "--Log_level", type=int, default=lrst_global.LOG_ERROR, +common_grp_param.add_argument("-G", "--Log_level", type=int, default=4, help="Level for logging: ALL(0) < DEBUG(1) < INFO(2) < WARN(3) < ERROR(4) < FATAL(5) < OFF(6). Default: 4 (ERROR)") common_grp_param.add_argument("-o", "--outputfolder", type=str, - default="output_" + lrst_global.prg_name, help="The output folder.") + default="output_" + prg_name, help="The output folder.") common_grp_param.add_argument("-t", "--thread", type=int, default=1, help="The number of threads used. Default: 1.") common_grp_param.add_argument("-Q", "--outprefix", type=str, @@ -469,11 +484,11 @@ def fast5_signal_module(margs): # FAST5 signal mode inputs fast5_signal_parser = subparsers.add_parser('f5s', - parents=[parent_parser], - help="FAST5 file input with signal statistics output", - description="For example:\n" - "python %(prog)s -R 5 10 -i input.fast5 -o /output_directory/", - formatter_class=RawTextHelpFormatter) + parents=[parent_parser], + help="FAST5 file input with signal statistics output", + description="For example:\n" + "python %(prog)s -R 5 10 -i input.fast5 -o /output_directory/", + formatter_class=RawTextHelpFormatter) fast5_signal_parser.set_defaults(func=fast5_signal_module) # sequencing_summary.txt inputs @@ -503,11 +518,11 @@ def fast5_signal_module(margs): # ===== def main(): if sys.version_info[0] < 2: - logging.info(lrst_global.prg_name + - " could not be run with lower version than python 2.7.") + logging.info(prg_name + + " could not be run with lower version than python 2.7.") else: if sys.version_info[1] < 6: - logging.info(lrst_global.prg_name+" could be run with python 2.7.") + logging.info(prg_name + " could be run with python 2.7.") else: if len(sys.argv) < 2: parser.print_help() diff --git a/src/FAST5_module.cpp b/src/fast5_module.cpp similarity index 96% rename from src/FAST5_module.cpp rename to src/fast5_module.cpp index 87758a1..a13698e 100644 --- a/src/FAST5_module.cpp +++ b/src/fast5_module.cpp @@ -21,7 +21,7 @@ Class for calling FAST5 statistics modules. #include #include -#include "FAST5_module.h" +#include "fast5_module.h" #include "ComFunction.h" #include "H5Cpp.h" @@ -586,14 +586,14 @@ int generateQCForFAST5(Input_Para &_input_data, Output_FAST5 &output_data) // Sort the read lengths in descending order std::vector read_lengths = output_data.long_read_info.read_lengths; - std::sort(read_lengths.begin(), read_lengths.end(), std::greater()); + std::sort(read_lengths.begin(), read_lengths.end(), std::greater()); // Get the max read length - int max_read_length = read_lengths.at(0); + int64_t max_read_length = read_lengths.at(0); output_data.long_read_info.longest_read_length = max_read_length; // Get the median read length - int median_read_length = read_lengths[read_lengths.size() / 2]; + int64_t median_read_length = read_lengths[read_lengths.size() / 2]; output_data.long_read_info.median_read_length = median_read_length; // Get the mean read length @@ -625,17 +625,17 @@ int generateQCForFAST5(Input_Para &_input_data, Output_FAST5 &output_data) // Create the summary file std::cout << "Writing summary file: " << read_summary_file.c_str() << std::endl; read_summary_fp = fopen(read_summary_file.c_str(), "w"); - fprintf(read_summary_fp, "total number of reads\t%ld\n", output_data.long_read_info.total_num_reads); + fprintf(read_summary_fp, "total number of reads\t%d\n", output_data.long_read_info.total_num_reads); fprintf(read_summary_fp, "total number of bases\t%ld\n", output_data.long_read_info.total_num_bases); - fprintf(read_summary_fp, "longest read length\t%lu\n", output_data.long_read_info.longest_read_length); - fprintf(read_summary_fp, "N50 read length\t%ld\n", output_data.long_read_info.n50_read_length); + fprintf(read_summary_fp, "longest read length\t%d\n", output_data.long_read_info.longest_read_length); + fprintf(read_summary_fp, "N50 read length\t%d\n", output_data.long_read_info.n50_read_length); fprintf(read_summary_fp, "mean read length\t%.2f\n", output_data.long_read_info.mean_read_length); - fprintf(read_summary_fp, "median read length\t%ld\n", output_data.long_read_info.median_read_length); + fprintf(read_summary_fp, "median read length\t%d\n", output_data.long_read_info.median_read_length); fprintf(read_summary_fp, "GC%%\t%.2f\n", output_data.long_read_info.gc_cnt * 100); fprintf(read_summary_fp, "\n\n"); for (int percent = 5; percent < 100; percent += 5) { - fprintf(read_summary_fp, "N%02d read length\t%.ld\n", percent, output_data.long_read_info.NXX_read_length[percent]); + fprintf(read_summary_fp, "N%02d read length\t%.d\n", percent, output_data.long_read_info.NXX_read_length[percent]); } fprintf(read_summary_fp, "\n\n"); @@ -643,7 +643,7 @@ int generateQCForFAST5(Input_Para &_input_data, Output_FAST5 &output_data) fprintf(read_summary_fp, "GC content\tnumber of reads\n"); for (int gc_ratio = 0; gc_ratio < 100; gc_ratio++) { - fprintf(read_summary_fp, "GC=%d%%\t%ld\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio]); + fprintf(read_summary_fp, "GC=%d%%\t%d\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio]); } @@ -651,14 +651,14 @@ int generateQCForFAST5(Input_Para &_input_data, Output_FAST5 &output_data) fprintf(read_summary_fp, "base quality\tnumber of bases\n"); for (int baseq = 0; baseq <= 60; baseq++) { - fprintf(read_summary_fp, "%d\t%ld\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq]); + fprintf(read_summary_fp, "%d\t%d\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq]); } fprintf(read_summary_fp, "\n\n"); fprintf(read_summary_fp, "read average base quality\tnumber of reads\n"); for (int baseq = 0; baseq <= 60; baseq++) { - fprintf(read_summary_fp, "%d\t%ld\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq]); + fprintf(read_summary_fp, "%d\t%d\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq]); } fclose(read_summary_fp); } diff --git a/src/plot_for_FAST5s.py b/src/fast5_signal_plot.py similarity index 92% rename from src/plot_for_FAST5s.py rename to src/fast5_signal_plot.py index 99b42d6..002fdc9 100644 --- a/src/plot_for_FAST5s.py +++ b/src/fast5_signal_plot.py @@ -9,11 +9,12 @@ import numpy as np import plotly.graph_objs as go from random import sample -\ + if __package__ == 'src': - from src import lrst_global + from src.plot_utils import * else: - import lrst_global + from plot_utils import * + def plot(fast5_output, para_dict): """ @@ -22,10 +23,11 @@ def plot(fast5_output, para_dict): out_path = para_dict["output_folder"] # Set up the global variable with HTML titles - lrst_global.plot_filenames["basic_st"] = {} - lrst_global.plot_filenames["basic_st"]['file'] = "" - lrst_global.plot_filenames["basic_st"]['title'] = "Summary Table" - lrst_global.plot_filenames["basic_st"]['description'] = "FAST5: Basic statistics" + plot_filepaths = getDefaultPlotFilenames() + plot_filepaths["basic_st"] = {} + plot_filepaths["basic_st"]['file'] = "" + plot_filepaths["basic_st"]['title'] = "Summary Table" + plot_filepaths["basic_st"]['description'] = "FAST5: Basic statistics" # Get values read_count = fast5_output.getReadCount() @@ -38,7 +40,7 @@ def plot(fast5_output, para_dict): table_str += int_str_for_format.format("#Total Reads", read_count) table_str += int_str_for_format.format("#Total Bases", total_base_count) table_str += "\n\n" - lrst_global.plot_filenames["basic_st"]['detail'] = table_str + plot_filepaths["basic_st"]['detail'] = table_str # Randomly sample a small set of reads if it is a large dataset read_count_max = para_dict["read_count"] @@ -146,4 +148,4 @@ def plot(fast5_output, para_dict): dynamic_html = fig.to_html(full_html=False) output_html_plots.update({nth_read_name: dynamic_html}) - return output_html_plots + return output_html_plots, plot_filepaths diff --git a/src/FASTA_module.cpp b/src/fasta_module.cpp similarity index 90% rename from src/FASTA_module.cpp rename to src/fasta_module.cpp index 356b4ca..aef98fa 100644 --- a/src/FASTA_module.cpp +++ b/src/fasta_module.cpp @@ -11,7 +11,7 @@ FASTA_module.cpp: #include #include #include "kseq.h" -#include "FASTA_module.h" +#include "fasta_module.h" KSEQ_INIT(gzFile, gzread) // this is a macro defined in kseq.h @@ -23,7 +23,7 @@ static int qc1fasta(const char *input_file, Output_FA &py_output_fa, FILE *read_ kseq_t *seq; char *read_seq; char *read_name; - int read_len; + int64_t read_len; double read_gc_cnt; Basic_Seq_Statistics &long_read_info = py_output_fa.long_read_info; @@ -39,13 +39,13 @@ static int qc1fasta(const char *input_file, Output_FA &py_output_fa, FILE *read_ if (read_len == 0) {continue;} read_name = seq->name.s; read_seq = seq->seq.s; - if ((uint64_t)read_len > long_read_info.longest_read_length) + if ((int64_t)read_len > long_read_info.longest_read_length) { long_read_info.longest_read_length = read_len; } long_read_info.total_num_reads += 1; long_read_info.total_num_bases += read_len; - if ((uint64_t)read_len < long_read_info.read_length_count.size()) { + if (read_len < (int64_t) long_read_info.read_length_count.size()) { long_read_info.read_length_count[read_len] += 1; } else { long_read_info.read_length_count.resize(read_len + 1000, 0); @@ -76,7 +76,7 @@ static int qc1fasta(const char *input_file, Output_FA &py_output_fa, FILE *read_ } read_gc_cnt = 100.0 * read_gc_cnt / (double)read_len; long_read_info.read_gc_content_count[(int)(read_gc_cnt + 0.5)] += 1; - fprintf(read_details_fp, "%s\t%d\t%.2f\n", read_name, read_len, read_gc_cnt); + fprintf(read_details_fp, "%s\t%ld\t%.2f\n", read_name, read_len, read_gc_cnt); } kseq_destroy(seq); @@ -166,13 +166,13 @@ int qc_fasta_files(Input_Para &_input_data, Output_FA &py_output_fa) py_output_fa.long_read_info.gc_cnt = g_c / a_tu_g_c; int percent = 1; - int64_t num_bases_sum = 0; + uint64_t num_bases_sum = 0; int64_t num_reads_sum = 0; py_output_fa.long_read_info.median_read_length = -1; for (int read_len = py_output_fa.long_read_info.read_length_count.size() - 1; read_len > 0; read_len--) { num_reads_sum += py_output_fa.long_read_info.read_length_count[read_len]; - num_bases_sum += py_output_fa.long_read_info.read_length_count[read_len] * read_len; + num_bases_sum += (uint64_t) (py_output_fa.long_read_info.read_length_count[read_len] * read_len); if (num_reads_sum * 2 > py_output_fa.long_read_info.total_num_reads && py_output_fa.long_read_info.median_read_length < 0) { py_output_fa.long_read_info.median_read_length = read_len; @@ -194,17 +194,17 @@ int qc_fasta_files(Input_Para &_input_data, Output_FA &py_output_fa) py_output_fa.long_read_info.mean_read_length = (double)py_output_fa.long_read_info.total_num_bases / (double)py_output_fa.long_read_info.total_num_reads; read_summary_fp = fopen(read_summary_file.c_str(), "w"); - fprintf(read_summary_fp, "total number of reads\t%ld\n", py_output_fa.long_read_info.total_num_reads); + fprintf(read_summary_fp, "total number of reads\t%d\n", py_output_fa.long_read_info.total_num_reads); fprintf(read_summary_fp, "total number of bases\t%ld\n", py_output_fa.long_read_info.total_num_bases); - fprintf(read_summary_fp, "longest read length\t%lu\n", py_output_fa.long_read_info.longest_read_length); - fprintf(read_summary_fp, "N50 read length\t%ld\n", py_output_fa.long_read_info.n50_read_length); + fprintf(read_summary_fp, "longest read length\t%d\n", py_output_fa.long_read_info.longest_read_length); + fprintf(read_summary_fp, "N50 read length\t%d\n", py_output_fa.long_read_info.n50_read_length); fprintf(read_summary_fp, "mean read length\t%.2f\n", py_output_fa.long_read_info.mean_read_length); - fprintf(read_summary_fp, "median read length\t%ld\n", py_output_fa.long_read_info.median_read_length); + fprintf(read_summary_fp, "median read length\t%d\n", py_output_fa.long_read_info.median_read_length); fprintf(read_summary_fp, "GC%%\t%.2f\n", py_output_fa.long_read_info.gc_cnt * 100); fprintf(read_summary_fp, "\n\n"); for (int percent = 5; percent < 100; percent += 5) { - fprintf(read_summary_fp, "N%02d read length\t%.ld\n", percent, py_output_fa.long_read_info.NXX_read_length[percent]); + fprintf(read_summary_fp, "N%02d read length\t%.d\n", percent, py_output_fa.long_read_info.NXX_read_length[percent]); } fprintf(read_summary_fp, "\n\n"); @@ -212,7 +212,7 @@ int qc_fasta_files(Input_Para &_input_data, Output_FA &py_output_fa) fprintf(read_summary_fp, "GC content\tnumber of reads\n"); for (int gc_ratio = 0; gc_ratio <= 100; gc_ratio++) { - fprintf(read_summary_fp, "GC=%d%%\t%ld\n", gc_ratio, py_output_fa.long_read_info.read_gc_content_count[gc_ratio]); + fprintf(read_summary_fp, "GC=%d%%\t%d\n", gc_ratio, py_output_fa.long_read_info.read_gc_content_count[gc_ratio]); } fclose(read_summary_fp); } diff --git a/src/fasta_plot.py b/src/fasta_plot.py new file mode 100644 index 0000000..0f506ed --- /dev/null +++ b/src/fasta_plot.py @@ -0,0 +1,55 @@ +""" +plot_for_FA.py: +Use the formatted statistics from our C++ module output text files to generate summary plots in image format. +""" + +if __package__ == 'src': + from src.plot_utils import * +else: + from plot_utils import * + + +def create_summary_table(fa_output, plot_filepaths): + plot_filepaths["basic_st"] = {} + plot_filepaths["basic_st"]['file'] = "" + plot_filepaths["basic_st"]['title'] = "Basic statistics" + plot_filepaths["basic_st"]['description'] = "FASTA: Basic statistics" + + table_str = "\n\n\n" + table_str += "\n" + int_str_for_format = "" + double_str_for_format = "" + table_str += int_str_for_format.format("#Total Reads", + fa_output.long_read_info.total_num_reads) + table_str += int_str_for_format.format("#Total Bases", fa_output.long_read_info.total_num_bases) + table_str += int_str_for_format.format("Longest Read Length", fa_output.long_read_info.longest_read_length) + table_str += int_str_for_format.format("N50", fa_output.long_read_info.n50_read_length) + table_str += double_str_for_format.format("GC Content(%)", fa_output.long_read_info.gc_cnt * 100) + table_str += double_str_for_format.format("Mean Read Length", fa_output.long_read_info.mean_read_length) + table_str += int_str_for_format.format("Median Read Length", fa_output.long_read_info.median_read_length) + table_str += "\n\n
MeasurementStatistics
{}{:,d}
{}{:.1f}
" + + plot_filepaths["basic_st"]['detail'] = table_str + + +def plot(fa_output, para_dict): + out_path = para_dict["output_folder"] + plot_filepaths = getDefaultPlotFilenames() + get_image_path = lambda x: os.path.join(out_path, plot_filepaths[x]['file']) + + # Set the default matplotlib font size + setDefaultFontSize(12) + + # Get the font size for plotly plots + font_size = para_dict["fontsize"] + + # Generate plots + create_summary_table(fa_output, plot_filepaths) + + # Save plot images using statistics generated from the C++ module + plot_read_length_stats([fa_output.long_read_info], get_image_path('read_length_st'), subtitles=['Long Reads']) + plot_base_counts([fa_output.long_read_info], get_image_path('base_st'), subtitles=['Long Reads']) + plot_basic_info([fa_output.long_read_info], get_image_path('basic_info'), categories=['Long Reads']) + histogram(fa_output.long_read_info, get_image_path('read_length_hist'), font_size) + + return plot_filepaths diff --git a/src/FASTQ_module.cpp b/src/fastq_module.cpp similarity index 91% rename from src/FASTQ_module.cpp rename to src/fastq_module.cpp index 73cf398..4e976e7 100644 --- a/src/FASTQ_module.cpp +++ b/src/fastq_module.cpp @@ -12,7 +12,7 @@ // https://github.com/lh3/seqtk #include "kseq.h" #include "kseq.h" -#include "FASTQ_module.h" +#include "fastq_module.h" KSEQ_INIT(gzFile, gzread) // this is a macro defined in kseq.h @@ -44,7 +44,7 @@ static int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_ read_name = seq->name.s; read_seq = seq->seq.s; raw_read_qual = seq->qual.s; - if ((uint64_t)read_len > long_read_info.longest_read_length) + if (read_len > long_read_info.longest_read_length) { long_read_info.longest_read_length = read_len; } @@ -248,14 +248,14 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data) // Sort the read lengths in descending order std::vector read_lengths = output_data.long_read_info.read_lengths; - std::sort(read_lengths.begin(), read_lengths.end(), std::greater()); + std::sort(read_lengths.begin(), read_lengths.end(), std::greater()); // Get the max read length - int max_read_length = read_lengths.at(0); + int64_t max_read_length = read_lengths.at(0); output_data.long_read_info.longest_read_length = max_read_length; // Get the median read length - int median_read_length = read_lengths[read_lengths.size() / 2]; + int64_t median_read_length = read_lengths[read_lengths.size() / 2]; output_data.long_read_info.median_read_length = median_read_length; // Get the mean read length @@ -285,17 +285,17 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data) output_data.long_read_info.n05_read_length = output_data.long_read_info.NXX_read_length[5]; read_summary_fp = fopen(read_summary_file.c_str(), "w"); - fprintf(read_summary_fp, "total number of reads\t%ld\n", output_data.long_read_info.total_num_reads); + fprintf(read_summary_fp, "total number of reads\t%d\n", output_data.long_read_info.total_num_reads); fprintf(read_summary_fp, "total number of bases\t%ld\n", output_data.long_read_info.total_num_bases); - fprintf(read_summary_fp, "longest read length\t%lu\n", output_data.long_read_info.longest_read_length); - fprintf(read_summary_fp, "N50 read length\t%ld\n", output_data.long_read_info.n50_read_length); + fprintf(read_summary_fp, "longest read length\t%d\n", output_data.long_read_info.longest_read_length); + fprintf(read_summary_fp, "N50 read length\t%d\n", output_data.long_read_info.n50_read_length); fprintf(read_summary_fp, "mean read length\t%.2f\n", output_data.long_read_info.mean_read_length); - fprintf(read_summary_fp, "median read length\t%ld\n", output_data.long_read_info.median_read_length); + fprintf(read_summary_fp, "median read length\t%d\n", output_data.long_read_info.median_read_length); fprintf(read_summary_fp, "GC%%\t%.2f\n", output_data.long_read_info.gc_cnt * 100); fprintf(read_summary_fp, "\n\n"); for (int percent = 5; percent < 100; percent += 5) { - fprintf(read_summary_fp, "N%02d read length\t%.ld\n", percent, output_data.long_read_info.NXX_read_length[percent]); + fprintf(read_summary_fp, "N%02d read length\t%.d\n", percent, output_data.long_read_info.NXX_read_length[percent]); } fprintf(read_summary_fp, "\n\n"); @@ -303,22 +303,21 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data) fprintf(read_summary_fp, "GC content\tnumber of reads\n"); for (int gc_ratio = 0; gc_ratio < 100; gc_ratio++) { - fprintf(read_summary_fp, "GC=%d%%\t%ld\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio]); + fprintf(read_summary_fp, "GC=%d%%\t%d\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio]); } - fprintf(read_summary_fp, "\n\n"); fprintf(read_summary_fp, "base quality\tnumber of bases\n"); for (int baseq = 0; baseq <= 60; baseq++) { - fprintf(read_summary_fp, "%d\t%ld\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq]); + fprintf(read_summary_fp, "%d\t%d\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq]); } fprintf(read_summary_fp, "\n\n"); fprintf(read_summary_fp, "read average base quality\tnumber of reads\n"); for (int baseq = 0; baseq <= 60; baseq++) { - fprintf(read_summary_fp, "%d\t%ld\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq]); + fprintf(read_summary_fp, "%d\t%d\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq]); } fclose(read_summary_fp); } diff --git a/src/generate_html.py b/src/generate_html.py index e48d298..4453aee 100644 --- a/src/generate_html.py +++ b/src/generate_html.py @@ -4,23 +4,20 @@ import base64 -if __package__ == 'src': - from src import lrst_global # Contains our image filepaths -else: - import lrst_global # Contains our image filepaths - class ST_HTML_Generator: - def __init__(self, para_list, static=True): + def __init__(self, para_list, plot_filepaths, static=True): self.image_key_list = para_list[0] # List of statistics variables to look for self.header_info = para_list[1] # The webpage's title self.input_para = para_list[2] # List of the input parameters used for these statistics self.static = static # Static vs. dynamic webpage boolean + self.plot_filepaths = plot_filepaths + self.prg_name = self.input_para["prg_name"] # Program name if len(self.input_para["input_files"]) > 1: - self.more_input_files = True; + self.more_input_files = True else: - self.more_input_files = False; + self.more_input_files = False def generate_header(self): if self.static: @@ -231,7 +228,7 @@ def generate_header(self):
{} Report
- '''.format(lrst_global.prg_name)) + '''.format(self.prg_name)) # for _af in self.input_para["input_files"]: # self.html_writer.write( "
"+_af); # self.html_writer.write( "
"+ self.input_para["input_files"][0] ) @@ -247,13 +244,14 @@ def generate_left(self): _imki = 0 for _imk in self.image_key_list: self.html_writer.write('
  • ') + self.html_writer.write( - '' + lrst_global.plot_filenames[_imk]['title'] + '') + '' + self.plot_filepaths[_imk]['title'] + '') _imki += 1; self.html_writer.write('
  • ') self.html_writer.write('
  • ') - self.html_writer.write('List of input files') + self.html_writer.write('Input File List') _imki += 1; self.html_writer.write('
  • ') @@ -266,21 +264,21 @@ def generate_right(self): for _imk in self.image_key_list: self.html_writer.write('
    ') self.html_writer.write( - '

    ' + lrst_global.plot_filenames[_imk]['description'] + '

    ') + '

    ' + self.plot_filepaths[_imk]['description'] + '

    ') # self.html_writer.write(''+lrst_global.plot_filenames[_imk]['description']+'

    ') - if 'dynamic' in lrst_global.plot_filenames[_imk] and self.static == False: - self.html_writer.write(lrst_global.plot_filenames[_imk]['dynamic']) + if 'dynamic' in self.plot_filepaths[_imk] and self.static == False: + self.html_writer.write(self.plot_filepaths[_imk]['dynamic']) else: if _imk == "basic_st": - self.html_writer.write(lrst_global.plot_filenames["basic_st"]['detail']) + self.html_writer.write(self.plot_filepaths["basic_st"]['detail']) else: m_image_file = open( - self.input_para["output_folder"] + '/' + lrst_global.plot_filenames[_imk]['file'], 'rb'); + self.input_para["output_folder"] + '/' + self.plot_filepaths[_imk]['file'], 'rb'); self.html_writer.write('' + lrst_global.plot_filenames[_imk][
+                        m_image_file.read()).decode('utf-8') + '

    ') m_image_file.close() @@ -309,7 +307,7 @@ def generate_left_signal_data(self, read_names): url_index = 0 self.html_writer.write('
  • ') self.html_writer.write( - '' + lrst_global.plot_filenames["basic_st"]['title'] + '') + '' + self.plot_filepaths["basic_st"]['title'] + '') url_index += 1 # Add the read plot section list @@ -336,8 +334,8 @@ def generate_right_signal_data(self, read_names, signal_plot): url_index = 0 self.html_writer.write('
    ') self.html_writer.write( - '

    ' + lrst_global.plot_filenames["basic_st"]['description'] + '

    ') - self.html_writer.write(lrst_global.plot_filenames["basic_st"]['detail']) + '

    ' + self.plot_filepaths["basic_st"]['description'] + '

    ') + self.html_writer.write(self.plot_filepaths["basic_st"]['detail']) url_index += 1 # Add the read plot section @@ -367,7 +365,7 @@ def generate_right_signal_data(self, read_names, signal_plot): def generate_end(self): self.html_writer.write( '

    '.format( - lrst_global.prg_name, lrst_global.prg_name)) + self.prg_name, self.prg_name)) self.html_writer.write("") self.html_writer.write("") diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp new file mode 100644 index 0000000..b9264be --- /dev/null +++ b/src/hts_reader.cpp @@ -0,0 +1,225 @@ +/* + +BamReader.cpp: +Class for reading a set number of records from a BAM file. Used for multi-threading. + +*/ + + +#include +#include +#include +#include +#include + +#include "hts_reader.h" +#include "ComFunction.h" + +// HTSReader constructor +HTSReader::HTSReader(const std::string & bam_file_name){ + this->bam_file = hts_open(bam_file_name.c_str(), "r"); + this->header = sam_hdr_read(this->bam_file); +} + +// HTSReader destructor +HTSReader::~HTSReader(){ + bam_hdr_destroy(this->header); + hts_close(this->bam_file); +} + +// Update read and base counts +int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics *basic_qc, std::vector & base_quality_distribution){ + int exit_code = 0; + + // Update the total number of reads + basic_qc->total_num_reads++; + + // Update read length statistics + int read_length = (int) record->core.l_qseq; + basic_qc->total_num_bases += (uint64_t) read_length; // Update the total number of bases + basic_qc->read_lengths.push_back(read_length); + + // Loop and count the number of each base + uint8_t *seq = bam_get_seq(record); + for (int i = 0; i < read_length; i++) { + // Get the base quality and update the base quality histogram + int base_quality = (int)bam_get_qual(record)[i]; + base_quality_distribution[base_quality]++; + + // Get the base and update the base count + char base = seq_nt16_str[bam_seqi(seq, i)]; + switch (base) { + case 'A': + basic_qc->total_a_cnt++; + break; + case 'C': + basic_qc->total_c_cnt++; + break; + case 'G': + basic_qc->total_g_cnt++; + break; + case 'T': + basic_qc->total_tu_cnt++; + break; + case 'N': + basic_qc->total_n_cnt++; + std::cerr << "Warning: N base found in read " << bam_get_qname(record) << std::endl; + break; + default: + std::cerr << "Error reading nucleotide: " << base << std::endl; + exit_code = 1; + break; + } + } + + return exit_code; +} + +// Read the next batch of records from the BAM file and store QC in the output_data object +int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex){ + int record_count = 0; + int exit_code = 0; + + // Set up the base quality histogram + std::vector base_quality_distribution(256, 0); + + // Loop through each alignment record in the BAM file + // Do QC on each record and store the results in the output_data object + while ((record_count < batch_size) && (exit_code >= 0)) { + // Create a record object + bam1_t* record = bam_init1(); + + // read the next record in a thread-safe manner + read_mutex.lock(); + exit_code = sam_read1(this->bam_file, this->header, record); + read_mutex.unlock(); + + if (exit_code < 0) { + this->reading_complete = true; + bam_destroy1(record); + break; // error or EOF + } + + // Determine if this is an unmapped read + if (record->core.flag & BAM_FUNMAP) { + Basic_Seq_Statistics *basic_qc = &output_data.unmapped_long_read_info; + + // Update read and base QC + this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution); + + } else { + // Set up the basic QC object + Basic_Seq_Statistics *basic_qc = &output_data.mapped_long_read_info; + + // Calculate base alignment statistics on non-secondary alignments + if (!(record->core.flag & BAM_FSECONDARY)) { + + // Update the number of mismatches + uint8_t *nmTag = bam_aux_get(record, "NM"); + output_data.num_mismatched_bases += bam_aux2i(nmTag); + + // Loop through the cigar string and count the number of insertions, deletions, and matches + uint32_t *cigar = bam_get_cigar(record); + for (uint32_t i = 0; i < record->core.n_cigar; i++) { + int cigar_op = bam_cigar_op(cigar[i]); + uint64_t cigar_len = (uint64_t)bam_cigar_oplen(cigar[i]); + switch (cigar_op) { + case BAM_CMATCH: + output_data.num_matched_bases += cigar_len; + break; + case BAM_CINS: + output_data.num_ins_bases += cigar_len; + break; + case BAM_CDEL: + output_data.num_del_bases += cigar_len; + break; + case BAM_CSOFT_CLIP: + output_data.num_clip_bases += cigar_len; + break; + case BAM_CHARD_CLIP: + output_data.num_clip_bases += cigar_len; + break; + default: + break; + } + } + } + + // Determine if this is a secondary alignment (not included in QC, only read count) + if (record->core.flag & BAM_FSECONDARY) { + output_data.num_secondary_alignment++; + + // Get the alignment's query name (the read name) + std::string query_name = bam_get_qname(record); + + // Update the read's secondary alignments (count once per read) + output_data.reads_with_secondary[query_name] = true; + + // Determine if this is a supplementary alignment (not included in QC, only read count) + } else if (record->core.flag & BAM_FSUPPLEMENTARY) { + output_data.num_supplementary_alignment++; + + // Get the alignment's query name (the read name) + std::string query_name = bam_get_qname(record); + + // Update the read's supplementary alignments (count once per read) + output_data.reads_with_supplementary[query_name] = true; + + // Determine if this is a primary alignment + } else if (!(record->core.flag & BAM_FSECONDARY || record->core.flag & BAM_FSUPPLEMENTARY)) { + // Determine if this is a forward or reverse read + if (record->core.flag & BAM_FREVERSE) { + output_data.forward_alignment++; + } else { + output_data.reverse_alignment++; + } + output_data.num_primary_alignment++; // Update the number of primary alignments + + // Update read and base QC + this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution); + + // Calculate the percent GC content + int percent_gc = round((basic_qc->total_g_cnt + basic_qc->total_c_cnt) / (double) (basic_qc->total_a_cnt + basic_qc->total_c_cnt + basic_qc->total_g_cnt + basic_qc->total_tu_cnt) * 100); + + // Update the GC content histogram + basic_qc->read_gc_content_count.push_back(percent_gc); + } else { + std::cerr << "Error: Unknown alignment type" << std::endl; + std::cerr << "Flag: " << record->core.flag << std::endl; + } + } + + // Delete the record object + bam_destroy1(record); + + record_count++; + } + + // Update the base quality histogram + output_data.seq_quality_info.base_quality_distribution = base_quality_distribution; + + return exit_code; +} + +// Return if the file has any more records +bool HTSReader::hasNextRecord(){ + return !this->reading_complete; +} + +// Return the number of records in the BAM file using the BAM index +int64_t HTSReader::getNumRecords(const std::string & bam_filename){ + samFile* bam_file = sam_open(bam_filename.c_str(), "r"); + bam_hdr_t* bam_header = sam_hdr_read(bam_file); + bam1_t* bam_record = bam_init1(); + + int64_t num_reads = 0; + while (sam_read1(bam_file, bam_header, bam_record) >= 0) { + num_reads++; + } + + bam_destroy1(bam_record); + bam_hdr_destroy(bam_header); + sam_close(bam_file); + + return num_reads; +} diff --git a/src/InputStructure.cpp b/src/input_parameters.cpp similarity index 95% rename from src/InputStructure.cpp rename to src/input_parameters.cpp index 8f0c005..3a0d1c1 100644 --- a/src/InputStructure.cpp +++ b/src/input_parameters.cpp @@ -1,6 +1,6 @@ #include -#include "InputStructure.h" +#include "input_parameters.h" Input_Para::Input_Para(){ // Set default parameters diff --git a/src/lrst.i b/src/lrst.i index 4bc01ec..06649ff 100644 --- a/src/lrst.i +++ b/src/lrst.i @@ -9,25 +9,24 @@ lrst.i: SWIG module defining the Python wrapper for our C++ modules %{ #include #include -#include "InputStructure.h" -#include "OutputStructures.h" -#include "ModuleCaller.h" +#include "input_parameters.h" +#include "output_data.h" +#include "module_caller.h" %} -typedef long int int64_t; -%apply long int { int64_t }; +%apply long long { int64_t }; // Maps int64_t to long long in Python +%apply unsigned long long { uint64_t }; // Maps uint64_t to unsigned long long in Python +%apply long { long int }; // Maps long int to long in Python + %include %include %include -namespace std{ - %template(IntVector) vector; - %template(DoubleVector) vector; - %template(Int64Vector) vector; - %template(Int2DVector) vector>; -}; +%template(IntVector) std::vector; +%template(DoubleVector) std::vector; +%template(Int2DVector) std::vector>; // These are the header functions wrapped by our lrst module (Like an 'import') -%include "InputStructure.h" // Contains InputPara for passing parameters to C++ -%include "OutputStructures.h" // Contains data structures for storing statistics for each file type -%include "ModuleCaller.h" // Functions for calling the C++ statistics computation modules +%include "input_parameters.h" // Contains InputPara for passing parameters to C++ +%include "output_data.h" // Contains data structures for storing statistics for each file type +%include "module_caller.h" // Functions for calling the C++ statistics computation modules diff --git a/src/lrst_global.py b/src/lrst_global.py deleted file mode 100644 index 1990077..0000000 --- a/src/lrst_global.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -lrst_global.py: -Format image filenames based on the provided output folder parameter. -Set the log error code variables. -""" - - -prg_name = "LongReadSum" -LOG_ALL = 0; -LOG_DEBUG = 1; -LOG_INFO = 2; -LOG_WARN = 3; -LOG_ERROR = 4; -LOG_FATAL = 5; -LOG_OFF = 6; - -default_image_path = 'img/' -default_image_suf = '.png' - -plot_filenames= {\ -# for fq/fa -"read_length_distr": {'file':default_image_path+"read_length_distr"+default_image_suf, 'title':"Read Length", 'description':"Read Length Distribution"},\ -# for bam -"map_st": {'file':default_image_path+"map_st"+default_image_suf, 'title':"Map Information", 'description':"Read Mapping Statistics"},\ - -"err_st": {'file':default_image_path+"err_st"+default_image_suf, 'title':"Base Alignment and Error Statistics", 'description':"Base Alignment and Error Statistics"},\ - -"read_length_st": {'file':default_image_path+"read_length_st"+default_image_suf, 'title':"Read Length Statistics", 'description':"Read Length Statistics"},\ - -"base_st": {'file':default_image_path+"base_st"+default_image_suf, 'title':"Base Count Statistics", 'description':"Base Count Statistics", 'summary':""},\ - -"basic_info": {'file':default_image_path+"basic_info"+default_image_suf, 'title':"Basic Statistics", 'description':"Basic Statistics", 'summary':""},\ - -"read_length_hist":{'file':default_image_path+"read_length_hist"+default_image_suf, 'title':"Read Length Histogram", 'description':"Read Length Histogram", 'summary':""}, - -"base_quality":{'file':default_image_path+"base_quality"+default_image_suf, 'title':"Base Quality Histogram", 'description':"Base Quality Histogram"}, - -"read_avg_base_quality":{'file':default_image_path+"read_avg_base_quality"+default_image_suf, 'title': "Read Base Quality Histogram", 'description': "Read Base Quality Histogram"}, - -"pos_quality":{'file':default_image_path+"pos_quality"+default_image_suf, 'title':"Base Position Quality", 'description':"Base Position Quality"}, -} diff --git a/src/ModuleCaller.cpp b/src/module_caller.cpp similarity index 74% rename from src/ModuleCaller.cpp rename to src/module_caller.cpp index b7b5d1a..2a34836 100644 --- a/src/ModuleCaller.cpp +++ b/src/module_caller.cpp @@ -1,15 +1,16 @@ #include -#include "ModuleCaller.h" -#include "BAM_module.h" -#include "SeqTxt_module.h" -#include "FASTQ_module.h" -#include "FASTA_module.h" -#include "FAST5_module.h" +#include "module_caller.h" +#include "bam_module.h" +#include "seqtxt_module.h" +#include "fastq_module.h" +#include "fasta_module.h" +#include "fast5_module.h" int callBAMModule(Input_Para &_input_data, Output_BAM &py_output_bam) { - BAM_Module _bam_module(_input_data); - int exit_code = _bam_module.calculateStatistics(py_output_bam); +// BAM_Module _bam_module(_input_data); + BAM_Module module; + int exit_code = module.calculateStatistics(_input_data, py_output_bam); return exit_code; } diff --git a/src/OutputStructures.cpp b/src/output_data.cpp similarity index 63% rename from src/OutputStructures.cpp rename to src/output_data.cpp index 3265b0d..69cfd35 100644 --- a/src/OutputStructures.cpp +++ b/src/output_data.cpp @@ -3,8 +3,8 @@ #include // sqrt #include -#include "OutputStructures.h" -#include "BasicStatistics.h" +#include "output_data.h" +#include "basic_statistics.h" // Base class for storing error output. Output_Info::Output_Info(){ @@ -12,17 +12,12 @@ Output_Info::Output_Info(){ error_str = ""; } - // Base class for storing basic QC data Basic_Seq_Statistics::Basic_Seq_Statistics(){ read_length_count.resize(MAX_READ_LENGTH); for(int _i_=0; _i_longest_read_length < basic_qc.longest_read_length){ + this->longest_read_length = basic_qc.longest_read_length; } - total_num_reads += t_seq_st.total_num_reads ; - total_num_bases += t_seq_st.total_num_bases ; - if ( longest_read_length < t_seq_st.longest_read_length){ - longest_read_length = t_seq_st.longest_read_length; - } + // Update base counts + this->total_a_cnt += basic_qc.total_a_cnt; + this->total_c_cnt += basic_qc.total_c_cnt; + this->total_g_cnt += basic_qc.total_g_cnt; + this->total_tu_cnt += basic_qc.total_tu_cnt; + this->total_n_cnt += basic_qc.total_n_cnt; - total_a_cnt += t_seq_st.total_a_cnt ; - total_c_cnt += t_seq_st.total_c_cnt ; - total_g_cnt += t_seq_st.total_g_cnt ; - total_tu_cnt += t_seq_st.total_tu_cnt ; - total_n_cnt += t_seq_st.total_n_cnt ; + // Update total number of bases + this->total_num_bases += basic_qc.total_num_bases; - // Add read lengths - this->read_lengths.insert(this->read_lengths.end(), t_seq_st.read_lengths.begin(), t_seq_st.read_lengths.end()); + // Update read counts + this->total_num_reads += basic_qc.total_num_reads; + + // Add read lengths + this->read_lengths.insert(this->read_lengths.end(), basic_qc.read_lengths.begin(), basic_qc.read_lengths.end()); + + // Add GC content + this->read_gc_content_count.insert(this->read_gc_content_count.end(), basic_qc.read_gc_content_count.begin(), basic_qc.read_gc_content_count.end()); } // Calculates NXX scores and GC content for BAM files @@ -131,62 +121,74 @@ void Basic_Seq_Statistics::global_sum(){ } else { // Add the G + C bases - double g_c = this->total_g_cnt + this->total_c_cnt; + uint64_t gc_total = this->total_g_cnt + this->total_c_cnt; - // Add all bases - double a_tu_g_c = g_c + this->total_a_cnt + this->total_tu_cnt; + // Get total base counts + uint64_t base_total = this->total_a_cnt + this->total_c_cnt + this->total_g_cnt + this->total_tu_cnt + this->total_n_cnt; // Check that our total base counts match what was stored (That our code works) - int _total_num_bases = this->total_num_bases; - if (a_tu_g_c != (double)_total_num_bases) + uint64_t base_total_from_reads = this->total_num_bases; + if (base_total != base_total_from_reads) { std::cerr << "Total number of bases is not consistent." << std::endl; - std::cout << _total_num_bases << std::endl; - std::cout << a_tu_g_c << std::endl; + std::cout << "From reads: " << base_total_from_reads << std::endl; + std::cout << "From bases: " << base_total << std::endl; } else { // Calculate GC-content - this->gc_cnt = g_c / a_tu_g_c; + double percent_gc = (double) gc_total / base_total; + this->gc_cnt = percent_gc; - // Sort the read lengths in descending order - std::vector _read_lengths = this->read_lengths; - std::sort(_read_lengths.begin(), _read_lengths.end(), std::greater()); + // Calculate N50 and other N-scores + this->calculate_NXX_scores(); + } + } +} - // Get the max read length - int max_read_length = _read_lengths.at(0); - this->longest_read_length = max_read_length; +// Calculates NXX scores and other basic read length statistics +void Basic_Seq_Statistics::calculate_NXX_scores(){ - // Get the median read length - double _median_read_length = _read_lengths[_read_lengths.size() / 2]; - this->median_read_length = _median_read_length; + // Sort the read lengths in descending order + std::sort(this->read_lengths.begin(), this->read_lengths.end(), std::greater()); - // Get the mean read length - float _mean_read_length = (double)_total_num_bases / (double)_read_lengths.size(); - this->mean_read_length = _mean_read_length; + // Get total base counts + uint64_t base_total = this->total_num_bases; - // Calculate N50 and other N-scores - this->NXX_read_length.resize(101, 0); - for (int percent_value = 1; percent_value <= 100; percent_value++) - { - // Get the base percentage threshold for this N-score - double base_threshold = (double)_total_num_bases * (percent_value / 100.0); - - // Calculate the NXX score - double current_base_count = 0; - int current_read_index = -1; - while (current_base_count < base_threshold) { - current_read_index ++; - current_base_count += _read_lengths.at(current_read_index); - } - int nxx_read_length = _read_lengths.at(current_read_index); - this->NXX_read_length[percent_value] = nxx_read_length; - } - - // Set common score variables - this->n50_read_length = this->NXX_read_length[50]; - this->n95_read_length = this->NXX_read_length[95]; - this->n05_read_length = this->NXX_read_length[5]; + // Get the max read length + int64_t max_length = this->read_lengths.at(0); + this->longest_read_length = max_length; + + // Get the median read length + double median_length = this->read_lengths[this->read_lengths.size() / 2]; + this->median_read_length = median_length; + + // Get the mean read length + float mean_length = (double)base_total / (double)this->read_lengths.size(); + this->mean_read_length = mean_length; + + // Initialize the NXX scores + this->NXX_read_length.resize(100, 0); + for (int percent_value = 1; percent_value <= 100; percent_value++) + { + // Get the base percentage threshold for this N-score + double base_threshold = (double) (base_total * (double) (percent_value / 100.0)); + + + // Calculate the NXX score + double current_base_count = 0; + int current_read_index = 0; + while (current_base_count < base_threshold) { + current_base_count += this->read_lengths.at(current_read_index); + current_read_index++; } + int nxx_read_index = current_read_index-1; + int nxx_read_length = this->read_lengths.at(nxx_read_index); + this->NXX_read_length[percent_value] = nxx_read_length; } + + // Set common score variables + this->n50_read_length = this->NXX_read_length[50]; + this->n95_read_length = this->NXX_read_length[95]; + this->n05_read_length = this->NXX_read_length[5]; } // Calculates NXX scores for sequencing_summary.txt files @@ -200,47 +202,8 @@ void Basic_Seq_Statistics::global_sum_no_gc(){ this->n05_read_length = 0; } else { - // Check that our total base counts match what was stored (That our code works) - int _total_num_bases = this->total_num_bases; - - // Sort the read lengths in descending order - std::vector _read_lengths = this->read_lengths; - std::sort(_read_lengths.begin(), _read_lengths.end(), std::greater()); - - // Get the max read length - int max_read_length = _read_lengths.at(0); - this->longest_read_length = max_read_length; - - // Get the median read length - double _median_read_length = _read_lengths[_read_lengths.size() / 2]; - this->median_read_length = _median_read_length; - - // Get the mean read length - float _mean_read_length = (double)_total_num_bases / (double)_read_lengths.size(); - this->mean_read_length = _mean_read_length; - // Calculate N50 and other N-scores - this->NXX_read_length.resize(101, 0); - for (int percent_value = 1; percent_value <= 100; percent_value++) - { - // Get the base percentage threshold for this N-score - double base_threshold = (double)_total_num_bases * (percent_value / 100.0); - - // Calculate the NXX score - double current_base_count = 0; - int current_read_index = -1; - while (current_base_count < base_threshold) { - current_read_index ++; - current_base_count += _read_lengths.at(current_read_index); - } - int nxx_read_length = _read_lengths.at(current_read_index); - this->NXX_read_length[percent_value] = nxx_read_length; - } - - // Set common score variables - this->n50_read_length = this->NXX_read_length[50]; - this->n95_read_length = this->NXX_read_length[95]; - this->n05_read_length = this->NXX_read_length[5]; + this->calculate_NXX_scores(); } } @@ -370,84 +333,38 @@ Output_BAM::Output_BAM(){ Output_BAM::~Output_BAM(){ } -void Output_BAM::reset(){ - for(int _i_=0; _i_reads_with_supplementary.insert( output_data.reads_with_supplementary.begin(), output_data.reads_with_supplementary.end() ); + this->num_reads_with_supplementary_alignment = this->reads_with_supplementary.size(); - num_primary_alignment = ZeroDefault; - num_secondary_alignment = ZeroDefault; - num_reads_with_secondary_alignment = ZeroDefault; - num_supplementary_alignment = ZeroDefault; - num_reads_with_supplementary_alignment = ZeroDefault; - num_reads_with_both_secondary_supplementary_alignment = ZeroDefault; + // Update the secondary alignment information + this->reads_with_secondary.insert( output_data.reads_with_secondary.begin(), output_data.reads_with_secondary.end() ); + this->num_reads_with_secondary_alignment = this->reads_with_secondary.size(); - forward_alignment = ZeroDefault; - reverse_alignment = ZeroDefault; - - min_map_quality = MoneDefault; - max_map_quality = MoneDefault; - - num_matched_bases = ZeroDefault; - num_mismatched_bases = ZeroDefault; - num_ins_bases = ZeroDefault; - num_del_bases = ZeroDefault; - num_clip_bases = ZeroDefault; - - mapped_long_read_info.reset(); - unmapped_long_read_info.reset(); - mapped_seq_quality_info.reset(); - unmapped_seq_quality_info.reset(); - - long_read_info.reset(); - seq_quality_info.reset(); -} + forward_alignment += output_data.forward_alignment; + reverse_alignment += output_data.reverse_alignment; -void Output_BAM::add(Output_BAM& t_output_bam){ - for(int _i_=0; _i_seq_quality_info.base_quality_distribution[i] += output_data.seq_quality_info.base_quality_distribution[i]; } - for(int _i_=0; _i_ t_output_bam.min_map_quality){ - min_map_quality = t_output_bam.min_map_quality; - } - if ( max_map_quality < t_output_bam.max_map_quality ){ - max_map_quality = t_output_bam.max_map_quality; - } - - num_matched_bases += t_output_bam.num_matched_bases; - num_mismatched_bases += t_output_bam.num_mismatched_bases; - num_ins_bases += t_output_bam.num_ins_bases; - num_del_bases += t_output_bam.num_del_bases; - num_clip_bases += t_output_bam.num_clip_bases; - mapped_long_read_info.add(t_output_bam.mapped_long_read_info); - unmapped_long_read_info.add(t_output_bam.unmapped_long_read_info); - mapped_seq_quality_info.add(t_output_bam.mapped_seq_quality_info); - unmapped_seq_quality_info.add(t_output_bam.unmapped_seq_quality_info); + num_matched_bases += output_data.num_matched_bases; + num_mismatched_bases += output_data.num_mismatched_bases; + num_ins_bases += output_data.num_ins_bases; + num_del_bases += output_data.num_del_bases; + num_clip_bases += output_data.num_clip_bases; - long_read_info.add(t_output_bam.mapped_long_read_info); - long_read_info.add(t_output_bam.unmapped_long_read_info); - seq_quality_info.add(t_output_bam.mapped_seq_quality_info); - seq_quality_info.add(t_output_bam.unmapped_seq_quality_info); + mapped_long_read_info.add(output_data.mapped_long_read_info); + unmapped_long_read_info.add(output_data.unmapped_long_read_info); - // Add read lengths + long_read_info.add(output_data.mapped_long_read_info); + long_read_info.add(output_data.unmapped_long_read_info); } void Output_BAM::global_sum(){ @@ -459,10 +376,67 @@ void Output_BAM::global_sum(){ long_read_info.global_sum(); seq_quality_info.global_sum(); + // Loop through each read and check if it is in both the secondary and supplementary sets + for ( auto it = this->reads_with_secondary.begin(); it != this->reads_with_secondary.end(); ++it ){ + std::string read_id = it->first; + if ( this->reads_with_supplementary.find( read_id ) != this->reads_with_supplementary.end() ){ + this->num_reads_with_both_secondary_supplementary_alignment++; + } + } + if ( min_map_quality==MoneDefault){ min_map_quality=ZeroDefault; } if ( max_map_quality==MoneDefault){ max_map_quality=ZeroDefault; } } +// Save the output to a file +void Output_BAM::save_summary(std::string &output_file, Input_Para ¶ms, Output_BAM &output_data){ + FILE *fp = fopen(output_file.c_str(), "w"); + if (fp == NULL){ + fprintf(stderr, "Error: cannot open file %s\n", output_file.c_str()); + } else { + // Save basic statistics + fprintf(fp, "Total number of reads\t%d\n", output_data.long_read_info.total_num_reads); + fprintf(fp, "Total number of bases\t%ld\n", output_data.long_read_info.total_num_bases); + fprintf(fp, "Longest read length\t%d\n", output_data.long_read_info.longest_read_length); + fprintf(fp, "N50 read length\t%d\n", output_data.long_read_info.n50_read_length); + fprintf(fp, "Mean read length\t%.2f\n", output_data.long_read_info.mean_read_length); + fprintf(fp, "Median read length\t%d\n", output_data.long_read_info.median_read_length); + fprintf(fp, "GC%%\t%.2f\n", output_data.long_read_info.gc_cnt * 100); + fprintf(fp, "\n"); + + // Save the mapping statistics + fprintf(fp, "Total number of mapped reads\t%d\n", output_data.mapped_long_read_info.total_num_reads); + fprintf(fp, "Total number of mapped bases\t%ld\n", output_data.mapped_long_read_info.total_num_bases); + fprintf(fp, "Longest mapped read length\t%d\n", output_data.mapped_long_read_info.longest_read_length); + fprintf(fp, "N50 mapped read length\t%d\n", output_data.mapped_long_read_info.n50_read_length); + fprintf(fp, "Mean mapped read length\t%.2f\n", output_data.mapped_long_read_info.mean_read_length); + fprintf(fp, "Median mapped read length\t%d\n", output_data.mapped_long_read_info.median_read_length); + fprintf(fp, "GC%%\t%.2f\n", output_data.mapped_long_read_info.gc_cnt * 100); + fprintf(fp, "\n"); + + // Save the read alignment statistics + fprintf(fp, "Total number of primary alignments\t%ld\n", output_data.num_primary_alignment); + fprintf(fp, "Total number of secondary alignments\t%ld\n", output_data.num_secondary_alignment); + fprintf(fp, "Total number of supplementary alignments\t%ld\n", output_data.num_supplementary_alignment); + fprintf(fp, "Total number of reads with secondary alignments\t%ld\n", output_data.num_reads_with_secondary_alignment); + fprintf(fp, "Total number of reads with supplementary alignments\t%ld\n", output_data.num_reads_with_supplementary_alignment); + fprintf(fp, "Total number of reads with both secondary and supplementary alignments\t%ld\n", output_data.num_reads_with_both_secondary_supplementary_alignment); + fprintf(fp, "Total number of reads with forward alignments\t%ld\n", output_data.forward_alignment); + fprintf(fp, "Total number of reads with reverse alignments\t%ld\n", output_data.reverse_alignment); + fprintf(fp, "Total number of reverse alignment\t%ld\n", output_data.reverse_alignment); + fprintf(fp, "\n"); + + // Save the base alignment statistics + fprintf(fp, "Total number of matched bases\t%ld\n", output_data.num_matched_bases); + fprintf(fp, "Total number of mismatched bases\t%ld\n", output_data.num_mismatched_bases); + fprintf(fp, "Total number of insertions\t%ld\n", output_data.num_ins_bases); + fprintf(fp, "Total number of deletions\t%ld\n", output_data.num_del_bases); + fprintf(fp, "Total number of soft clipped bases\t%ld\n", output_data.num_clip_bases); + + // Close the file + fclose(fp); + } +} // sequencing_summary.txt output Basic_SeqTxt_Statistics::Basic_SeqTxt_Statistics(){ diff --git a/src/plot_for_BAM.py b/src/plot_for_BAM.py deleted file mode 100644 index 5bc84cf..0000000 --- a/src/plot_for_BAM.py +++ /dev/null @@ -1,109 +0,0 @@ -""" -plot_for_BAM.py: -Use the formatted statistics from our C++ module output text files to generate summary plots in image format. -""" - -if __package__ == 'src': - from src import lrst_global - from src.utils import * -else: - import lrst_global - from utils import * - -def plot_alignment_numbers(data, path): - fig, axes = plt.subplots(figsize =(8, 6)) - - numbers_list=[[data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment, - data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment, - data.num_reads_with_both_secondary_supplementary_alignment, data.forward_alignment, data.reverse_alignment]] - - category=['Primary Alignments', 'Supplementary Alignments', 'Secondary Alignments', 'Reads with Supplementary Alignments','Reads with Secondary Alignments','Reads with Secondary and Supplementary Alignments', 'Forward Alignments', 'Reverse Alignments'] - category=[wrap(x) for x in category] - - category_list=itertools.cycle([category]) - xlabel_list=itertools.cycle(['Counts']) - ylabel_list=itertools.cycle(['']) - subtitle_list=[None] - bar_plot(fig, numbers_list, category_list, xlabel_list, ylabel_list, subtitle_list, path, orientation='h') - -def plot_errors(bam_output, path): - fig, axes = plt.subplots(1,1, figsize =(8, 6)) - - numbers_list=[[bam_output.num_matched_bases, bam_output.num_mismatched_bases, bam_output.num_ins_bases, bam_output.num_del_bases, bam_output.num_clip_bases]] - - category=['Matched Bases', 'Mismatched Bases', 'Inserted Bases', 'Deleted Bases', 'Clipped Bases'] - category=[wrap(x) for x in category] - - category_list=itertools.cycle([category]) - xlabel_list=itertools.cycle(['Counts']) - ylabel_list=itertools.cycle([None]) - subtitle_list=[None] - bar_plot(fig, numbers_list, category_list, xlabel_list, ylabel_list, subtitle_list, path, orientation='h') - -def generate_bs( bam_output, para_dict ): - lrst_global.plot_filenames["basic_st"] = {}; - lrst_global.plot_filenames["basic_st"]['file'] = "" - lrst_global.plot_filenames["basic_st"]['title'] = "Basic statistics" - lrst_global.plot_filenames["basic_st"]['description'] = "BAM: Basic statistics" - - table_str = "\n\n\n" - table_str += "\n" - int_str_for_format = "" - double_str_for_format = "" - table_str += int_str_for_format.format("#Total Reads", \ - bam_output.mapped_long_read_info.total_num_reads, \ - bam_output.unmapped_long_read_info.total_num_reads, \ - bam_output.long_read_info.total_num_reads); - table_str += int_str_for_format.format("#Total Bases", \ - bam_output.mapped_long_read_info.total_num_bases, \ - bam_output.unmapped_long_read_info.total_num_bases, \ - bam_output.long_read_info.total_num_bases); - table_str += int_str_for_format.format("Longest Read Length", \ - bam_output.mapped_long_read_info.longest_read_length, \ - bam_output.unmapped_long_read_info.longest_read_length, \ - bam_output.long_read_info.longest_read_length); - table_str += int_str_for_format.format("N50", \ - bam_output.mapped_long_read_info.n50_read_length, \ - bam_output.unmapped_long_read_info.n50_read_length, \ - bam_output.long_read_info.n50_read_length); - table_str += double_str_for_format.format("GC Content(%)", \ - bam_output.mapped_long_read_info.gc_cnt*100, \ - bam_output.unmapped_long_read_info.gc_cnt*100, \ - bam_output.long_read_info.gc_cnt*100); - table_str += double_str_for_format.format("Mean Read Length", \ - bam_output.mapped_long_read_info.mean_read_length, \ - bam_output.unmapped_long_read_info.mean_read_length, \ - bam_output.long_read_info.mean_read_length); - table_str += int_str_for_format.format("Median Read Length", \ - bam_output.mapped_long_read_info.median_read_length, \ - bam_output.unmapped_long_read_info.median_read_length, \ - bam_output.long_read_info.median_read_length); - table_str += "\n\n
    MeasurementMappedUnmappedAll
    {}{:,d}{:,d}{:,d}
    {}{:.1f}{:.1f}{:.1f}
    " - - lrst_global.plot_filenames["basic_st"]['detail'] = table_str; - - -def bam_plot( bam_output, para_dict ): - - out_path=para_dict["output_folder"] - get_image_path=lambda x: os.path.join(out_path,lrst_global.plot_filenames[x]['file']) - - # Set the default matplotlib font size - setDefaultFontSize(12) - - # Get the font size for plotly plots - font_size = para_dict["fontsize"] - - # Create table - generate_bs(bam_output, para_dict) - - # Generate plots - plot_alignment_numbers(bam_output, get_image_path('map_st')) - plot_errors(bam_output, get_image_path('err_st')) - - plot_read_length_stats([bam_output.long_read_info, bam_output.mapped_long_read_info, bam_output.unmapped_long_read_info], get_image_path('read_length_st'), subtitles=['All Reads', 'Mapped Reads', 'Unmapped Reads']) - plot_base_counts([bam_output.long_read_info, bam_output.mapped_long_read_info, bam_output.unmapped_long_read_info], get_image_path('base_st'), subtitles=['All Reads', 'Mapped Reads', 'Unmapped Reads']) - plot_basic_info([bam_output.long_read_info, bam_output.mapped_long_read_info, bam_output.unmapped_long_read_info], get_image_path('basic_info'), categories=['All Reads', 'Mapped Reads', 'Unmapped Reads']) - - lrst_global.plot_filenames['read_length_hist']['dynamic'] = histogram(bam_output.long_read_info, get_image_path('read_length_hist'), font_size) - lrst_global.plot_filenames['base_quality']['dynamic'] = base_quality(bam_output.seq_quality_info, get_image_path('base_quality'), font_size) diff --git a/src/plot_for_FA.py b/src/plot_for_FA.py deleted file mode 100644 index ddc2de3..0000000 --- a/src/plot_for_FA.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -plot_for_FA.py: -Use the formatted statistics from our C++ module output text files to generate summary plots in image format. -""" - -if __package__ == 'src': - from src import lrst_global - from src.utils import * -else: - import lrst_global - from utils import * - - -def generate_bs( fa_output, para_dict ): - lrst_global.plot_filenames["basic_st"] = {}; - lrst_global.plot_filenames["basic_st"]['file'] = "" - lrst_global.plot_filenames["basic_st"]['title'] = "Basic statistics" - lrst_global.plot_filenames["basic_st"]['description'] = "FASTA: Basic statistics" - - table_str = "\n\n\n" - table_str += "\n" - int_str_for_format = "" - double_str_for_format = "" - table_str += int_str_for_format.format("#Total Reads", \ - fa_output.long_read_info.total_num_reads); - table_str += int_str_for_format.format("#Total Bases", \ - fa_output.long_read_info.total_num_bases); - table_str += int_str_for_format.format("Longest Read Length", \ - fa_output.long_read_info.longest_read_length); - table_str += int_str_for_format.format("N50", \ - fa_output.long_read_info.n50_read_length); - table_str += double_str_for_format.format("GC Content(%)", \ - fa_output.long_read_info.gc_cnt*100); - table_str += double_str_for_format.format("Mean Read Length", \ - fa_output.long_read_info.mean_read_length); - table_str += int_str_for_format.format("Median Read Length", \ - fa_output.long_read_info.median_read_length); - table_str += "\n\n
    MeasurementStatistics
    {}{:,d}
    {}{:.1f}
    " - - lrst_global.plot_filenames["basic_st"]['detail'] = table_str; - -def fa_plot( fa_output, para_dict ): - - out_path=para_dict["output_folder"] - get_image_path=lambda x: os.path.join(out_path,lrst_global.plot_filenames[x]['file']) - - # Set the default matplotlib font size - setDefaultFontSize(12) - - # Get the font size for plotly plots - font_size = para_dict["fontsize"] - - # Generate plots - generate_bs( fa_output, para_dict) - - # Save plot images using statistics generated from the C++ module - plot_read_length_stats([fa_output.long_read_info], get_image_path('read_length_st'), subtitles=['Long Reads']) - plot_base_counts([fa_output.long_read_info], get_image_path('base_st'), subtitles=['Long Reads']) - plot_basic_info([fa_output.long_read_info], get_image_path('basic_info'), categories=['Long Reads']) - histogram(fa_output.long_read_info, get_image_path('read_length_hist'), font_size) diff --git a/src/plot_for_SeqTxt.py b/src/plot_for_SeqTxt.py deleted file mode 100644 index 70cca1f..0000000 --- a/src/plot_for_SeqTxt.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -plot_for_SeqTxt.py: -Use the formatted statistics from our C++ module output text files to generate summary plots in image format. -""" - -if __package__ == 'src': - from src import lrst_global - from src.utils import * -else: - import lrst_global - from utils import * - -def generate_bs( output_statistics, para_dict ): - lrst_global.plot_filenames["basic_st"] = {}; - lrst_global.plot_filenames["basic_st"]['file'] = "" - lrst_global.plot_filenames["basic_st"]['title'] = "Basic statistics" - lrst_global.plot_filenames["basic_st"]['description'] = "Sequencing Summary: Basic statistics" - - table_str = "\n\n\n" - table_str += "\n" - int_str_for_format = "" - double_str_for_format = "" - table_str += int_str_for_format.format("#Total Reads", \ - output_statistics.passed_long_read_info.long_read_info.total_num_reads, \ - output_statistics.failed_long_read_info.long_read_info.total_num_reads, \ - output_statistics.all_long_read_info.long_read_info.total_num_reads); - table_str += int_str_for_format.format("#Total Bases", \ - output_statistics.passed_long_read_info.long_read_info.total_num_bases, \ - output_statistics.failed_long_read_info.long_read_info.total_num_bases, \ - output_statistics.all_long_read_info.long_read_info.total_num_bases); - table_str += int_str_for_format.format("Longest Read Length", \ - output_statistics.passed_long_read_info.long_read_info.longest_read_length, \ - output_statistics.failed_long_read_info.long_read_info.longest_read_length, \ - output_statistics.all_long_read_info.long_read_info.longest_read_length); - table_str += int_str_for_format.format("N50", \ - output_statistics.passed_long_read_info.long_read_info.n50_read_length, \ - output_statistics.failed_long_read_info.long_read_info.n50_read_length, \ - output_statistics.all_long_read_info.long_read_info.n50_read_length); - table_str += double_str_for_format.format("Mean Read Length", \ - output_statistics.passed_long_read_info.long_read_info.mean_read_length, \ - output_statistics.failed_long_read_info.long_read_info.mean_read_length, \ - output_statistics.all_long_read_info.long_read_info.mean_read_length); - table_str += int_str_for_format.format("Median Read Length", \ - output_statistics.passed_long_read_info.long_read_info.median_read_length, \ - output_statistics.failed_long_read_info.long_read_info.median_read_length, \ - output_statistics.all_long_read_info.long_read_info.median_read_length); - table_str += "\n\n
    MeasurementPassedFailedAll
    {}{:,d}{:,d}{:,d}
    {}{:.1f}{:.1f}{:.1f}
    " - - lrst_global.plot_filenames["basic_st"]['detail'] = table_str; - - -def plot( output_statistics, para_dict ): - out_path=para_dict["output_folder"] - get_image_path=lambda x: os.path.join(out_path,lrst_global.plot_filenames[x]['file']) - - # Set the default matplotlib font size - setDefaultFontSize(12) - - # Get the font size for plotly plots - font_size = para_dict["fontsize"] - - # Create table - generate_bs( output_statistics, para_dict ); - - # Generate plots - plot_read_length_stats([output_statistics.all_long_read_info.long_read_info, output_statistics.passed_long_read_info.long_read_info, output_statistics.failed_long_read_info.long_read_info], get_image_path('read_length_st'), subtitles=['All Reads', 'Passed Reads', 'Failed Reads']) - plot_base_counts([output_statistics.all_long_read_info.long_read_info, output_statistics.passed_long_read_info.long_read_info, output_statistics.failed_long_read_info.long_read_info], get_image_path('base_st'), subtitles=['All Reads', 'Passed Reads', 'Failed Reads']) - plot_basic_info([output_statistics.all_long_read_info.long_read_info, output_statistics.passed_long_read_info.long_read_info, output_statistics.failed_long_read_info.long_read_info], get_image_path('basic_info'), categories=['All Reads', 'Passed Reads', 'Failed Reads']) - histogram(output_statistics.all_long_read_info.long_read_info, get_image_path('read_length_hist'), font_size) diff --git a/src/plot_utils.py b/src/plot_utils.py index 96f5762..80400fe 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -7,14 +7,54 @@ import plotly.graph_objs as go from plotly.subplots import make_subplots -if __package__ == 'src': - from src import lrst_global -else: - import lrst_global - logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR) +# Return the default image path +def getDefaultImageFolder(): + return 'img/' + + +# Return the default image suffix +def getDefaultImageSuffix(): + return '.png' + + +# Return a dictionary of default plot filenames +def getDefaultPlotFilenames(): + default_image_path = getDefaultImageFolder() + default_image_suf = getDefaultImageSuffix() + + plot_filenames = { # for fq/fa + "read_length_distr": {'file': default_image_path + "read_length_distr" + default_image_suf, + 'title': "Read Length", 'description': "Read Length Distribution"}, # for bam + "map_st": {'file': default_image_path + "map_st" + default_image_suf, 'title': "Map Information", + 'description': "Read Mapping Statistics"}, + "err_st": {'file': default_image_path + "err_st" + default_image_suf, + 'title': "Base Alignment and Error Statistics", + 'description': "Base Alignment and Error Statistics"}, + "read_length_st": {'file': default_image_path + "read_length_st" + default_image_suf, + 'title': "Read Length Statistics", 'description': "Read Length Statistics"}, + "base_st": {'file': default_image_path + "base_st" + default_image_suf, 'title': "Base Count Statistics", + 'description': "Base Count Statistics", 'summary': ""}, + "basic_info": {'file': default_image_path + "basic_info" + default_image_suf, 'title': "Basic Statistics", + 'description': "Basic Statistics", 'summary': ""}, + "read_length_hist": {'file': default_image_path + "read_length_hist" + default_image_suf, + 'title': "Read Length Histogram", 'description': "Read Length Histogram", 'summary': ""}, + + "base_quality": {'file': default_image_path + "base_quality" + default_image_suf, + 'title': "Base Quality Histogram", 'description': "Base Quality Histogram"}, + + "read_avg_base_quality": {'file': default_image_path + "read_avg_base_quality" + default_image_suf, + 'title': "Read Base Quality Histogram", 'description': "Read Base Quality Histogram"}, + + "pos_quality": {'file': default_image_path + "pos_quality" + default_image_suf, + 'title': "Base Position Quality", 'description': "Base Position Quality"}, + } + + return plot_filenames + + def setDefaultFontSize(font_size): """Set the default font size for matplotlib plots.""" plt.rcParams.update({'font.size': font_size}) @@ -22,7 +62,8 @@ def setDefaultFontSize(font_size): def fmt(x): """Format numbers for plots.""" - format_x = np.format_float_scientific(x, exp_digits=4) + format_x = "{:,}".format(round(x)) + return format_x @@ -57,7 +98,7 @@ def plot_base_counts(data, path, subtitles=None, categories=None): ylabel_list = itertools.cycle(['Counts']) subtitle_list = subtitles bar_plot(fig, numbers_list, category_list, xlabel_list, ylabel_list, subtitle_list, path) - # lrst_global.plot_filenames['base_st']['summary']='GC Content: {:.2%}'.format(bam_output.mapped_long_read_info.gc_cnt) + # plot_filepaths['base_st']['summary']='GC Content: {:.2%}'.format(bam_output.mapped_long_read_info.gc_cnt) def plot_basic_info(data, path, subtitles=None, categories=None): @@ -174,6 +215,78 @@ def histogram(data, path, font_size): return html_obj +def read_lengths_histogram(data, path, font_size): + """Plot the read length histograms.""" + annotation_size = 10 # Annotation font size + mean, median, n50 = data.mean_read_length, data.median_read_length, data.n50_read_length + + # Read the read lengths array in float64 format + read_lengths = np.array(data.read_lengths, dtype=np.float64) + + # Calculate a histogram of read lengths + hist, edges = np.histogram(read_lengths, bins=10) + + # Create a figure with two subplots + fig = make_subplots( + rows=2, cols=1, + subplot_titles=("Read Length Histogram", "Log Read Length Histogram"), vertical_spacing=0.3) + fig.update_layout(showlegend=False, autosize=False) + + customdata = np.dstack((edges[:-1], edges[1:], hist))[0, :, :] + fig.add_trace(go.Bar(x=edges, y=hist, customdata=customdata, + hovertemplate='Length: %{customdata[0]:.0f}-%{customdata[1]:.0f}bp
    Counts:%{customdata[' + '2]:.0f}', + marker_color='#36a5c7'), row=1, col=1) + + fig.add_vline(mean, line_width=1, line_dash="dash", annotation_text='Mean', annotation_bgcolor="black", + annotation_textangle=90, row=1, col=1) + fig.add_vline(median, line_width=1, line_dash="dash", annotation_text='Median', annotation_bgcolor="blue", + annotation_textangle=90, row=1, col=1) + fig.add_vline(n50, line_width=1, line_dash="dash", annotation_text='N50', annotation_bgcolor="green", + annotation_textangle=90, row=1, col=1) + + # Log histogram + # Get the log10 histogram of read lengths + read_lengths_log = np.log10(read_lengths, out=np.zeros_like(read_lengths), where=(read_lengths != 0)) + log_hist, log_edges = np.histogram(read_lengths_log, bins=len(edges)) + + xd = log_edges + customdata = np.dstack((np.power(10, log_edges)[:-1], np.power(10, log_edges)[1:], log_hist))[0, :, :] + yd = log_hist + fig.add_trace(go.Bar(x=xd, y=yd, customdata=customdata, + hovertemplate='Length: %{customdata[0]:.0f}-%{customdata[1]:.0f}bp
    Counts:%{customdata[2]:.0f}', + marker_color='#36a5c7'), row=2, col=1) + + fig.add_vline(np.log10(mean), line_width=1, line_dash="dash", annotation_text='Mean', annotation_bgcolor="black", + annotation_textangle=90, row=2, col=1) + fig.add_vline(np.log10(median), line_width=1, line_dash="dash", annotation_text='Median', annotation_bgcolor="blue", + annotation_textangle=90, row=2, col=1) + fig.add_vline(np.log10(n50), line_width=1, line_dash="dash", annotation_text='N50', annotation_bgcolor="green", + annotation_textangle=90, row=2, col=1) + fig.update_annotations(font=dict(color="white")) + + # Set tick value range for the log scale + tick_vals = list(range(0, 5)) + fig.update_xaxes( + range=[0, 5], + tickmode='array', + tickvals=tick_vals, + ticktext=['{:,}'.format(10 ** x) for x in tick_vals], + ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=2, col=1) + + fig.update_xaxes(ticks="outside", title_text='Read Length', title_standoff=0, row=1, col=1) + fig.update_yaxes(ticks="outside", title_text='Counts', title_standoff=0) + + # Set font sizes + fig.update_layout(font=dict(size=font_size), autosize=True) + + fig.update_annotations(font_size=annotation_size) + html_obj = fig.to_html(full_html=False) + fig.write_image(path, engine="kaleido") + + return html_obj + + def base_quality(data, path, font_size): """ Save the 'Base quality' plot image. @@ -184,7 +297,8 @@ def base_quality(data, path, font_size): customdata = np.dstack((xd, yd))[0, :, :] fig.add_trace(go.Bar(x=xd, y=yd, customdata=customdata, - hovertemplate='Base Quality: %{customdata[0]:.0f}
    Base Counts:%{customdata[1]:.0f}', + hovertemplate='Base Quality: %{customdata[0]:.0f}
    Base Counts:%{customdata[' + '1]:.0f}', marker_color='#36a5c7')) fig.update_xaxes(ticks="outside", dtick=10, title_text='Base Quality', title_standoff=0) @@ -213,11 +327,11 @@ def read_avg_base_quality(data, path, font_size): return fig.to_html(full_html=False) -def create_statistics_table(module_output, table_title="Basic Statistics"): - lrst_global.plot_filenames["basic_st"] = {} - lrst_global.plot_filenames["basic_st"]['file'] = "" - lrst_global.plot_filenames["basic_st"]['title'] = "Summary Table" - lrst_global.plot_filenames["basic_st"]['description'] = table_title +def create_statistics_table(module_output, plot_filepaths, table_title="Basic Statistics"): + plot_filepaths["basic_st"] = {} + plot_filepaths["basic_st"]['file'] = "" + plot_filepaths["basic_st"]['title'] = "Summary Table" + plot_filepaths["basic_st"]['description'] = table_title table_str = "\n\n\n" table_str += "\n" @@ -239,7 +353,9 @@ def create_statistics_table(module_output, table_title="Basic Statistics"): module_output.long_read_info.median_read_length) table_str += "\n\n
    MeasurementStatistics
    " - lrst_global.plot_filenames["basic_st"]['detail'] = table_str + plot_filepaths["basic_st"]['detail'] = table_str + + return plot_filepaths def create_base_quality_plots(module_output, para_dict, table_title): @@ -247,7 +363,8 @@ def create_base_quality_plots(module_output, para_dict, table_title): Generate HTML plots for base and base quality (BAM, FASTQ, FAST5). """ out_path = para_dict["output_folder"] - get_image_path = lambda x: os.path.join(out_path, lrst_global.plot_filenames[x]['file']) + plot_filepaths = getDefaultPlotFilenames() + get_image_path = lambda x: os.path.join(out_path, plot_filepaths[x]['file']) # Set the default matplotlib font size setDefaultFontSize(12) @@ -256,7 +373,7 @@ def create_base_quality_plots(module_output, para_dict, table_title): font_size = para_dict["fontsize"] # Create the statistics table - create_statistics_table(module_output, table_title) + plot_filepaths = create_statistics_table(module_output, plot_filepaths, table_title) # Create basic plots basic_data = module_output.long_read_info @@ -266,13 +383,15 @@ def create_base_quality_plots(module_output, para_dict, table_title): # Read length histogram length_hist_path = get_image_path('read_length_hist') - lrst_global.plot_filenames['read_length_hist']['dynamic'] = histogram(basic_data, length_hist_path, font_size) + plot_filepaths['read_length_hist']['dynamic'] = histogram(basic_data, length_hist_path, font_size) # Base quality histogram quality_data = module_output.seq_quality_info quality_hist_path = get_image_path('base_quality') - lrst_global.plot_filenames['base_quality']['dynamic'] = base_quality(quality_data, quality_hist_path, font_size) + plot_filepaths['base_quality']['dynamic'] = base_quality(quality_data, quality_hist_path, font_size) # Read quality histogram read_quality_dynamic = read_avg_base_quality(quality_data, get_image_path('read_avg_base_quality'), font_size) - lrst_global.plot_filenames['read_avg_base_quality']['dynamic'] = read_quality_dynamic + plot_filepaths['read_avg_base_quality']['dynamic'] = read_quality_dynamic + + return plot_filepaths diff --git a/src/SeqTxt_module.cpp b/src/seqtxt_module.cpp similarity index 98% rename from src/SeqTxt_module.cpp rename to src/seqtxt_module.cpp index aaa0458..93b01b2 100644 --- a/src/SeqTxt_module.cpp +++ b/src/seqtxt_module.cpp @@ -9,7 +9,7 @@ Class for calling FAST5 statistics modules. #include #include -#include "SeqTxt_module.h" +#include "seqtxt_module.h" #include "ComFunction.h" @@ -274,8 +274,10 @@ void SeqTxt_Module::SeqTxt_do_thread(std::ifstream* file_stream, Input_Para& ref // Store the read length seqtxt_statistics->long_read_info.read_lengths.push_back(sequence_base_count); - if ( seqtxt_statistics->long_read_info.longest_read_length < ref_thread_data.stored_records[read_ss_i].sequence_length_template){ - seqtxt_statistics->long_read_info.longest_read_length = ref_thread_data.stored_records[read_ss_i].sequence_length_template; + // Update the longest read length + int64_t current_read_length = (int64_t) ref_thread_data.stored_records[read_ss_i].sequence_length_template; + if ( seqtxt_statistics->long_read_info.longest_read_length < current_read_length){ + seqtxt_statistics->long_read_info.longest_read_length = current_read_length; } seqtxt_statistics->long_read_info.read_length_count[ ref_thread_data.stored_records[read_ss_i].sequence_length_template{:,d}{:,d}{:,d}" + double_str_for_format = "{}{:.1f}{:.1f}{:.1f}" + table_str += int_str_for_format.format("#Total Reads", + seqtxt_output.passed_long_read_info.long_read_info.total_num_reads, + seqtxt_output.failed_long_read_info.long_read_info.total_num_reads, + seqtxt_output.all_long_read_info.long_read_info.total_num_reads) + table_str += int_str_for_format.format("#Total Bases", + seqtxt_output.passed_long_read_info.long_read_info.total_num_bases, + seqtxt_output.failed_long_read_info.long_read_info.total_num_bases, + seqtxt_output.all_long_read_info.long_read_info.total_num_bases) + table_str += int_str_for_format.format("Longest Read Length", + seqtxt_output.passed_long_read_info.long_read_info.longest_read_length, + seqtxt_output.failed_long_read_info.long_read_info.longest_read_length, + seqtxt_output.all_long_read_info.long_read_info.longest_read_length) + table_str += int_str_for_format.format("N50", + seqtxt_output.passed_long_read_info.long_read_info.n50_read_length, + seqtxt_output.failed_long_read_info.long_read_info.n50_read_length, + seqtxt_output.all_long_read_info.long_read_info.n50_read_length) + table_str += double_str_for_format.format("Mean Read Length", + seqtxt_output.passed_long_read_info.long_read_info.mean_read_length, + seqtxt_output.failed_long_read_info.long_read_info.mean_read_length, + seqtxt_output.all_long_read_info.long_read_info.mean_read_length) + table_str += int_str_for_format.format("Median Read Length", + seqtxt_output.passed_long_read_info.long_read_info.median_read_length, + seqtxt_output.failed_long_read_info.long_read_info.median_read_length, + seqtxt_output.all_long_read_info.long_read_info.median_read_length) + table_str += "\n\n" + + plot_filepaths["basic_st"]['detail'] = table_str + + +def plot(seqtxt_output, para_dict): + out_path = para_dict["output_folder"] + plot_filepaths = getDefaultPlotFilenames() + get_image_path = lambda x: os.path.join(out_path, plot_filepaths[x]['file']) + + # Set the default matplotlib font size + setDefaultFontSize(12) + + # Get the font size for plotly plots + font_size = para_dict["fontsize"] + + # Create table + create_summary_table(seqtxt_output, plot_filepaths) + + # Generate plots + plot_read_length_stats( + [seqtxt_output.all_long_read_info.long_read_info, seqtxt_output.passed_long_read_info.long_read_info, + seqtxt_output.failed_long_read_info.long_read_info], get_image_path('read_length_st'), + subtitles=['All Reads', 'Passed Reads', 'Failed Reads']) + plot_base_counts( + [seqtxt_output.all_long_read_info.long_read_info, seqtxt_output.passed_long_read_info.long_read_info, + seqtxt_output.failed_long_read_info.long_read_info], get_image_path('base_st'), + subtitles=['All Reads', 'Passed Reads', 'Failed Reads']) + plot_basic_info( + [seqtxt_output.all_long_read_info.long_read_info, seqtxt_output.passed_long_read_info.long_read_info, + seqtxt_output.failed_long_read_info.long_read_info], get_image_path('basic_info'), + categories=['All Reads', 'Passed Reads', 'Failed Reads']) + histogram(seqtxt_output.all_long_read_info.long_read_info, get_image_path('read_length_hist'), font_size) + + return plot_filepaths