From 925a19bda8290b3d5838aa7c8a0d250c8722286e Mon Sep 17 00:00:00 2001 From: Massimiliano Rossi Date: Mon, 5 Aug 2024 09:13:51 -0400 Subject: [PATCH] Add ref occurrence to MEM's output (#6) * Add ref,pos to MEM's output - untested * Update klib commit * Fix typos * Add extended-output flag for MEMS * Bump version --- CMakeLists.txt | 2 +- README.md | 8 +++-- pipeline/moni.in | 5 ++- src/mems.cpp | 72 ++++++++++++++++++++++++++++----------- thirdparty/CMakeLists.txt | 5 ++- 5 files changed, 67 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b46b1aa..1b4b9bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ message(STATUS "Install directory: ${CMAKE_INSTALL_PREFIX}") project(moni) SET(VERSION_MAJOR "0") SET(VERSION_MINOR "2") -SET(VERSION_PATCH "0") +SET(VERSION_PATCH "1") SET(VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") message("version: ${VERSION}") diff --git a/README.md b/README.md index 64edfba..2ecee7b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [![Release](https://img.shields.io/github/release/maxrossi91/moni.svg)](https://github.com/maxrossi91/moni/releases) -[![Downloads](https://img.shields.io/github/downloads/maxrossi91/moni/total?logo=github)](https://github.com/maxrossi91/moni/releases/download/v0.2.0/moni_0.2.0_amd64.deb) +[![Downloads](https://img.shields.io/github/downloads/maxrossi91/moni/total?logo=github)](https://github.com/maxrossi91/moni/releases/download/v0.2.1/moni_0.2.1_amd64.deb) # MONI ```console @@ -9,7 +9,7 @@ | |\/| | | | | . ` | | | | | | | |__| | |\ |_| |_ |_| |_|\____/|_| \_|_____| - ver 0.2.0 + ver 0.2.1 ``` A Pangenomics Index for Finding MEMs. @@ -57,7 +57,7 @@ usage: moni ms [-h] -i INDEX -p PATTERN [-o OUTPUT] [-t THREADS] ### Computing the matching statistics with MONI: ``` -usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-t THREADS] +usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-e] [-t THREADS] -h, --help show this help message and exit -i INDEX, --index INDEX reference index base name (default: None) @@ -65,6 +65,8 @@ usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-t THREADS] the input query (default: None) -o OUTPUT, --output OUTPUT output directory path (default: .) + -e, --extended-output + output MEM occurrence in the reference (default: False) -t THREADS, --threads THREADS number of helper threads (default: 1) -g GRAMMAR, --grammar GRAMMAR diff --git a/pipeline/moni.in b/pipeline/moni.in index 1ccb13b..0713e34 100755 --- a/pipeline/moni.in +++ b/pipeline/moni.in @@ -11,7 +11,7 @@ Description = """ | |\/| | | | | . ` | | | | | | | |__| | |\ |_| |_ |_| |_|\____/|_| \_|_____| - ver 0.2.0 + ver 0.2.1 A Pangenomics Index for Finding MEMs. Build the index for highly repetive files using the approach @@ -554,6 +554,8 @@ class run_helper(threading.Thread): command += " -b {} -A {} -B {} -O {} -E {} -L {} ".format(args.batch,args.smatch, args.smismatch, args.gapo, args.gape, args.extl) if args.grammar == "shaped": command += " -q" + if exe_name == "MONI-MEMS" and args.extended_output: + command += " -e" if args.output != ".": command += " -o {}".format(args.output) @@ -650,6 +652,7 @@ def main(): mems_parser.add_argument('-i', '--index', help='reference index folder', type=str, required=True) mems_parser.add_argument('-p', '--pattern', help='the input query', type=str, required=True) mems_parser.add_argument('-o', '--output', help='output file prefix', type=str, default='.') + mems_parser.add_argument('-e', '--extended-output', help='output MEM occurrence in the reference', action='store_true') mems_parser.add_argument('-t', '--threads', help='number of helper threads', default=1, type=int) mems_parser.add_argument('-g', '--grammar', help='select the grammar [plain, shaped]', type=str, default='plain') mems_parser.set_defaults(which='mems') diff --git a/src/mems.cpp b/src/mems.cpp index 825c0cf..9bf9e07 100644 --- a/src/mems.cpp +++ b/src/mems.cpp @@ -40,6 +40,8 @@ extern "C" { #include #include +#include + //////////////////////////////////////////////////////////////////////////////// /// kseq extra //////////////////////////////////////////////////////////////////////////////// @@ -242,7 +244,7 @@ class mems_c { auto pointers = ms.query(read->seq.s, read->seq.l); std::vector lengths(pointers.size()); - std::vector> mems; + std::vector> mems; size_t l = 0; for (size_t i = 0; i < pointers.size(); ++i) @@ -255,7 +257,7 @@ class mems_c l = (l == 0 ? 0 : (l - 1)); if((i == 0) or (lengths[i] >= lengths[i-1])) - mems.push_back(make_pair(i,lengths[i])); + mems.push_back(make_tuple(i,lengths[i],pos)); } // Original MS computation @@ -276,7 +278,7 @@ class mems_c fwrite(read->name.s, sizeof(char),h_length,out); size_t q_length = mems.size(); fwrite(&q_length, sizeof(size_t), 1,out); - fwrite(mems.data(), sizeof(std::pair),q_length,out); + fwrite(mems.data(), sizeof(std::tuple),q_length,out); } protected: @@ -424,11 +426,12 @@ typedef std::pair> pattern_t; struct Args { std::string filename = ""; - std::string patterns = ""; // path to patterns file - std::string output = ""; // output file prefix - size_t l = 25; // minumum MEM length - size_t th = 1; // number of threads - bool shaped_slp = false; // use shaped slp + std::string patterns = ""; // path to patterns file + std::string output = ""; // output file prefix + size_t l = 25; // minumum MEM length + size_t th = 1; // number of threads + bool shaped_slp = false; // use shaped slp + bool extended_output = false; // use shaped slp }; void parseArgs(int argc, char *const argv[], Args &arg) @@ -437,16 +440,17 @@ void parseArgs(int argc, char *const argv[], Args &arg) extern char *optarg; extern int optind; - std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-b batch]\n\n" + + std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-e extended_output] [-b batch]\n\n" + "Copmputes the matching statistics of the reads in the pattern against the reference index in infile.\n" + - "shaped_slp: [boolean] - use shaped slp. (def. false)\n" + - " pattens: [string] - path to patterns file.\n" + - " output: [string] - output file prefix.\n" + - " len: [integer] - minimum MEM lengt (def. 25)\n" + - " thread: [integer] - number of threads (def. 1)\n"); + " shaped_slp: [boolean] - use shaped slp. (def. false)\n" + + "extended_output: [boolean] - print one MEM occurrence in ref. (def. false)\n" + + " pattens: [string] - path to patterns file.\n" + + " output: [string] - output file prefix.\n" + + " len: [integer] - minimum MEM lengt (def. 25)\n" + + " thread: [integer] - number of threads (def. 1)\n"); std::string sarg; - while ((c = getopt(argc, argv, "l:hp:o:t:")) != -1) + while ((c = getopt(argc, argv, "l:hp:o:t:qe")) != -1) { switch (c) { @@ -467,6 +471,9 @@ void parseArgs(int argc, char *const argv[], Args &arg) case 'q': arg.shaped_slp = true; break; + case 'e': + arg.extended_output = true; + break; case 'h': error(usage); case '?': @@ -528,6 +535,22 @@ void dispatcher(Args &args) auto mem_peak = malloc_count_peak(); verbose("Memory peak: ", malloc_count_peak()); + seqidx idx; + + std::string filename_idx = args.filename + idx.get_file_extension(); + verbose("Loading fasta index file: " + filename_idx); + t_insert_start = std::chrono::high_resolution_clock::now(); + + ifstream fs_idx(filename_idx); + idx.load(fs_idx); + fs_idx.close(); + + t_insert_end = std::chrono::high_resolution_clock::now(); + + verbose("Fasta index loading complete"); + verbose("Memory peak: ", malloc_count_peak()); + verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); + verbose("Printing plain output"); t_insert_start = std::chrono::high_resolution_clock::now(); @@ -547,7 +570,7 @@ void dispatcher(Args &args) size_t length = 0; size_t m = 100; // Reserved size for pointers and lengths - std::vector> mem(m); + std::vector> mem(m); size_t s = 100; // Reserved size for read name char* rname = (char *)malloc(s * sizeof(char)); while (!feof(in_fd) and fread(&length, sizeof(size_t), 1, in_fd) > 0) @@ -577,13 +600,24 @@ void dispatcher(Args &args) mem.resize(m); } - if ((fread(mem.data(), sizeof(std::pair), length, in_fd)) != length) + if ((fread(mem.data(), sizeof(std::tuple), length, in_fd)) != length) error("fread() file " + std::string(tmp_filename) + " failed"); // TODO: Store the fasta headers somewhere // f_mems << ">" + std::to_string(n_seq) << endl; - for (size_t i = 0; i < length; ++i) - f_mems << "(" << mem[i].first << "," << mem[i].second << ") "; + if (args.extended_output){ + for (size_t i = 0; i < length; ++i) + { + std::pair pos = idx.index(std::get<2>(mem[i])); + f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << "," << pos.first << "," << pos.second << ") "; + } + } else { + for (size_t i = 0; i < length; ++i) + { + f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << ") "; + } + + } f_mems << endl; n_seq++; diff --git a/thirdparty/CMakeLists.txt b/thirdparty/CMakeLists.txt index 87593e7..40aa511 100644 --- a/thirdparty/CMakeLists.txt +++ b/thirdparty/CMakeLists.txt @@ -18,10 +18,13 @@ if(NOT malloc_count_POPULATED) target_include_directories(memprofile PUBLIC "${malloc_count_SOURCE_DIR}") endif() - ## Add klib +# # Add klib +set(KLIB_COMMIT "9a063b33efd841fcc42d4b9f68cb78bb528bf75b") + FetchContent_Declare( klib GIT_REPOSITORY https://github.com/attractivechaos/klib + GIT_TAG ${KLIB_COMMIT} ) FetchContent_GetProperties(klib)