Skip to content

Commit

Permalink
Add ref occurrence to MEM's output (#6)
Browse files Browse the repository at this point in the history
* Add ref,pos to MEM's output - untested

* Update klib commit

* Fix typos

* Add extended-output flag for MEMS

* Bump version
  • Loading branch information
maxrossi91 authored Aug 5, 2024
1 parent dee7c88 commit 925a19b
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 25 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ message(STATUS "Install directory: ${CMAKE_INSTALL_PREFIX}")
project(moni)
SET(VERSION_MAJOR "0")
SET(VERSION_MINOR "2")
SET(VERSION_PATCH "0")
SET(VERSION_PATCH "1")
SET(VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
message("version: ${VERSION}")

Expand Down
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[![Release](https://img.shields.io/github/release/maxrossi91/moni.svg)](https://github.com/maxrossi91/moni/releases)
[![Downloads](https://img.shields.io/github/downloads/maxrossi91/moni/total?logo=github)](https://github.com/maxrossi91/moni/releases/download/v0.2.0/moni_0.2.0_amd64.deb)
[![Downloads](https://img.shields.io/github/downloads/maxrossi91/moni/total?logo=github)](https://github.com/maxrossi91/moni/releases/download/v0.2.1/moni_0.2.1_amd64.deb)

# MONI
```console
Expand All @@ -9,7 +9,7 @@
| |\/| | | | | . ` | | |
| | | | |__| | |\ |_| |_
|_| |_|\____/|_| \_|_____|
ver 0.2.0
ver 0.2.1
```
A Pangenomics Index for Finding MEMs.

Expand Down Expand Up @@ -57,14 +57,16 @@ usage: moni ms [-h] -i INDEX -p PATTERN [-o OUTPUT] [-t THREADS]

### Computing the matching statistics with MONI:
```
usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-t THREADS]
usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-e] [-t THREADS]
-h, --help show this help message and exit
-i INDEX, --index INDEX
reference index base name (default: None)
-p PATTERN, --pattern PATTERN
the input query (default: None)
-o OUTPUT, --output OUTPUT
output directory path (default: .)
-e, --extended-output
output MEM occurrence in the reference (default: False)
-t THREADS, --threads THREADS
number of helper threads (default: 1)
-g GRAMMAR, --grammar GRAMMAR
Expand Down
5 changes: 4 additions & 1 deletion pipeline/moni.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Description = """
| |\/| | | | | . ` | | |
| | | | |__| | |\ |_| |_
|_| |_|\____/|_| \_|_____|
ver 0.2.0
ver 0.2.1
A Pangenomics Index for Finding MEMs.
Build the index for highly repetive files using the approach
Expand Down Expand Up @@ -554,6 +554,8 @@ class run_helper(threading.Thread):
command += " -b {} -A {} -B {} -O {} -E {} -L {} ".format(args.batch,args.smatch, args.smismatch, args.gapo, args.gape, args.extl)
if args.grammar == "shaped":
command += " -q"
if exe_name == "MONI-MEMS" and args.extended_output:
command += " -e"
if args.output != ".":
command += " -o {}".format(args.output)

Expand Down Expand Up @@ -650,6 +652,7 @@ def main():
mems_parser.add_argument('-i', '--index', help='reference index folder', type=str, required=True)
mems_parser.add_argument('-p', '--pattern', help='the input query', type=str, required=True)
mems_parser.add_argument('-o', '--output', help='output file prefix', type=str, default='.')
mems_parser.add_argument('-e', '--extended-output', help='output MEM occurrence in the reference', action='store_true')
mems_parser.add_argument('-t', '--threads', help='number of helper threads', default=1, type=int)
mems_parser.add_argument('-g', '--grammar', help='select the grammar [plain, shaped]', type=str, default='plain')
mems_parser.set_defaults(which='mems')
Expand Down
72 changes: 53 additions & 19 deletions src/mems.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ extern "C" {
#include <PlainSlp.hpp>
#include <FixedBitLenCode.hpp>

#include <seqidx.hpp>

////////////////////////////////////////////////////////////////////////////////
/// kseq extra
////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -242,7 +244,7 @@ class mems_c
{
auto pointers = ms.query(read->seq.s, read->seq.l);
std::vector<size_t> lengths(pointers.size());
std::vector<std::pair<size_t,size_t>> mems;
std::vector<std::tuple<size_t,size_t,size_t>> mems;

size_t l = 0;
for (size_t i = 0; i < pointers.size(); ++i)
Expand All @@ -255,7 +257,7 @@ class mems_c
l = (l == 0 ? 0 : (l - 1));

if((i == 0) or (lengths[i] >= lengths[i-1]))
mems.push_back(make_pair(i,lengths[i]));
mems.push_back(make_tuple(i,lengths[i],pos));
}

// Original MS computation
Expand All @@ -276,7 +278,7 @@ class mems_c
fwrite(read->name.s, sizeof(char),h_length,out);
size_t q_length = mems.size();
fwrite(&q_length, sizeof(size_t), 1,out);
fwrite(mems.data(), sizeof(std::pair<size_t,size_t>),q_length,out);
fwrite(mems.data(), sizeof(std::tuple<size_t,size_t,size_t>),q_length,out);
}

protected:
Expand Down Expand Up @@ -424,11 +426,12 @@ typedef std::pair<std::string, std::vector<uint8_t>> pattern_t;
struct Args
{
std::string filename = "";
std::string patterns = ""; // path to patterns file
std::string output = ""; // output file prefix
size_t l = 25; // minumum MEM length
size_t th = 1; // number of threads
bool shaped_slp = false; // use shaped slp
std::string patterns = ""; // path to patterns file
std::string output = ""; // output file prefix
size_t l = 25; // minumum MEM length
size_t th = 1; // number of threads
bool shaped_slp = false; // use shaped slp
bool extended_output = false; // use shaped slp
};

void parseArgs(int argc, char *const argv[], Args &arg)
Expand All @@ -437,16 +440,17 @@ void parseArgs(int argc, char *const argv[], Args &arg)
extern char *optarg;
extern int optind;

std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-b batch]\n\n" +
std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-e extended_output] [-b batch]\n\n" +
"Copmputes the matching statistics of the reads in the pattern against the reference index in infile.\n" +
"shaped_slp: [boolean] - use shaped slp. (def. false)\n" +
" pattens: [string] - path to patterns file.\n" +
" output: [string] - output file prefix.\n" +
" len: [integer] - minimum MEM lengt (def. 25)\n" +
" thread: [integer] - number of threads (def. 1)\n");
" shaped_slp: [boolean] - use shaped slp. (def. false)\n" +
"extended_output: [boolean] - print one MEM occurrence in ref. (def. false)\n" +
" pattens: [string] - path to patterns file.\n" +
" output: [string] - output file prefix.\n" +
" len: [integer] - minimum MEM lengt (def. 25)\n" +
" thread: [integer] - number of threads (def. 1)\n");

std::string sarg;
while ((c = getopt(argc, argv, "l:hp:o:t:")) != -1)
while ((c = getopt(argc, argv, "l:hp:o:t:qe")) != -1)
{
switch (c)
{
Expand All @@ -467,6 +471,9 @@ void parseArgs(int argc, char *const argv[], Args &arg)
case 'q':
arg.shaped_slp = true;
break;
case 'e':
arg.extended_output = true;
break;
case 'h':
error(usage);
case '?':
Expand Down Expand Up @@ -528,6 +535,22 @@ void dispatcher(Args &args)
auto mem_peak = malloc_count_peak();
verbose("Memory peak: ", malloc_count_peak());

seqidx idx;

std::string filename_idx = args.filename + idx.get_file_extension();
verbose("Loading fasta index file: " + filename_idx);
t_insert_start = std::chrono::high_resolution_clock::now();

ifstream fs_idx(filename_idx);
idx.load(fs_idx);
fs_idx.close();

t_insert_end = std::chrono::high_resolution_clock::now();

verbose("Fasta index loading complete");
verbose("Memory peak: ", malloc_count_peak());
verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());

verbose("Printing plain output");
t_insert_start = std::chrono::high_resolution_clock::now();

Expand All @@ -547,7 +570,7 @@ void dispatcher(Args &args)

size_t length = 0;
size_t m = 100; // Reserved size for pointers and lengths
std::vector<std::pair<size_t,size_t>> mem(m);
std::vector<std::tuple<size_t,size_t,size_t>> mem(m);
size_t s = 100; // Reserved size for read name
char* rname = (char *)malloc(s * sizeof(char));
while (!feof(in_fd) and fread(&length, sizeof(size_t), 1, in_fd) > 0)
Expand Down Expand Up @@ -577,13 +600,24 @@ void dispatcher(Args &args)
mem.resize(m);
}

if ((fread(mem.data(), sizeof(std::pair<size_t,size_t>), length, in_fd)) != length)
if ((fread(mem.data(), sizeof(std::tuple<size_t,size_t,size_t>), length, in_fd)) != length)
error("fread() file " + std::string(tmp_filename) + " failed");

// TODO: Store the fasta headers somewhere
// f_mems << ">" + std::to_string(n_seq) << endl;
for (size_t i = 0; i < length; ++i)
f_mems << "(" << mem[i].first << "," << mem[i].second << ") ";
if (args.extended_output){
for (size_t i = 0; i < length; ++i)
{
std::pair<std::string, size_t> pos = idx.index(std::get<2>(mem[i]));
f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << "," << pos.first << "," << pos.second << ") ";
}
} else {
for (size_t i = 0; i < length; ++i)
{
f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << ") ";
}

}
f_mems << endl;

n_seq++;
Expand Down
5 changes: 4 additions & 1 deletion thirdparty/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,13 @@ if(NOT malloc_count_POPULATED)
target_include_directories(memprofile PUBLIC "${malloc_count_SOURCE_DIR}")
endif()

## Add klib
# # Add klib
set(KLIB_COMMIT "9a063b33efd841fcc42d4b9f68cb78bb528bf75b")

FetchContent_Declare(
klib
GIT_REPOSITORY https://github.com/attractivechaos/klib
GIT_TAG ${KLIB_COMMIT}
)

FetchContent_GetProperties(klib)
Expand Down

0 comments on commit 925a19b

Please sign in to comment.