Skip to content

Commit

Permalink
Bugfix related to newscan.x and creation of SLP (#5)
Browse files Browse the repository at this point in the history
* avoid using newscan.x until it can be updated

* add code to handle case where first w-mer is a trigger string, in order to generate correct SLP downstream.

* fix bug in ms_rle_string constructor that erroneously compares signed to unsigned chars

* fixed issue with previous commit, make sure array indexes are all positive
  • Loading branch information
oma219 authored Feb 25, 2024
1 parent ffdb1af commit dee7c88
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 8 deletions.
10 changes: 7 additions & 3 deletions include/ms/ms_rle_string.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,15 +274,19 @@ ms_rle_string<ri::sparse_sd_vector, ri::huff_string>::ms_rle_string(std::ifstrea
{
size_t length = 0;
lengths.read((char *)&length, 5);
if (run_heads_s[i] <= TERMINATOR) // change 0 to 1

uint8_t curr_ch = unsigned(run_heads_s[i]);
if (curr_ch <= TERMINATOR) { // change 0 to 1
run_heads_s[i] = TERMINATOR;
curr_ch = TERMINATOR;
}

if(i % B == B - 1)
runs_bv_onset.push_back(this->n + length - 1);

assert(length > 0);
runs_per_letter_bv_i[run_heads_s[i]] += length;
runs_per_letter_bv[run_heads_s[i]].push_back(runs_per_letter_bv_i[run_heads_s[i]] - 1);
runs_per_letter_bv_i[curr_ch] += length;
runs_per_letter_bv[curr_ch].push_back(runs_per_letter_bv_i[curr_ch] - 1);

this->n += length;
}
Expand Down
5 changes: 5 additions & 0 deletions pipeline/moni.in
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,11 @@ def build(args):
if args.f and args.threads > 0 and (".fq" in args.reference or ".fastq" in args.reference or ".fnq" in args.reference):
print("moni does not current support FASTQ format! Exiting...", flush=True)
return

# added to avoid using newscan.x until issue is further investigated
if args.f and args.threads > 0:
print("\nerror: moni's multithreading is not currently available.\n")
exit(0)

filename = os.path.basename(args.reference)
if args.output != ".":
Expand Down
87 changes: 82 additions & 5 deletions src/compress_dictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,27 @@ void parseArgs(int argc, char *const argv[], Args &arg)

//********** end argument options ********************

std::string execute_cmd(const char* cmd) {
std::array<char, 256> buffer{};
std::string output = "";

std::string cmd_plus_stderr = std::string(cmd) + " 2>&1";
FILE* pipe = popen(cmd_plus_stderr.data(), "r"); // Extract stderr as well
if (!pipe) {throw std::runtime_error("popen() failed!");}

try {
std::size_t bytes;
while ((bytes = fread(buffer.data(), sizeof(char), sizeof(buffer), pipe))) {
output += std::string(buffer.data(), bytes);
}
} catch (...) {
pclose(pipe);
throw std::runtime_error("Error occurred while reading popen() stream.");
}
pclose(pipe);
return output;
}

int main(int argc, char *const argv[])
{

Expand Down Expand Up @@ -135,23 +156,79 @@ int main(int argc, char *const argv[])

verbose("Generating phrases");
uint8_t* ptr = dict.data(); // Beginning of the current phrase
for(auto length: lengths)

bool empty_first_phrase = false;
for(size_t i = 0; i < lengths.size(); i++)
{
size_t compressed_length = length - args.w;
size_t compressed_length = lengths[i] - args.w;

// special case: starts with a trigger string
if (i==0 && compressed_length == 0) {
ptr += lengths[i] + 1;
empty_first_phrase = true;
continue;
} else if (i > 0 && compressed_length == 0) {
error("encountered a length=0 phrase after removing trigger string, which should not occur.");
}

if ((fwrite(&compressed_length, 4, 1, dicz_len)) != 1)
error("fwrite() file " + std::string(dicz_len_filename) + " failed");

if ((fwrite(ptr, sizeof(uint8_t), compressed_length, dicz)) != compressed_length)
error("fwrite() file " + std::string(dicz_filename) + " failed");

ptr += length + 1;
ptr += lengths[i] + 1;
}


fclose(dicz);
fclose(dicz_len);

// re-writes parse file to shift down all the phrase ids by 1
// since we removed the empty beginning phrase
if (empty_first_phrase) {
verbose("alert: found that the first phrase length is 0"
" so we will rewrite *.parse file to generated correct SLP.");

// read in all the phrase ids in parse
std::string parse_filename = args.filename + ".parse";
std::vector<uint32_t> parse_arr;
read_file(parse_filename.c_str(), parse_arr);

// make sure first phrase is lowest lexicographically and then remove it
if (parse_arr[0] != 1)
error("parse should being with lowest lexicographic phrase.");
parse_arr.erase(parse_arr.begin());

// rename the old parse file as *.parse_with_empty_phrase
std::ostringstream command_stream;
command_stream << "mv " << parse_filename << " " << (args.filename + ".parse_with_empty_phrase");
auto mv_log = execute_cmd(command_stream.str().c_str());

verbose("executed this command: " + command_stream.str());

// open new parse file for writing
FILE* new_parse_file;
if ((new_parse_file = fopen((args.filename + ".parse").c_str(), "w")) == nullptr)
verbose("open() file " + std::string(args.filename + ".parse" + " failed"));

// iterate through each element of parse and decrement by 1
for (size_t i = 0; i < parse_arr.size(); i++) {
if (parse_arr[i] == 1)
error("issue occurred when creating new parse file.");
parse_arr[i]--;

// write it out
if ((fwrite(&parse_arr[i], 4, 1, new_parse_file)) != 1)
verbose("fwrite() file " + std::string(args.filename + ".parse") + " failed");
}
fclose(new_parse_file);
}







std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();

verbose("Memory peak: ", malloc_count_peak());
Expand Down

0 comments on commit dee7c88

Please sign in to comment.