Skip to content

Commit

Permalink
Feature/better compression (#5)
Browse files Browse the repository at this point in the history
Better internal compression of examined sequences for lower RAM usage
  • Loading branch information
sebastiandeorowicz authored and agudys committed Oct 9, 2024
1 parent d1c44e9 commit 45bab7f
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 26 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# LZ-ANI

![version](https://img.shields.io/badge/version-1.1.0-blue.svg)
[![GitHub Actions CI](../../workflows/GitHub%20Actions%20CI/badge.svg)](../../actions/workflows/main.yml)
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)

Expand Down
4 changes: 2 additions & 2 deletions src/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
#include <string>
#include "params.h"

const std::string LZ_ANI_VER = "lz-ani 1.1.1";
const std::string LZ_ANI_DATE = "2024-09-12";
const std::string LZ_ANI_VER = "lz-ani 1.2.0";
const std::string LZ_ANI_DATE = "2024-10-09";
const std::string LZ_ANI_AUTHORS = "Sebastian Deorowicz, Adam Gudys";
const std::string LZ_ANI_INFO = LZ_ANI_VER + " (" + LZ_ANI_DATE + ") by " + LZ_ANI_AUTHORS;

Expand Down
6 changes: 3 additions & 3 deletions src/lz_matcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ void CLZMatcher::do_matching()

// Prepare reference
auto sr_iter = seq_reservoir.get_sequence(local_task_no);
parser.prepare_reference(seq_view(sr_iter->data, sr_iter->len), sr_iter->no_parts);
parser.prepare_reference(seq_view(sr_iter->data, sr_iter->len, params.internal_packing), sr_iter->no_parts);

uint64_t to_add = filter.is_empty() ? local_task_no : (uint64_t)filter.get_row(local_task_no).size();
auto to_print = global_no_pairs.fetch_add(to_add);
Expand All @@ -204,7 +204,7 @@ void CLZMatcher::do_matching()
continue;

auto sr_iter = seq_reservoir.get_sequence(id);
parser.prepare_data(seq_view(sr_iter->data, sr_iter->len), sr_iter->no_parts);
parser.prepare_data(seq_view(sr_iter->data, sr_iter->len, params.internal_packing), sr_iter->no_parts);

parser.parse();

Expand All @@ -221,7 +221,7 @@ void CLZMatcher::do_matching()
for (auto& id : filter.get_row(local_task_no))
{
auto sr_iter = seq_reservoir.get_sequence(id);
parser.prepare_data(seq_view(sr_iter->data, sr_iter->len), sr_iter->no_parts);
parser.prepare_data(seq_view(sr_iter->data, sr_iter->len, params.internal_packing), sr_iter->no_parts);

parser.parse();

Expand Down
3 changes: 2 additions & 1 deletion src/lz_matcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ class CLZMatcher

public:
CLZMatcher(CParams& params) :
params(params)
params(params),
seq_reservoir(params.internal_packing)
{
if (!params.output_alignment_file_name.empty())
{
Expand Down
7 changes: 7 additions & 0 deletions src/params.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@

using namespace std;

enum class internal_packing_t { none, two_in_byte, three_in_byte };
enum class alphabet_t { dna, aminoacid };

enum class output_type_t {single_txt, two_tsv};
//enum class output_component_t {seq_id1,seq_id2,seq_idx1,seq_idx2,len1,len2,total_ani,global_ani,local_ani,cov,len_ratio,nt_match,nt_mismatch,no_reg};
enum class output_component_t {query,reference,qidx,ridx,qlen,rlen,tani,gani,ani,qcov,rcov,len_ratio,nt_match,nt_mismatch,num_alns};
Expand All @@ -44,6 +47,10 @@ struct CParams
double filter_thr = 0.0;
bool output_in_percent = false;

alphabet_t alphabet = alphabet_t::dna;
internal_packing_t internal_packing = internal_packing_t::three_in_byte;
// internal_packing_t internal_packing = internal_packing_t::two_in_byte;

output_type_t output_type = output_type_t::two_tsv;

vector<string> input_file_names;
Expand Down
42 changes: 42 additions & 0 deletions src/seq_reservoir.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,47 @@
// ****************************************************************************
void CSeqReservoir::append(const string& name, const string& seq)
{
uint8_t* ptr_seq = nullptr;

switch (internal_packing)
{
case internal_packing_t::none:
ptr_seq = (uint8_t*)mma_seq.allocate(seq.length());

for (size_t i = 0; i < seq.length(); ++i)
ptr_seq[i] = dna_code[seq[i]];
break;
case internal_packing_t::two_in_byte:
ptr_seq = (uint8_t*)mma_seq.allocate((seq.length() + 1) / 2);

for (size_t i = 0; i < seq.length() / 2; ++i)
ptr_seq[i] = (uint8_t)((dna_code[seq[2 * i]] << 4) + dna_code[seq[2 * i + 1]]);

if (seq.length() & 1)
ptr_seq[seq.length() / 2] = (uint8_t)(dna_code[seq[seq.length() - 1]] << 4);
break;
case internal_packing_t::three_in_byte:
ptr_seq = (uint8_t*)mma_seq.allocate((seq.length() + 2) / 3);

for (size_t i = 0; i < seq.length() / 3; ++i)
ptr_seq[i] = (uint8_t)(36 * dna_code[seq[3 * i]] + 6 * dna_code[seq[3 * i + 1]] + dna_code[seq[3 * i + 2]]);

switch (seq.length() % 3)
{
case 2:
ptr_seq[seq.length() / 3] = (uint8_t)(36 * dna_code[seq[seq.length() - 2]] + 6 * dna_code[seq.back()]);
break;
case 1:
ptr_seq[seq.length() / 3] = (uint8_t)(36 * dna_code[seq.back()]);
break;
case 0:
// Nothing
break;
}
break;
}

/*
#ifdef USE_PACKED_SEQS
uint8_t* ptr_seq = (uint8_t*) mma_seq.allocate((seq.length() + 1) / 2);
Expand All @@ -30,6 +71,7 @@ void CSeqReservoir::append(const string& name, const string& seq)
for (size_t i = 0; i < seq.length(); ++i)
ptr_seq[i] = dna_code[seq[i]];
#endif
*/

auto p = find(name.begin(), name.end(), ' ');

Expand Down
121 changes: 102 additions & 19 deletions src/seq_reservoir.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,38 +25,65 @@

using namespace std;

#define USE_PACKED_SEQS
// #define USE_PACKED_SEQS

class seq_view
{
const uint8_t* data;
uint32_t len;
const uint32_t len;
const internal_packing_t internal_packing;

public:
seq_view(const uint8_t *data = 0, const uint32_t len = 0) :
data(data), len(len)
seq_view(const uint8_t *data = 0, const uint32_t len = 0, internal_packing_t internal_packing = internal_packing_t::none) :
data(data),
len(len),
internal_packing(internal_packing)
{}

void assign(uint8_t* _data, uint32_t _len)
/* void assign(uint8_t* _data, uint32_t _len, internal_packing_t _internal_packing = internal_packing_t::none)
{
data = _data;
len = len;
}
len = _len;
internal_packing = _internal_packing;
}*/

uint8_t operator[](const uint32_t pos) const
{
if (pos & 1)
return data[pos / 2] & 0xf;
else
return data[pos / 2] >> 4;
switch (internal_packing)
{
case internal_packing_t::none:
return data[pos];
break;
case internal_packing_t::two_in_byte:
if (pos & 1)
return data[pos / 2] & 0xf;
else
return data[pos / 2] >> 4;
break;
case internal_packing_t::three_in_byte:
uint32_t pos_div_3 = pos / 3;

switch (pos - 3 * pos_div_3)
{
case 0:
return data[pos_div_3] / 36;
case 1:
return (data[pos_div_3] / 6) % 6;
case 2:
return data[pos_div_3] % 6;
}
break;
}

return 0; // Never should be here
}

uint32_t size() const
{
return len;
}

static void pack(uint8_t* dest, uint8_t* src, uint32_t len)
/* static void pack2(uint8_t* dest, uint8_t* src, uint32_t len)
{
for (uint32_t i = 0; i < len / 2; ++i)
dest[i] = (src[2 * i] << 4) + src[2 * i + 1];
Expand All @@ -65,7 +92,7 @@ class seq_view
dest[len / 2] = src[len - 1] << 4;
}
static void unpack(uint8_t * dest, uint8_t * src, uint32_t len)
static void unpack2(uint8_t* dest, uint8_t* src, uint32_t len)
{
for (uint32_t i = 0; i < len / 2; ++i)
{
Expand All @@ -76,6 +103,50 @@ class seq_view
if (len & 1)
dest[len - 1] = src[len / 2] >> 4;
}
static void pack3(uint8_t* dest, uint8_t* src, uint32_t len)
{
for (uint32_t i = 0; i < len / 3; ++i)
dest[i] = 36 * src[3 * i] + 6 * src[3 * i + 1] + src[3 * i + 2];
switch (len % 3)
{
case 2:
dest[len / 3] = 6 * src[len - 2] + src[len - 1];
break;
case 1:
dest[len / 3] = src[len - 1];
break;
case 0:
// Nothing
break;
}
}
static void unpack3(uint8_t * dest, uint8_t * src, uint32_t len)
{
uint32_t len_div_3 = len / 3;
for (uint32_t i = 0; i < len_div_3; ++i)
{
dest[3 * i] = src[i] / 36;
dest[3 * i + 1] = src[i] / 6 - 6 * dest[3 * i];
dest[3 * i + 2] = src[i] - 36 * dest[3 * i] - 6 * dest[3 * i + 1];
}
switch (len % 3)
{
case 2:
dest[len - 2] = src[len_div_3] / 6;
dest[len - 1] = src[len_div_3] - 6 * dest[len - 2];
break;
case 1:
dest[len - 1] = src[len_div_3];
case 0:
// Nothing
break;
}
}*/
};

class CSeqReservoir
Expand Down Expand Up @@ -116,21 +187,33 @@ class CSeqReservoir
refresh::memory_monotonic_unsafe mma_seq;
refresh::memory_monotonic_unsafe mma_name;

internal_packing_t internal_packing = internal_packing_t::none;
alphabet_t alphabet = alphabet_t::dna;

array<uint8_t, 256> dna_code;

void append(const string& name, const string& seq);

public:
CSeqReservoir() :
CSeqReservoir(internal_packing_t internal_packing = internal_packing_t::none /*, alphabet_t alphabet = alphabet_t::dna*/) :
mma_seq(32 << 20, 16),
mma_name(1 << 20, 16),
internal_packing(internal_packing),
// alphabet(alphabet),
dna_code{}
{
fill(dna_code.begin(), dna_code.end(), code_N_seq);
dna_code['A'] = dna_code['a'] = code_A;
dna_code['C'] = dna_code['c'] = code_C;
dna_code['G'] = dna_code['g'] = code_G;
dna_code['T'] = dna_code['t'] = code_T;
if (alphabet == alphabet_t::dna)
{
fill(dna_code.begin(), dna_code.end(), code_N_seq);
dna_code['A'] = dna_code['a'] = code_A;
dna_code['C'] = dna_code['c'] = code_C;
dna_code['G'] = dna_code['g'] = code_G;
dna_code['T'] = dna_code['t'] = code_T;
}
else if (alphabet == alphabet_t::aminoacid)
{
assert(0);
}
}

bool load_fasta(const vector<string>& fasta_files, uint32_t sep_len, uint32_t verbosity_level);
Expand Down

0 comments on commit 45bab7f

Please sign in to comment.