Skip to content

Commit

Permalink
Merge pull request #55 from SGSSGene/patch/merge
Browse files (browse the repository at this point in the history)
Patch/merge
  • Loading branch information
SGSSGene authored Aug 2, 2024
2 parents ab6ea0e + 398f7f7 commit e7c8844
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 28 deletions.
2 changes: 1 addition & 1 deletion fmindex_collection-config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ endif()

set(LIBSAIS_USE_OPENMP ${OpenMP_C_FOUND})
set(LIBSAIS_BUILD_SHARED_LIB OFF)
CPMAddPackage("gh:IlyaGrebnov/libsais@2.8.4")
CPMAddPackage("gh:IlyaGrebnov/libsais@2.8.5")

if (FMC_USE_SDSL)
CPMAddPackage("gh:SGSSGene/sdsl-lite@3.0.3-2")
Expand Down
6 changes: 6 additions & 0 deletions src/fmindex-collection/fmindex/FMIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ struct FMIndex {

FMIndex(Sequences auto const& _input, size_t samplingRate, size_t threadNbr) {
auto [totalSize, inputText, inputSizes] = createSequences(_input);
assert([&]() {
for (auto c : inputText) {
if (c >= Sigma) return false;
}
return true;
}());

auto [bwt, csa] = [&, &inputText=inputText, &inputSizes=inputSizes] () {
auto sa = createSA(inputText, threadNbr);
Expand Down
8 changes: 8 additions & 0 deletions src/fmindex-collection/fmindex/RBiFMIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,14 @@ struct RBiFMIndex {
RBiFMIndex(Sequences auto const& _input, size_t samplingRate, size_t threadNbr) {
auto [totalSize, inputText, inputSizes] = createSequencesAndReverse(_input);

// Check only valid characters are used
assert([&]() {
for (auto c : inputText) {
if (c >= Sigma) return false;
}
return true;
}());

// create BurrowsWheelerTransform and CompressedSuffixArray
auto [bwt, csa] = [&, &inputText=inputText, &inputSizes=inputSizes] () {
auto sa = createSA(inputText, threadNbr);
Expand Down
51 changes: 34 additions & 17 deletions src/fmindex-collection/fmindex/merge.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@
#include "FMIndex.h"
#include "BiFMIndex.h"

#include <cassert>
#include <type_traits>
#include <vector>

namespace fmindex_collection {

/**
* creates the R array for interleaving FMIndices
* creates the R array for interleaving FM-Indices
*/
template <typename OccLhs, typename OccRhs, typename value_t = size_t>
auto computeInterleavingR(OccLhs const& lhsOcc, OccRhs const& rhsOcc) -> std::vector<bool> {
Expand All @@ -21,18 +22,29 @@ auto computeInterleavingR(OccLhs const& lhsOcc, OccRhs const& rhsOcc) -> std::ve
}

auto R = std::vector<bool>{};
// auto R = std::vector<value_t>{};
R.resize(lhsOcc.size() + rhsOcc.size(), false);

size_t idx1{};
size_t idx2{};

for (size_t i{0}; i < rhsOcc.size(); ++i) {
auto c = rhsOcc.symbol(idx2);
idx1 = lhsOcc.rank(idx1, c);
idx2 = rhsOcc.rank(idx2, c);
R[idx1 + idx2] = true;
auto nbrOfSeqRhs = rhsOcc.rank(rhsOcc.size(), 0);
for (size_t n{}; n < nbrOfSeqRhs; ++n) {
size_t idx1{};
size_t idx2{n};
uint8_t c{};
do {
assert(idx1 + idx2 < R.size());
assert(R[idx1 + idx2] == false);
R[idx1 + idx2] = true;
c = rhsOcc.symbol(idx2);
idx1 = lhsOcc.rank(idx1, c);
idx2 = rhsOcc.rank(idx2, c);
} while(c != 0);
}
assert([&]() {
size_t a{};
for (auto b : R) {
a += b;
}
return a;
}() == rhsOcc.size());
return R;
}

Expand Down Expand Up @@ -64,27 +76,28 @@ auto mergeImpl(FMIndex<OccLhs, TCSA> const& index1, FMIndex<OccRhs, TCSA> const&
size_t idx1{}, idx2{};
for (bool v : R) {
if (!v) {
assert(idx1 < index1.occ.size());
mergedBWT.push_back(index1.occ.symbol(idx1));
addSSAEntry(index1, idx1, seqOffset1);
idx1 += 1;
} else {
assert(idx2 < index2.occ.size());
mergedBWT.push_back(index2.occ.symbol(idx2));
addSSAEntry(index2, idx2, seqOffset2);
idx2 += 1;
}
}
R.clear();


return {mergedBWT, std::move(csa)};
}

/**
 * Merges two FMIndex objects into a single FMIndex by delegating to mergeImpl.
 *
 * @tparam Res   Occ type of the returned index; when left at its default
 *               (void) the declared return type falls back to OccLhs.
 * @param index1 first index to merge
 * @param index2 second index to merge
 * @return the value produced by mergeImpl, returned as
 *         FMIndex<Res-or-OccLhs, TCSA>
 *
 * The larger index (by size()) is passed to mergeImpl first. The last two
 * arguments are presumably the sequence-id offsets (seqOffset1/seqOffset2)
 * that mergeImpl hands to addSSAEntry; `occ.rank(size(), 0)` is the same
 * expression reconstructText() in utils.h names `nbrOfSeq`.
 *
 * NOTE(review): this text comes from a diff rendered without +/- markers.
 * The commented-out lines below look like the commit's replacement for the
 * live `if`/second `return` (i.e. the swap is being disabled) -- TODO confirm
 * against the repository before relying on the swapping behaviour.
 */
template <typename Res = void, typename OccLhs, typename OccRhs, typename TCSA>
auto merge(FMIndex<OccLhs, TCSA> const& index1, FMIndex<OccRhs, TCSA> const& index2) -> FMIndex<std::conditional_t<std::is_void_v<Res>, OccLhs, Res>, TCSA> {
// index1 is at least as large: keep the argument order, index1 gets offset 0
// and index2 is offset by the count of symbol 0 in index1.
if (index1.size() >= index2.size()) {
// if (index1.size() >= index2.size()) {
return mergeImpl<Res>(index1, index2, 0, index1.occ.rank(index1.size(), 0));
}
// index2 is larger: swap the argument order.
// NOTE(review): here the first index passed (index2) is offset by its *own*
// count of symbol 0 while index1 gets 0; the non-swapped call offsets the
// other index by index1's count. Confirm these offsets cannot collide when
// the two indices hold a different number of sequences.
return mergeImpl<Res>(index2, index1, index2.occ.rank(index2.size(), 0), 0);
// }
// return mergeImpl<Res>(index2, index1, index2.occ.rank(index2.size(), 0), 0);
}


Expand All @@ -111,10 +124,12 @@ auto mergeImpl(BiFMIndex<OccLhs, TCSA> const& index1, BiFMIndex<OccRhs, TCSA> co
size_t idx1{}, idx2{};
for (bool v : R) {
if (!v) {
assert(idx1 < index1.occ.size());
mergedBWT.push_back(index1.occ.symbol(idx1));
addSSAEntry(index1, idx1, seqOffset1);
idx1 += 1;
} else {
assert(idx2 < index2.occ.size());
mergedBWT.push_back(index2.occ.symbol(idx2));
addSSAEntry(index2, idx2, seqOffset2);
idx2 += 1;
Expand All @@ -132,9 +147,11 @@ auto mergeImpl(BiFMIndex<OccLhs, TCSA> const& index1, BiFMIndex<OccRhs, TCSA> co
size_t idx1{}, idx2{};
for (bool v : R) {
if (!v) {
assert(idx1 < index1.occRev.size());
mergedBWTRev.push_back(index1.occRev.symbol(idx1));
idx1 += 1;
} else {
assert(idx2 < index2.occRev.size());
mergedBWTRev.push_back(index2.occRev.symbol(idx2));
idx2 += 1;
}
Expand All @@ -146,10 +163,10 @@ auto mergeImpl(BiFMIndex<OccLhs, TCSA> const& index1, BiFMIndex<OccRhs, TCSA> co

/**
 * Merges two BiFMIndex objects into a single BiFMIndex by delegating to
 * mergeImpl.
 *
 * @tparam Res   Occ type of the returned index; when left at its default
 *               (void) the declared return type falls back to OccLhs.
 * @param index1 first index to merge
 * @param index2 second index to merge
 * @return the value produced by mergeImpl, returned as
 *         BiFMIndex<Res-or-OccLhs, TCSA>
 *
 * The larger index (by size()) is passed to mergeImpl first; the last two
 * arguments are presumably the sequence-id offsets for the first and second
 * index respectively -- TODO confirm against mergeImpl's signature.
 *
 * NOTE(review): unlike the FMIndex overload of merge in this file, `Res` is
 * not forwarded to mergeImpl here although the declared return type depends
 * on it -- confirm that a non-void Res still compiles and is honoured.
 *
 * NOTE(review): this text comes from a diff rendered without +/- markers.
 * The commented-out lines below look like the commit's replacement for the
 * live `if`/second `return` (i.e. the swap is being disabled) -- TODO confirm.
 */
template <typename Res = void, typename OccLhs, typename OccRhs, typename TCSA>
auto merge(BiFMIndex<OccLhs, TCSA> const& index1, BiFMIndex<OccRhs, TCSA> const& index2) -> BiFMIndex<std::conditional_t<std::is_void_v<Res>, OccLhs, Res>, TCSA> {
// index1 is at least as large: keep the argument order, index1 gets offset 0
// and index2 is offset by the count of symbol 0 in index1.
if (index1.size() >= index2.size()) {
// if (index1.size() >= index2.size()) {
return mergeImpl(index1, index2, 0, index1.occ.rank(index1.size(), 0));
}
// index2 is larger: swap the argument order.
// NOTE(review): the first index passed (index2) is offset by its own count of
// symbol 0 while index1 gets 0 -- confirm the resulting ids cannot collide.
return mergeImpl(index2, index1, index2.occ.rank(index2.size(), 0), 0);
// }
// return mergeImpl(index2, index1, index2.occ.rank(index2.size(), 0), 0);
}

}
9 changes: 8 additions & 1 deletion src/fmindex-collection/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,10 +235,17 @@ template <typename Index>
/**
 * Reconstructs every text stored in `index` and returns them ordered by
 * sequence id.
 *
 * The number of texts is the count of symbol 0 in the whole BWT
 * (`index.occ.rank(index.size(), 0)`). Text `i` is rebuilt with the
 * two-argument `reconstructText(index, i)` overload; its sequence id is taken
 * from the first tuple element of `index.locate(i)`.
 *
 * Row `i` of the index does not have to belong to sequence `i` (e.g. after
 * merging two indices), so the texts are re-ordered by sequence id before
 * being returned. A previous early `return texts;` skipped this step and left
 * the sorting code unreachable.
 *
 * @param index index to read from; must provide `occ.rank`, `size` and `locate`
 * @return one reconstructed text per sequence, sorted ascending by sequence id
 */
auto reconstructText(Index const& index) -> std::vector<std::vector<uint8_t>> {
    auto nbrOfSeq = index.occ.rank(index.size(), 0);

    auto texts = std::vector<std::vector<uint8_t>>{};
    auto seqIds = std::vector<std::tuple<size_t, size_t>>{};
    texts.reserve(nbrOfSeq);
    seqIds.reserve(nbrOfSeq);
    for (size_t i{}; i < nbrOfSeq; ++i) {
        texts.push_back(reconstructText(index, i));
        // remember which sequence id row i belongs to, paired with its slot in `texts`
        seqIds.emplace_back(std::get<0>(index.locate(i)), i);
    }

    // order by sequence id (ties, if any, fall back to the row number)
    std::ranges::sort(seqIds);

    auto res = std::vector<std::vector<uint8_t>>{};
    res.reserve(seqIds.size());
    for (auto [seqId, idx] : seqIds) {
        // each idx occurs exactly once, so moving out of texts[idx] is safe
        res.emplace_back(std::move(texts[idx]));
    }
    return res;
}

}
18 changes: 9 additions & 9 deletions src/test_fmindex-collection/fmindex/checkMerge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ TEST_CASE("checking merging of fmindices", "[FMIndex][merge]") {
{ 2, 8, 18},
};
auto expectedSA = std::vector<std::tuple<size_t, size_t>> {
{0, 8},
{1, 8},
{0, 8},
{0, 0},
{0, 1},
{0, 2},
Expand Down Expand Up @@ -77,8 +77,8 @@ TEST_CASE("checking merging of fmindices", "[FMIndex][merge]") {

auto texts = reconstructText(index12);
REQUIRE(texts.size() == 2);
CHECK(texts[0] == data2[0]);
CHECK(texts[1] == data1[0]);
CHECK(texts[0] == data1[0]);
CHECK(texts[1] == data2[0]);
}

TEST_CASE("checking merging of fmindices", "[BiFMIndex][merge]") {
Expand Down Expand Up @@ -118,8 +118,8 @@ TEST_CASE("checking merging of fmindices", "[BiFMIndex][merge]") {
{ 2, 8, 18},
};
auto expectedSA = std::vector<std::tuple<size_t, size_t>> {
{0, 8},
{1, 8},
{0, 8},
{0, 0},
{0, 1},
{0, 2},
Expand Down Expand Up @@ -153,8 +153,8 @@ TEST_CASE("checking merging of fmindices", "[BiFMIndex][merge]") {

auto texts = reconstructText(index12);
REQUIRE(texts.size() == 2);
CHECK(texts[0] == data2[0]);
CHECK(texts[1] == data1[0]);
CHECK(texts[0] == data1[0]);
CHECK(texts[1] == data2[0]);

SECTION("merging index12 and index3 into index123") {
auto index123 = merge(index12, index3);
Expand Down Expand Up @@ -189,9 +189,9 @@ TEST_CASE("checking merging of fmindices", "[BiFMIndex][merge]") {
{3, 13, 27},
};
auto expectedSA = std::vector<std::tuple<size_t, size_t>> {
{0, 8},
{2, 8},
{1, 8},
{0, 8},
{0, 0},
{0, 1},
{2, 4},
Expand Down Expand Up @@ -233,9 +233,9 @@ TEST_CASE("checking merging of fmindices", "[BiFMIndex][merge]") {

auto texts = reconstructText(index123);
REQUIRE(texts.size() == 3);
CHECK(texts[0] == data3[0]);
CHECK(texts[0] == data1[0]);
CHECK(texts[1] == data2[0]);
CHECK(texts[2] == data1[0]);
CHECK(texts[2] == data3[0]);
}
}
}

0 comments on commit e7c8844

Please sign in to comment.