Skip to content

Commit

Permalink
Merge pull request #27 from SGSSGene/feat/rbifmindex
Browse files Browse the repository at this point in the history
Feat/rbifmindex
  • Loading branch information
SGSSGene authored Sep 4, 2023
2 parents accc333 + 004f7b3 commit a19d898
Show file tree
Hide file tree
Showing 6 changed files with 425 additions and 2 deletions.
155 changes: 155 additions & 0 deletions src/fmindex-collection/RBiFMIndex.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file.
// -----------------------------------------------------------------------------------------------------
#pragma once

#include "CSA.h"
#include "occtable/concepts.h"
#include "utils.h"

#include <algorithm>

namespace fmindex_collection {

/*!\brief FM-index consisting of one occurrence table (`occ`) and a sampled
 *        suffix array (`csa`).
 *
 * \tparam Table occurrence table type, must satisfy the `OccTable` concept
 * \tparam TCSA  type of the sampled suffix array, `CSA` by default
 *
 * NOTE(review): judging by the name and by `createSequencesAndReverse`, the
 * indexed text presumably also contains the reversed input so that a single
 * table can serve both search directions — confirm against utils.h.
 */
template <OccTable Table, typename TCSA = CSA>
struct RBiFMIndex {
    static size_t constexpr Sigma = Table::Sigma;

    using TTable = Table;

    Table occ;
    TCSA  csa;

//private:
    /*!\brief Builds the index from an already computed BWT and suffix array sampling.
     *
     * After constructing `occ` from `bwt`, the rank of every symbol at the
     * last row is compared against the cumulative symbol counts of `bwt`.
     * If the table offers `hasValue`, every row that has a sample inside
     * `csa` is additionally marked through `occ.setValue`.
     *
     * \param bwt  the Burrows-Wheeler transform, every entry must be smaller than `Sigma`
     * \param _csa the sampled suffix array, moved into the index
     * \throws std::runtime_error if `bwt` holds a symbol outside of the alphabet
     *         or if the table reports unexpected ranks for the last row
     */
    RBiFMIndex(std::span<uint8_t const> bwt, TCSA _csa)
        : occ{bwt}
        , csa{std::move(_csa)}
    {
        // compute last row: histogram of the symbols ...
        auto ct = std::array<uint64_t, Sigma>{};
        for (auto v : bwt) {
            // guard the array access, a symbol >= Sigma would write out of bounds
            if (static_cast<size_t>(v) >= Sigma) {
                auto e = std::string{"Symbol in BWT is outside of the alphabet."}
                            + " sym: " + std::to_string(v)
                            + " sigma: " + std::to_string(Sigma);
                throw std::runtime_error(e);
            }
            ct[v] += 1;
        }
        // ... turned into inclusive prefix sums
        for (size_t i{1}; i < ct.size(); ++i) {
            ct[i] = ct[i-1] + ct[i];
        }
        // check last row is correct
        for (size_t sym{0}; sym < Sigma; ++sym) {
            if (occ.rank(occ.size(), sym) != ct[sym]) {
                auto e = std::string{"Wrong rank for the last entry."}
                            + " Got different values for forward index."
                            + " sym: " + std::to_string(sym)
                            + " got: " + std::to_string(occ.rank(occ.size(), sym))
                            + " expected: " + std::to_string(ct[sym]);
                throw std::runtime_error(e);
            }
        }
        // tables that can flag rows get told which rows carry a sample in csa
        if constexpr (requires(Table t) {{ t.hasValue(size_t{}) }; }) {
            for (size_t i{0}; i < occ.size(); ++i) {
                if (csa.value(i).has_value()) {
                    occ.setValue(i);
                }
            }
        }
    }

public:
    /*!\brief Creates a RBiFMIndex with a specified sampling rate
     *
     * \param _input a list of sequences
     * \param samplingRate rate of the sampling
     * \param threadNbr value forwarded to `createSA` (presumably the number of
     *        threads used for the suffix array construction)
     */
    RBiFMIndex(Sequences auto const& _input, size_t samplingRate, size_t threadNbr)
        : occ{cereal_tag{}}
        , csa{cereal_tag{}}
    {
        auto [totalSize, inputText, inputSizes] = createSequencesAndReverse(_input, samplingRate);

        // create BurrowsWheelerTransform and CompressedSuffixArray
        // (structured bindings are re-captured by name inside the lambda;
        //  the results use names that do not shadow the members)
        auto [bwt, sampledCsa] = [&, &inputText=inputText, &inputSizes=inputSizes] () {
            auto sa        = createSA(inputText, threadNbr);
            auto localBwt  = createBWT(inputText, sa);
            auto localCsa  = TCSA(std::move(sa), samplingRate, inputSizes);
            return std::make_tuple(std::move(localBwt), std::move(localCsa));
        }();

        decltype(inputText){}.swap(inputText); // inputText memory can be deleted

        *this = RBiFMIndex{bwt, std::move(sampledCsa)};
    }


    /*!\brief Specific c'tor for serialization use
     */
    RBiFMIndex(cereal_tag)
        : occ{cereal_tag{}}
        , csa{cereal_tag{}}
    {}

    /*!\brief Sum of the memory usage reported by `occ` and `csa`.
     *
     * Only available if the table satisfies `OccTableMemoryUsage`.
     */
    size_t memoryUsage() const requires OccTableMemoryUsage<Table> {
        return occ.memoryUsage() + csa.memoryUsage();
    }

    /*!\brief Number of rows of the occurrence table.
     */
    size_t size() const {
        return occ.size();
    }

    /*!\brief Resolves a row of the index to the entry stored in the suffix array sampling.
     *
     * Walks from row to row through the occurrence table until a sampled row
     * is reached and adds the number of steps taken to the second tuple element.
     *
     * \param idx row inside the index
     * \return the sampled tuple (named `chr`, `pos` below — presumably sequence
     *         id and position) with `pos` advanced by the number of steps
     */
    auto locate(size_t idx) const -> std::tuple<size_t, size_t> {
        if constexpr (requires(Table t) {{ t.hasValue(size_t{}) }; }) {
            // the table itself knows which rows are sampled, csa is only consulted once
            bool v = occ.hasValue(idx);
            uint64_t steps{};
            while(!v) {
                idx = occ.rank_symbol(idx);
                steps += 1;
                v = occ.hasValue(idx);
            }
            // csa.value returns an optional (see the constructor), it has to be
            // dereferenced before it can be destructured
            auto [chr, pos] = *csa.value(idx);
            return {chr, pos+steps};

        } else {
            auto opt = csa.value(idx);
            uint64_t steps{};
            while(!opt) {
                // use the combined operation if the table offers it
                if constexpr (requires(Table t) { { t.rank_symbol(size_t{}) }; }) {
                    idx = occ.rank_symbol(idx);
                } else {
                    idx = occ.rank(idx, occ.symbol(idx));
                }
                steps += 1;
                opt = csa.value(idx);
            }
            auto [chr, pos] = *opt;
            return {chr, pos+steps};
        }
    }

    /*!\brief Same as `locate(idx)` but gives up after `maxSteps` steps.
     *
     * \param idx row inside the index
     * \param maxSteps maximal number of steps through the occurrence table
     * \return the located tuple, or an empty optional if no sampled row was
     *         reached within `maxSteps` steps
     */
    auto locate(size_t idx, size_t maxSteps) const -> std::optional<std::tuple<size_t, size_t>> {
        auto opt = csa.value(idx);
        uint64_t steps{};
        for (;!opt and maxSteps > 0; --maxSteps) {
            idx = occ.rank(idx, occ.symbol(idx));
            steps += 1;
            opt = csa.value(idx);
        }
        if (opt) {
            std::get<1>(*opt) += steps;
        }
        return opt;
    }


    /*!\brief Looks up `idx` in the suffix array sampling without walking the index.
     *
     * \return the sample stored for `idx`, or an empty optional if there is none
     */
    auto single_locate_step(size_t idx) const -> std::optional<std::tuple<size_t, size_t>> {
        return csa.value(idx);
    }


    /*!\brief (De-)serializes `occ` and `csa` through the given archive.
     */
    template <typename Archive>
    void serialize(Archive& ar) {
        ar(occ, csa);
    }
};

}
196 changes: 196 additions & 0 deletions src/fmindex-collection/RBiFMIndexCursor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file.
// -----------------------------------------------------------------------------------------------------
#pragma once

#include "RBiFMIndex.h"

namespace fmindex_collection {

// Forward declaration, the definition follows further down in this header.
template <typename Index>
struct LeftRBiFMIndexCursor;

/*!\brief Cursor over an index that can be extended to the left and to the right.
 *
 * A cursor describes `len` consecutive rows starting at `lb`; `lbRev` is the
 * start of the corresponding range used for extensions to the right. Both
 * directions query the same table `index->occ`.
 *
 * NOTE(review): all `if constexpr (OccTablePrefetch<Index>)` checks below are
 * evaluated on the index type, while `prefetch` is called on `index->occ`.
 * Confirm this is intended — if the concept only holds for table types, the
 * prefetch calls are never compiled in.
 *
 * \tparam Index index type providing `Sigma`, `occ` and `size()`
 */
template <typename Index>
struct RBiFMIndexCursor {
    static constexpr size_t Sigma = Index::Sigma;
    static constexpr bool Reversed = false;

    // every member carries an initializer, so a default constructed cursor
    // never exposes indeterminate values to operator==, std::hash or count()
    Index const* index{};
    size_t lb{};
    size_t lbRev{};
    size_t len{};

    //! Creates a cursor without index that covers no rows.
    RBiFMIndexCursor() noexcept = default;

    //! Creates a cursor that covers all rows of `index`.
    RBiFMIndexCursor(Index const& index) noexcept
        : RBiFMIndexCursor{index, 0, 0, index.size()}
    {}

    //! Creates a cursor from explicit range values.
    RBiFMIndexCursor(Index const& index, size_t lb, size_t lbRev, size_t len) noexcept
        : index{&index}
        , lb{lb}
        , lbRev{lbRev}
        , len{len}
    {}

    //! Two cursors are equal if `lb` and `len` match (`lbRev` and `index` are not compared).
    bool operator==(RBiFMIndexCursor const& _other) const noexcept {
        return lb == _other.lb
            && len == _other.len;
    }

    //! True if the cursor covers no rows.
    bool empty() const {
        return len == 0;
    }

    //! Number of rows covered by the cursor.
    size_t count() const {
        return len;
    }

    /*!\brief Extends the cursor to the left by every symbol at once.
     *
     * \return one cursor per symbol, entry `i` is the extension by symbol `i`
     */
    auto extendLeft() const -> std::array<RBiFMIndexCursor, Sigma> {
        auto const& occ = index->occ;
        if constexpr (OccTablePrefetch<Index>) {
            occ.prefetch(lb+len);
        }
        // rs*: values per symbol, prs*: values summed over all symbols up to
        // and including that symbol (presumably rank and prefix rank)
        auto [rs1, prs1] = occ.all_ranks(lb);
        auto [rs2, prs2] = occ.all_ranks(lb+len);

        auto cursors = std::array<RBiFMIndexCursor, Sigma>{};
        // symbol 0 has no smaller symbols, lbRev stays untouched
        cursors[0] = RBiFMIndexCursor{*index, rs1[0], lbRev, rs2[0] - rs1[0]};
        cursors[0].prefetchLeft();
        for (size_t i{1}; i < Sigma; ++i) {
            // lbRev moves forward by the rows inside the range that carry a smaller symbol
            cursors[i] = RBiFMIndexCursor{*index, rs1[i], lbRev + prs2[i-1] - prs1[i-1], rs2[i] - rs1[i]};
        }
        return cursors;
    }

    /*!\brief Extends the cursor to the right by every symbol at once.
     *
     * Mirror image of `extendLeft()`: the table is queried at `lbRev` and `lb`
     * is the value being adjusted.
     *
     * \return one cursor per symbol, entry `i` is the extension by symbol `i`
     */
    auto extendRight() const -> std::array<RBiFMIndexCursor, Sigma> {
        auto const& occ = index->occ;
        if constexpr (OccTablePrefetch<Index>) {
            occ.prefetch(lbRev+len);
        }
        auto [rs1, prs1] = occ.all_ranks(lbRev);
        auto [rs2, prs2] = occ.all_ranks(lbRev+len);

        auto cursors = std::array<RBiFMIndexCursor, Sigma>{};
        cursors[0] = RBiFMIndexCursor{*index, lb, rs1[0], rs2[0] - rs1[0]};
        cursors[0].prefetchRight();
        for (size_t i{1}; i < Sigma; ++i) {
            cursors[i] = RBiFMIndexCursor{*index, lb + prs2[i-1] - prs1[i-1], rs1[i], rs2[i] - rs1[i]};
        }
        return cursors;
    }

    //! Prefetches the table entries a following `extendLeft` will access.
    void prefetchLeft() const {
        if constexpr (OccTablePrefetch<Index>) {
            auto& occ = index->occ;
            occ.prefetch(lb);
            occ.prefetch(lb+len);
        }
    }

    //! Prefetches the table entries a following `extendRight` will access.
    void prefetchRight() const {
        if constexpr (OccTablePrefetch<Index>) {
            auto& occ = index->occ;
            occ.prefetch(lbRev);
            occ.prefetch(lbRev+len);
        }
    }

    /*!\brief Extends the cursor to the left by a single symbol.
     *
     * \param symb symbol to extend with, must be larger than 0 (asserted,
     *             `symb-1` is used for the prefix rank)
     * \return the extended cursor
     */
    auto extendLeft(size_t symb) const -> RBiFMIndexCursor {
        assert(symb > 0);
        auto& occ = index->occ;
        size_t newLb    = occ.rank(lb, symb);
        size_t newLbRev = lbRev + occ.prefix_rank(lb+len, symb-1) - occ.prefix_rank(lb, symb-1);
        size_t newLen   = occ.rank(lb+len, symb) - newLb;
        auto newCursor = RBiFMIndexCursor{*index, newLb, newLbRev, newLen};
        newCursor.prefetchLeft();
        return newCursor;
    }

    /*!\brief Extends the cursor to the right by a single symbol.
     *
     * \param symb symbol to extend with, must be larger than 0 (asserted,
     *             `symb-1` is used for the prefix rank)
     * \return the extended cursor
     */
    auto extendRight(size_t symb) const -> RBiFMIndexCursor {
        assert(symb > 0);
        auto& occ = index->occ;
        size_t newLb    = lb + occ.prefix_rank(lbRev+len, symb-1) - occ.prefix_rank(lbRev, symb-1);
        size_t newLbRev = occ.rank(lbRev, symb);
        size_t newLen   = occ.rank(lbRev+len, symb) - newLbRev;
        auto newCursor = RBiFMIndexCursor{*index, newLb, newLbRev, newLen};
        newCursor.prefetchRight();
        return newCursor;
    }
};

//! First row covered by the cursor (its `lb`).
template <typename Index>
auto begin(RBiFMIndexCursor<Index> const& _cursor) {
    auto const firstRow = _cursor.lb;
    return firstRow;
}
//! One past the last row covered by the cursor (`lb + len`).
template <typename Index>
auto end(RBiFMIndexCursor<Index> const& _cursor) {
    auto const firstRow = _cursor.lb;
    auto const rowCount = _cursor.len;
    return firstRow + rowCount;
}

/*!\brief Cursor that can only be extended to the left.
 *
 * Stores just `lb` and `len`; compared to RBiFMIndexCursor no `lbRev` is
 * tracked and no prefix ranks are evaluated.
 *
 * NOTE(review): `OccTablePrefetch<Index>` is evaluated on the index type while
 * `prefetch` is called on `index->occ` — confirm this is intended.
 *
 * \tparam Index index type providing `Sigma`, `occ` and `size()`
 */
template <typename Index>
struct LeftRBiFMIndexCursor {
    static constexpr size_t Sigma = Index::Sigma;
    static constexpr bool Reversed = false;

    // initializers make sure a default constructed cursor reports empty()
    // instead of reading indeterminate values
    Index const* index{};
    size_t lb{};
    size_t len{};

    //! Drops the `lbRev` part of a bidirectional cursor.
    LeftRBiFMIndexCursor(RBiFMIndexCursor<Index> const& _other)
        : index{_other.index}
        , lb{_other.lb}
        , len{_other.len}
    {}

    //! Creates a cursor without index that covers no rows.
    LeftRBiFMIndexCursor()
        : index{nullptr}
    {}

    //! Creates a cursor that covers all rows of `index`.
    LeftRBiFMIndexCursor(Index const& index)
        : LeftRBiFMIndexCursor{index, 0, index.size()}
    {}

    //! Creates a cursor from explicit range values.
    LeftRBiFMIndexCursor(Index const& index, size_t lb, size_t len)
        : index{&index}
        , lb{lb}
        , len{len}
    {}

    //! True if the cursor covers no rows.
    bool empty() const {
        return len == 0;
    }

    //! Number of rows covered by the cursor.
    size_t count() const {
        return len;
    }

    /*!\brief Extends the cursor to the left by every symbol at once.
     *
     * \return one cursor per symbol, entry `i` is the extension by symbol `i`
     */
    auto extendLeft() const -> std::array<LeftRBiFMIndexCursor, Sigma> {
        auto const& occ = index->occ;
        // only the first element of each result is needed here
        auto [rs1, prs1] = occ.all_ranks(lb);
        auto [rs2, prs2] = occ.all_ranks(lb+len);

        auto cursors = std::array<LeftRBiFMIndexCursor, Sigma>{};
        // the same formula holds for every symbol, including symbol 0
        for (size_t i{0}; i < Sigma; ++i) {
            cursors[i] = LeftRBiFMIndexCursor{*index, rs1[i], rs2[i] - rs1[i]};
        }
        return cursors;
    }

    /*!\brief Extends the cursor to the left by a single symbol.
     *
     * \param symb symbol to extend with, must be larger than 0 (asserted)
     * \return the extended cursor
     */
    auto extendLeft(size_t symb) const -> LeftRBiFMIndexCursor {
        assert(symb > 0);
        auto& occ = index->occ;

        size_t newLb  = occ.rank(lb, symb);
        size_t newLen = occ.rank(lb+len, symb) - newLb;
        // warm up the entries the next extension of the new cursor will touch
        if constexpr (OccTablePrefetch<Index>) {
            occ.prefetch(newLb);
            occ.prefetch(newLb + newLen);
        }

        auto newCursor = LeftRBiFMIndexCursor{*index, newLb, newLen};
        return newCursor;
    }
};
}

namespace std {

/*!\brief Hash support for RBiFMIndexCursor.
 *
 * Combines the hashes of `lb` and `len` with XOR, matching the members that
 * `operator==` compares. XOR is symmetric, so swapping the values of `lb`
 * and `len` yields the same hash.
 */
template <typename index_t>
struct hash<fmindex_collection::RBiFMIndexCursor<index_t>> {
    auto operator()(fmindex_collection::RBiFMIndexCursor<index_t> const& cursor) const -> size_t {
        auto const lbHash  = hash<size_t>()(cursor.lb);
        auto const lenHash = hash<size_t>()(cursor.len);
        return lbHash ^ lenHash;
    }
};

}
2 changes: 2 additions & 0 deletions src/fmindex-collection/fmindex-collection.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include "BiFMIndex.h"
#include "BiFMIndex_32.h"
#include "BiFMIndexCursor.h"
#include "RBiFMIndex.h"
#include "RBiFMIndexCursor.h"
#include "ReverseFMIndex.h"
#include "ReverseFMIndexCursor.h"
#include "CSA.h"
Expand Down
4 changes: 2 additions & 2 deletions src/fmindex-collection/search/SearchNg21.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ template <typename index_t, typename search_scheme_t, typename delegate_t>
struct Search {
constexpr static size_t Sigma = index_t::Sigma;

using cursor_t = BiFMIndexCursor<index_t>;
using cursor_t = select_cursor_t<index_t>;
using BlockIter = typename search_scheme_t::const_iterator;

index_t const& index;
Expand Down Expand Up @@ -163,7 +163,7 @@ struct Search {

template <typename index_t, typename query_t, typename search_scheme_t, typename search_scheme_reordered_t, typename delegate_t>
void search_reordered(index_t const& index, query_t&& query, search_scheme_t const& search_scheme, search_scheme_reordered_t& reordered, delegate_t&& delegate) {
using cursor_t = BiFMIndexCursor<index_t>;
using cursor_t = select_cursor_t<index_t>;
using R = std::decay_t<decltype(delegate(std::declval<cursor_t>(), 0))>;

auto internal_delegate = [&]() {
Expand Down
Loading

0 comments on commit a19d898

Please sign in to comment.