-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #27 from SGSSGene/feat/rbifmindex
Feat/rbifmindex
- Loading branch information
Showing
6 changed files
with
425 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
// ----------------------------------------------------------------------------------------------------- | ||
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin | ||
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik | ||
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License | ||
// shipped with this file. | ||
// ----------------------------------------------------------------------------------------------------- | ||
#pragma once | ||
|
||
#include "CSA.h" | ||
#include "occtable/concepts.h" | ||
#include "utils.h" | ||
|
||
#include <algorithm> | ||
|
||
namespace fmindex_collection { | ||
|
||
template <OccTable Table, typename TCSA = CSA> | ||
struct RBiFMIndex { | ||
static size_t constexpr Sigma = Table::Sigma; | ||
|
||
using TTable = Table; | ||
|
||
Table occ; | ||
TCSA csa; | ||
|
||
//private: | ||
RBiFMIndex(std::span<uint8_t const> bwt, TCSA _csa) | ||
: occ{bwt} | ||
, csa{std::move(_csa)} | ||
{ | ||
// compute last row | ||
auto ct = std::array<uint64_t, Sigma>{}; | ||
for (auto v : bwt) { | ||
ct[v] += 1; | ||
} | ||
for (size_t i{1}; i < ct.size(); ++i) { | ||
ct[i] = ct[i-1] + ct[i]; | ||
} | ||
// check last row is correct | ||
for (size_t sym{0}; sym < Sigma; ++sym) { | ||
if (occ.rank(occ.size(), sym) != ct[sym]) { | ||
auto e = std::string{"Wrong rank for the last entry."} | ||
+ " Got different values for forward index." | ||
+ " sym: " + std::to_string(sym) | ||
+ " got: " + std::to_string(occ.rank(occ.size(), sym)) | ||
+ " expected: " + std::to_string(ct[sym]); | ||
throw std::runtime_error(e); | ||
} | ||
} | ||
if constexpr (requires(Table t) {{ t.hasValue(size_t{}) }; }) { | ||
for (size_t i{0}; i < occ.size(); ++i) { | ||
if (csa.value(i).has_value()) { | ||
occ.setValue(i); | ||
} | ||
} | ||
} | ||
} | ||
|
||
public: | ||
/**!\brief Creates a RBiFMIndex with a specified sampling rate | ||
* | ||
* \param _input a list of sequences | ||
* \param samplingRate rate of the sampling | ||
*/ | ||
RBiFMIndex(Sequences auto const& _input, size_t samplingRate, size_t threadNbr) | ||
: occ{cereal_tag{}} | ||
, csa{cereal_tag{}} | ||
{ | ||
auto [totalSize, inputText, inputSizes] = createSequencesAndReverse(_input, samplingRate); | ||
|
||
// create BurrowsWheelerTransform and CompressedSuffixArray | ||
auto [bwt, csa] = [&, &inputText=inputText, &inputSizes=inputSizes] () { | ||
auto sa = createSA(inputText, threadNbr); | ||
auto bwt = createBWT(inputText, sa); | ||
auto csa = TCSA(std::move(sa), samplingRate, inputSizes); | ||
return std::make_tuple(std::move(bwt), std::move(csa)); | ||
}(); | ||
|
||
decltype(inputText){}.swap(inputText); // inputText memory can be deleted | ||
|
||
*this = RBiFMIndex{bwt, std::move(csa)}; | ||
} | ||
|
||
|
||
/*!\brief Specific c'tor for serialization use | ||
*/ | ||
RBiFMIndex(cereal_tag) | ||
: occ{cereal_tag{}} | ||
, csa{cereal_tag{}} | ||
{} | ||
|
||
size_t memoryUsage() const requires OccTableMemoryUsage<Table> { | ||
return occ.memoryUsage() + csa.memoryUsage(); | ||
} | ||
|
||
size_t size() const { | ||
return occ.size(); | ||
} | ||
|
||
auto locate(size_t idx) const -> std::tuple<size_t, size_t> { | ||
if constexpr (requires(Table t) {{ t.hasValue(size_t{}) }; }) { | ||
bool v = occ.hasValue(idx); | ||
uint64_t steps{}; | ||
while(!v) { | ||
idx = occ.rank_symbol(idx); | ||
steps += 1; | ||
v = occ.hasValue(idx); | ||
} | ||
auto [chr, pos] = csa.value(idx); | ||
return {chr, pos+steps}; | ||
|
||
} else { | ||
auto opt = csa.value(idx); | ||
uint64_t steps{}; | ||
while(!opt) { | ||
if constexpr (requires(Table t) { { t.rank_symbol(size_t{}) }; }) { | ||
idx = occ.rank_symbol(idx); | ||
} else { | ||
idx = occ.rank(idx, occ.symbol(idx)); | ||
} | ||
steps += 1; | ||
opt = csa.value(idx); | ||
} | ||
auto [chr, pos] = *opt; | ||
return {chr, pos+steps}; | ||
} | ||
} | ||
|
||
auto locate(size_t idx, size_t maxSteps) const -> std::optional<std::tuple<size_t, size_t>> { | ||
auto opt = csa.value(idx); | ||
uint64_t steps{}; | ||
for (;!opt and maxSteps > 0; --maxSteps) { | ||
idx = occ.rank(idx, occ.symbol(idx)); | ||
steps += 1; | ||
opt = csa.value(idx); | ||
} | ||
if (opt) { | ||
std::get<1>(*opt) += steps; | ||
} | ||
return opt; | ||
} | ||
|
||
|
||
auto single_locate_step(size_t idx) const -> std::optional<std::tuple<size_t, size_t>> { | ||
return csa.value(idx); | ||
} | ||
|
||
|
||
template <typename Archive> | ||
void serialize(Archive& ar) { | ||
ar(occ, csa); | ||
} | ||
}; | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
// ----------------------------------------------------------------------------------------------------- | ||
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin | ||
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik | ||
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License | ||
// shipped with this file. | ||
// ----------------------------------------------------------------------------------------------------- | ||
#pragma once | ||
|
||
#include "RBiFMIndex.h" | ||
|
||
namespace fmindex_collection { | ||
|
||
template <typename Index> | ||
struct LeftRBiFMIndexCursor; | ||
|
||
template <typename Index> | ||
struct RBiFMIndexCursor { | ||
static constexpr size_t Sigma = Index::Sigma; | ||
static constexpr bool Reversed = false; | ||
|
||
Index const* index{}; | ||
size_t lb; | ||
size_t lbRev; | ||
size_t len{}; | ||
RBiFMIndexCursor() noexcept = default; | ||
RBiFMIndexCursor(Index const& index) noexcept | ||
: RBiFMIndexCursor{index, 0, 0, index.size()} | ||
{} | ||
RBiFMIndexCursor(Index const& index, size_t lb, size_t lbRev, size_t len) noexcept | ||
: index{&index} | ||
, lb{lb} | ||
, lbRev{lbRev} | ||
, len{len} | ||
{} | ||
|
||
bool operator==(RBiFMIndexCursor const& _other) const noexcept { | ||
return lb == _other.lb | ||
&& len == _other.len; | ||
} | ||
bool empty() const { | ||
return len == 0; | ||
} | ||
size_t count() const { | ||
return len; | ||
} | ||
auto extendLeft() const -> std::array<RBiFMIndexCursor, Sigma> { | ||
auto const& occ = index->occ; | ||
if constexpr (OccTablePrefetch<Index>) { | ||
occ.prefetch(lb+len); | ||
} | ||
auto [rs1, prs1] = occ.all_ranks(lb); | ||
auto [rs2, prs2] = occ.all_ranks(lb+len); | ||
|
||
auto cursors = std::array<RBiFMIndexCursor, Sigma>{}; | ||
cursors[0] = RBiFMIndexCursor{*index, rs1[0], lbRev, rs2[0] - rs1[0]}; | ||
cursors[0].prefetchLeft(); | ||
for (size_t i{1}; i < Sigma; ++i) { | ||
cursors[i] = RBiFMIndexCursor{*index, rs1[i], lbRev + prs2[i-1] - prs1[i-1], rs2[i] - rs1[i]}; | ||
} | ||
return cursors; | ||
} | ||
|
||
auto extendRight() const -> std::array<RBiFMIndexCursor, Sigma> { | ||
auto const& occ = index->occ; | ||
if constexpr (OccTablePrefetch<Index>) { | ||
occ.prefetch(lbRev+len); | ||
} | ||
auto [rs1, prs1] = occ.all_ranks(lbRev); | ||
auto [rs2, prs2] = occ.all_ranks(lbRev+len); | ||
|
||
auto cursors = std::array<RBiFMIndexCursor, Sigma>{}; | ||
cursors[0] = RBiFMIndexCursor{*index, lb, rs1[0], rs2[0] - rs1[0]}; | ||
cursors[0].prefetchRight(); | ||
for (size_t i{1}; i < Sigma; ++i) { | ||
cursors[i] = RBiFMIndexCursor{*index, lb + prs2[i-1] - prs1[i-1], rs1[i], rs2[i] - rs1[i]}; | ||
} | ||
return cursors; | ||
} | ||
void prefetchLeft() const { | ||
if constexpr (OccTablePrefetch<Index>) { | ||
auto& occ = index->occ; | ||
occ.prefetch(lb); | ||
occ.prefetch(lb+len); | ||
} | ||
} | ||
void prefetchRight() const { | ||
if constexpr (OccTablePrefetch<Index>) { | ||
auto& occ = index->occ; | ||
occ.prefetch(lbRev); | ||
occ.prefetch(lbRev+len); | ||
} | ||
} | ||
|
||
auto extendLeft(size_t symb) const -> RBiFMIndexCursor { | ||
assert(symb > 0); | ||
auto& occ = index->occ; | ||
size_t newLb = occ.rank(lb, symb); | ||
size_t newLbRev = lbRev + occ.prefix_rank(lb+len, symb-1) - occ.prefix_rank(lb, symb-1); | ||
size_t newLen = occ.rank(lb+len, symb) - newLb; | ||
auto newCursor = RBiFMIndexCursor{*index, newLb, newLbRev, newLen}; | ||
newCursor.prefetchLeft(); | ||
return newCursor; | ||
} | ||
auto extendRight(size_t symb) const -> RBiFMIndexCursor { | ||
assert(symb > 0); | ||
auto& occ = index->occ; | ||
size_t newLb = lb + occ.prefix_rank(lbRev+len, symb-1) - occ.prefix_rank(lbRev, symb-1); | ||
size_t newLbRev = occ.rank(lbRev, symb); | ||
size_t newLen = occ.rank(lbRev+len, symb) - newLbRev; | ||
auto newCursor = RBiFMIndexCursor{*index, newLb, newLbRev, newLen}; | ||
newCursor.prefetchRight(); | ||
return newCursor; | ||
} | ||
}; | ||
|
||
template <typename Index> | ||
auto begin(RBiFMIndexCursor<Index> const& _cursor) { | ||
return _cursor.lb; | ||
} | ||
template <typename Index> | ||
auto end(RBiFMIndexCursor<Index> const& _cursor) { | ||
return _cursor.lb + _cursor.len; | ||
} | ||
|
||
template <typename Index> | ||
struct LeftRBiFMIndexCursor { | ||
static constexpr size_t Sigma = Index::Sigma; | ||
static constexpr bool Reversed = false; | ||
|
||
Index const* index; | ||
size_t lb; | ||
size_t len; | ||
LeftRBiFMIndexCursor(RBiFMIndexCursor<Index> const& _other) | ||
: index{_other.index} | ||
, lb{_other.lb} | ||
, len{_other.len} | ||
{} | ||
LeftRBiFMIndexCursor() | ||
: index{nullptr} | ||
{} | ||
LeftRBiFMIndexCursor(Index const& index) | ||
: LeftRBiFMIndexCursor{index, 0, index.size()} | ||
{} | ||
LeftRBiFMIndexCursor(Index const& index, size_t lb, size_t len) | ||
: index{&index} | ||
, lb{lb} | ||
, len{len} | ||
{} | ||
bool empty() const { | ||
return len == 0; | ||
} | ||
size_t count() const { | ||
return len; | ||
} | ||
auto extendLeft() const -> std::array<LeftRBiFMIndexCursor, Sigma> { | ||
auto const& occ = index->occ; | ||
auto [rs1, prs1] = occ.all_ranks(lb); | ||
auto [rs2, prs2] = occ.all_ranks(lb+len); | ||
|
||
auto cursors = std::array<LeftRBiFMIndexCursor, Sigma>{}; | ||
cursors[0] = LeftRBiFMIndexCursor{*index, rs1[0], rs2[0] - rs1[0]}; | ||
for (size_t i{1}; i < Sigma; ++i) { | ||
cursors[i] = LeftRBiFMIndexCursor{*index, rs1[i], rs2[i] - rs1[i]}; | ||
// cursors[i].prefetchLeft(); | ||
} | ||
return cursors; | ||
} | ||
|
||
auto extendLeft(size_t symb) const -> LeftRBiFMIndexCursor { | ||
assert(symb > 0); | ||
auto& occ = index->occ; | ||
|
||
size_t newLb = occ.rank(lb, symb); | ||
size_t newLen = occ.rank(lb+len, symb) - newLb; | ||
if constexpr (OccTablePrefetch<Index>) { | ||
occ.prefetch(newLb); | ||
occ.prefetch(newLb + newLen); | ||
} | ||
|
||
auto newCursor = LeftRBiFMIndexCursor{*index, newLb, newLen}; | ||
return newCursor; | ||
} | ||
}; | ||
} | ||
|
||
namespace std { | ||
|
||
template <typename index_t> | ||
struct hash<fmindex_collection::RBiFMIndexCursor<index_t>> { | ||
auto operator()(fmindex_collection::RBiFMIndexCursor<index_t> const& cursor) const -> size_t { | ||
return hash<size_t>()(cursor.lb) | ||
^ hash<size_t>()(cursor.len); | ||
} | ||
}; | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.