From a0ed3a0f87de24fefd0c997548a76a373840303c Mon Sep 17 00:00:00 2001 From: Simon Gene Gottlieb Date: Mon, 4 Sep 2023 10:36:28 +0200 Subject: [PATCH 1/2] fix: searchng21 works with any cursor --- src/fmindex-collection/search/SearchNg21.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fmindex-collection/search/SearchNg21.h b/src/fmindex-collection/search/SearchNg21.h index 7de3c98e..e7d14d69 100644 --- a/src/fmindex-collection/search/SearchNg21.h +++ b/src/fmindex-collection/search/SearchNg21.h @@ -31,7 +31,7 @@ template struct Search { constexpr static size_t Sigma = index_t::Sigma; - using cursor_t = BiFMIndexCursor; + using cursor_t = select_cursor_t; using BlockIter = typename search_scheme_t::const_iterator; index_t const& index; @@ -163,7 +163,7 @@ struct Search { template void search_reordered(index_t const& index, query_t&& query, search_scheme_t const& search_scheme, search_scheme_reordered_t& reordered, delegate_t&& delegate) { - using cursor_t = BiFMIndexCursor; + using cursor_t = select_cursor_t; using R = std::decay_t(), 0))>; auto internal_delegate = [&]() { From 004f7b3834a8e43552f215826122b652528f1993 Mon Sep 17 00:00:00 2001 From: Simon Gene Gottlieb Date: Mon, 4 Sep 2023 10:37:06 +0200 Subject: [PATCH 2/2] feat: add RBiFMIndex: an fm index with the reverse text included --- src/fmindex-collection/RBiFMIndex.h | 155 +++++++++++++++ src/fmindex-collection/RBiFMIndexCursor.h | 196 +++++++++++++++++++ src/fmindex-collection/fmindex-collection.h | 2 + src/fmindex-collection/search/SelectCursor.h | 13 ++ src/fmindex-collection/utils.h | 57 ++++++ 5 files changed, 423 insertions(+) create mode 100644 src/fmindex-collection/RBiFMIndex.h create mode 100644 src/fmindex-collection/RBiFMIndexCursor.h diff --git a/src/fmindex-collection/RBiFMIndex.h b/src/fmindex-collection/RBiFMIndex.h new file mode 100644 index 00000000..f7dfe462 --- /dev/null +++ b/src/fmindex-collection/RBiFMIndex.h @@ -0,0 +1,155 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file. +// ----------------------------------------------------------------------------------------------------- +#pragma once + +#include "CSA.h" +#include "occtable/concepts.h" +#include "utils.h" + +#include + +namespace fmindex_collection { + +template +struct RBiFMIndex { + static size_t constexpr Sigma = Table::Sigma; + + using TTable = Table; + + Table occ; + TCSA csa; + +//private: + RBiFMIndex(std::span bwt, TCSA _csa) + : occ{bwt} + , csa{std::move(_csa)} + { + // compute last row + auto ct = std::array{}; + for (auto v : bwt) { + ct[v] += 1; + } + for (size_t i{1}; i < ct.size(); ++i) { + ct[i] = ct[i-1] + ct[i]; + } + // check last row is correct + for (size_t sym{0}; sym < Sigma; ++sym) { + if (occ.rank(occ.size(), sym) != ct[sym]) { + auto e = std::string{"Wrong rank for the last entry."} + + " Got different values for forward index." + + " sym: " + std::to_string(sym) + + " got: " + std::to_string(occ.rank(occ.size(), sym)) + + " expected: " + std::to_string(ct[sym]); + throw std::runtime_error(e); + } + } + if constexpr (requires(Table t) {{ t.hasValue(size_t{}) }; }) { + for (size_t i{0}; i < occ.size(); ++i) { + if (csa.value(i).has_value()) { + occ.setValue(i); + } + } + } + } + +public: + /**!\brief Creates a RBiFMIndex with a specified sampling rate + * + * \param _input a list of sequences + * \param samplingRate rate of the sampling + */ + RBiFMIndex(Sequences auto const& _input, size_t samplingRate, size_t threadNbr) + : occ{cereal_tag{}} + , csa{cereal_tag{}} + { + auto [totalSize, inputText, inputSizes] = createSequencesAndReverse(_input, samplingRate); + + // create BurrowsWheelerTransform and CompressedSuffixArray + auto [bwt, csa] = [&, &inputText=inputText, &inputSizes=inputSizes] () { + auto sa = createSA(inputText, threadNbr); + auto bwt = createBWT(inputText, sa); + auto csa = TCSA(std::move(sa), samplingRate, inputSizes); + return std::make_tuple(std::move(bwt), std::move(csa)); + }(); + + decltype(inputText){}.swap(inputText); // inputText memory can be deleted + + *this = RBiFMIndex{bwt, std::move(csa)}; + } + + + /*!\brief Specific c'tor for serialization use + */ + RBiFMIndex(cereal_tag) + : occ{cereal_tag{}} + , csa{cereal_tag{}} + {} + + size_t memoryUsage() const requires OccTableMemoryUsage { + return occ.memoryUsage() + csa.memoryUsage(); + } + + size_t size() const { + return occ.size(); + } + + auto locate(size_t idx) const -> std::tuple { + if constexpr (requires(Table t) {{ t.hasValue(size_t{}) }; }) { + bool v = occ.hasValue(idx); + uint64_t steps{}; + while(!v) { + idx = occ.rank_symbol(idx); + steps += 1; + v = occ.hasValue(idx); + } + auto [chr, pos] = csa.value(idx); + return {chr, pos+steps}; + + } else { + auto opt = csa.value(idx); + uint64_t steps{}; + while(!opt) { + if constexpr (requires(Table t) { { t.rank_symbol(size_t{}) }; }) { + idx = occ.rank_symbol(idx); + } else { + idx = occ.rank(idx, occ.symbol(idx)); + } + steps += 1; + opt = csa.value(idx); + } + auto [chr, pos] = *opt; + return {chr, pos+steps}; + } + } + + auto locate(size_t idx, size_t maxSteps) const -> std::optional> { + auto opt = csa.value(idx); + uint64_t steps{}; + for (;!opt and maxSteps > 0; --maxSteps) { + idx = occ.rank(idx, occ.symbol(idx)); + steps += 1; + opt = csa.value(idx); + } + if (opt) { + std::get<1>(*opt) += steps; + } + return opt; + } + + + auto single_locate_step(size_t idx) const -> std::optional> { + return csa.value(idx); + } + + + template + void serialize(Archive& ar) { + ar(occ, csa); + } +}; + +} diff --git a/src/fmindex-collection/RBiFMIndexCursor.h b/src/fmindex-collection/RBiFMIndexCursor.h new file mode 100644 index 00000000..b4c883cd --- /dev/null +++ b/src/fmindex-collection/RBiFMIndexCursor.h @@ -0,0 +1,196 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file. +// ----------------------------------------------------------------------------------------------------- +#pragma once + +#include "RBiFMIndex.h" + +namespace fmindex_collection { + +template +struct LeftRBiFMIndexCursor; + +template +struct RBiFMIndexCursor { + static constexpr size_t Sigma = Index::Sigma; + static constexpr bool Reversed = false; + + Index const* index{}; + size_t lb; + size_t lbRev; + size_t len{}; + RBiFMIndexCursor() noexcept = default; + RBiFMIndexCursor(Index const& index) noexcept + : RBiFMIndexCursor{index, 0, 0, index.size()} + {} + RBiFMIndexCursor(Index const& index, size_t lb, size_t lbRev, size_t len) noexcept + : index{&index} + , lb{lb} + , lbRev{lbRev} + , len{len} + {} + + bool operator==(RBiFMIndexCursor const& _other) const noexcept { + return lb == _other.lb + && len == _other.len; + } + bool empty() const { + return len == 0; + } + size_t count() const { + return len; + } + auto extendLeft() const -> std::array { + auto const& occ = index->occ; + if constexpr (OccTablePrefetch) { + occ.prefetch(lb+len); + } + auto [rs1, prs1] = occ.all_ranks(lb); + auto [rs2, prs2] = occ.all_ranks(lb+len); + + auto cursors = std::array{}; + cursors[0] = RBiFMIndexCursor{*index, rs1[0], lbRev, rs2[0] - rs1[0]}; + cursors[0].prefetchLeft(); + for (size_t i{1}; i < Sigma; ++i) { + cursors[i] = RBiFMIndexCursor{*index, rs1[i], lbRev + prs2[i-1] - prs1[i-1], rs2[i] - rs1[i]}; + } + return cursors; + } + + auto extendRight() const -> std::array { + auto const& occ = index->occ; + if constexpr (OccTablePrefetch) { + occ.prefetch(lbRev+len); + } + auto [rs1, prs1] = occ.all_ranks(lbRev); + auto [rs2, prs2] = occ.all_ranks(lbRev+len); + + auto cursors = std::array{}; + cursors[0] = RBiFMIndexCursor{*index, lb, rs1[0], rs2[0] - rs1[0]}; + cursors[0].prefetchRight(); + for (size_t i{1}; i < Sigma; ++i) { + cursors[i] = RBiFMIndexCursor{*index, lb + prs2[i-1] - prs1[i-1], rs1[i], rs2[i] - rs1[i]}; + } + return cursors; + } + void prefetchLeft() const { + if constexpr (OccTablePrefetch) { + auto& occ = index->occ; + occ.prefetch(lb); + occ.prefetch(lb+len); + } + } + void prefetchRight() const { + if constexpr (OccTablePrefetch) { + auto& occ = index->occ; + occ.prefetch(lbRev); + occ.prefetch(lbRev+len); + } + } + + auto extendLeft(size_t symb) const -> RBiFMIndexCursor { + assert(symb > 0); + auto& occ = index->occ; + size_t newLb = occ.rank(lb, symb); + size_t newLbRev = lbRev + occ.prefix_rank(lb+len, symb-1) - occ.prefix_rank(lb, symb-1); + size_t newLen = occ.rank(lb+len, symb) - newLb; + auto newCursor = RBiFMIndexCursor{*index, newLb, newLbRev, newLen}; + newCursor.prefetchLeft(); + return newCursor; + } + auto extendRight(size_t symb) const -> RBiFMIndexCursor { + assert(symb > 0); + auto& occ = index->occ; + size_t newLb = lb + occ.prefix_rank(lbRev+len, symb-1) - occ.prefix_rank(lbRev, symb-1); + size_t newLbRev = occ.rank(lbRev, symb); + size_t newLen = occ.rank(lbRev+len, symb) - newLbRev; + auto newCursor = RBiFMIndexCursor{*index, newLb, newLbRev, newLen}; + newCursor.prefetchRight(); + return newCursor; + } +}; + +template +auto begin(RBiFMIndexCursor const& _cursor) { + return _cursor.lb; +} +template +auto end(RBiFMIndexCursor const& _cursor) { + return _cursor.lb + _cursor.len; +} + +template +struct LeftRBiFMIndexCursor { + static constexpr size_t Sigma = Index::Sigma; + static constexpr bool Reversed = false; + + Index const* index; + size_t lb; + size_t len; + LeftRBiFMIndexCursor(RBiFMIndexCursor const& _other) + : index{_other.index} + , lb{_other.lb} + , len{_other.len} + {} + LeftRBiFMIndexCursor() + : index{nullptr} + {} + LeftRBiFMIndexCursor(Index const& index) + : LeftRBiFMIndexCursor{index, 0, index.size()} + {} + LeftRBiFMIndexCursor(Index const& index, size_t lb, size_t len) + : index{&index} + , lb{lb} + , len{len} + {} + bool empty() const { + return len == 0; + } + size_t count() const { + return len; + } + auto extendLeft() const -> std::array { + auto const& occ = index->occ; + auto [rs1, prs1] = occ.all_ranks(lb); + auto [rs2, prs2] = occ.all_ranks(lb+len); + + auto cursors = std::array{}; + cursors[0] = LeftRBiFMIndexCursor{*index, rs1[0], rs2[0] - rs1[0]}; + for (size_t i{1}; i < Sigma; ++i) { + cursors[i] = LeftRBiFMIndexCursor{*index, rs1[i], rs2[i] - rs1[i]}; +// cursors[i].prefetchLeft(); + } + return cursors; + } + + auto extendLeft(size_t symb) const -> LeftRBiFMIndexCursor { + assert(symb > 0); + auto& occ = index->occ; + + size_t newLb = occ.rank(lb, symb); + size_t newLen = occ.rank(lb+len, symb) - newLb; + if constexpr (OccTablePrefetch) { + occ.prefetch(newLb); + occ.prefetch(newLb + newLen); + } + + auto newCursor = LeftRBiFMIndexCursor{*index, newLb, newLen}; + return newCursor; + } +}; +} + +namespace std { + +template +struct hash> { + auto operator()(fmindex_collection::RBiFMIndexCursor const& cursor) const -> size_t { + return hash()(cursor.lb) + ^ hash()(cursor.len); + } +}; + +} diff --git a/src/fmindex-collection/fmindex-collection.h b/src/fmindex-collection/fmindex-collection.h index c5828ee5..e4e5568d 100644 --- a/src/fmindex-collection/fmindex-collection.h +++ b/src/fmindex-collection/fmindex-collection.h @@ -11,6 +11,8 @@ #include "BiFMIndex.h" #include "BiFMIndex_32.h" #include "BiFMIndexCursor.h" +#include "RBiFMIndex.h" +#include "RBiFMIndexCursor.h" #include "ReverseFMIndex.h" #include "ReverseFMIndexCursor.h" #include "CSA.h" diff --git a/src/fmindex-collection/search/SelectCursor.h b/src/fmindex-collection/search/SelectCursor.h index a5b6da76..c9f7cb3b 100644 --- a/src/fmindex-collection/search/SelectCursor.h +++ b/src/fmindex-collection/search/SelectCursor.h @@ -8,6 +8,7 @@ #include "../BiFMIndexCursor.h" #include "../FMIndexCursor.h" +#include "../RBiFMIndexCursor.h" #include "../ReverseFMIndexCursor.h" @@ -26,11 +27,17 @@ struct SelectIndexCursor> { using cursor_t = FMIndexCursor>; }; +template +struct SelectIndexCursor> { + using cursor_t = RBiFMIndexCursor>; +}; + template struct SelectIndexCursor> { using cursor_t = ReverseFMIndexCursor>; }; + template struct SelectLeftIndexCursor; @@ -44,6 +51,12 @@ struct SelectLeftIndexCursor> { using cursor_t = FMIndexCursor>; }; +template +struct SelectLeftIndexCursor> { + using cursor_t = LeftRBiFMIndexCursor>; +}; + + template using select_cursor_t = typename SelectIndexCursor::cursor_t; diff --git a/src/fmindex-collection/utils.h b/src/fmindex-collection/utils.h index 0746f90c..6966af50 100644 --- a/src/fmindex-collection/utils.h +++ b/src/fmindex-collection/utils.h @@ -98,6 +98,63 @@ auto createSequences(Sequences auto const& _input, int samplingRate, bool revers return {totalSize, inputText, inputSizes}; } +auto createSequencesAndReverse(Sequences auto const& _input, int samplingRate) -> std::tuple, std::vector>> { + // compute total numbers of bytes of the text including delimiters "$" + size_t totalSize{}; + for (auto const& l : _input) { + auto textLen = l.size(); + auto delimLen = samplingRate - textLen % samplingRate; // Make sure it is always a multiple of samplingRate + totalSize += textLen + delimLen; + } + for (auto const& l : _input) { + auto textLen = l.size(); + auto delimLen = samplingRate - textLen % samplingRate; // Make sure it is always a multiple of samplingRate + totalSize += textLen + delimLen; + } + + // our concatenated sequences with delimiters + auto inputText = std::vector{}; + inputText.reserve(totalSize); + + // list of sizes of the individual sequences + auto inputSizes = std::vector>{}; + inputSizes.reserve(_input.size()); + + // add text + for (auto const& l : _input) { + auto ls = l.size(); + inputText.insert(inputText.end(), begin(l), end(l)); + + // number of delimiters ('$') which need to be added. It must be at least one, and it + // has to make sure the text will be a multiple of samplingRate + size_t delimCount = samplingRate - (ls % samplingRate); + + // fill with delimiters/zeros + inputText.resize(inputText.size() + delimCount); + + inputSizes.emplace_back(ls, delimCount); + } + + // add reversed text + for (auto const& l : _input) { + auto ls = l.size(); + auto l2 = std::views::reverse(l); + inputText.insert(inputText.end(), begin(l2), end(l2)); + + // number of delimiters ('$') which need to be added. It must be at least one, and it + // has to make sure the text will be a multiple of samplingRate + size_t delimCount = samplingRate - (ls % samplingRate); + + // fill with delimiters/zeros + inputText.resize(inputText.size() + delimCount); + + inputSizes.emplace_back(ls, delimCount); + } + + return {totalSize, inputText, inputSizes}; +} + + inline auto createSA_32(std::span input, size_t threadNbr) -> std::vector { auto sa = std::vector(input.size());