Merge pull request #27 from SGSSGene/feat/rbifmindex

Feat/rbifmindex
SGSSGene · Sep 4, 2023 · a19d898 · a19d898
2 parents accc333 + 004f7b3
commit a19d898
Show file tree

Hide file tree

Showing 6 changed files with 425 additions and 2 deletions.
diff --git a/src/fmindex-collection/RBiFMIndex.h b/src/fmindex-collection/RBiFMIndex.h
@@ -0,0 +1,155 @@
+// -----------------------------------------------------------------------------------------------------
+// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
+// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
+// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
+// shipped with this file.
+// -----------------------------------------------------------------------------------------------------
+#pragma once
+
+#include "CSA.h"
+#include "occtable/concepts.h"
+#include "utils.h"
+
+#include <algorithm>
+
+namespace fmindex_collection {
+
+template <OccTable Table, typename TCSA = CSA>
+struct RBiFMIndex {
+    static size_t constexpr Sigma = Table::Sigma;
+
+    using TTable = Table;
+
+    Table  occ;
+    TCSA   csa;
+
+//private:
+    RBiFMIndex(std::span<uint8_t const> bwt, TCSA _csa)
+        : occ{bwt}
+        , csa{std::move(_csa)}
+    {
+        // compute last row
+        auto ct = std::array<uint64_t, Sigma>{};
+        for (auto v : bwt) {
+            ct[v] += 1;
+        }
+        for (size_t i{1}; i < ct.size(); ++i) {
+            ct[i] = ct[i-1] + ct[i];
+        }
+        // check last row is correct
+        for (size_t sym{0}; sym < Sigma; ++sym) {
+            if (occ.rank(occ.size(), sym) != ct[sym]) {
+                auto e = std::string{"Wrong rank for the last entry."}
+                    + " Got different values for forward index."
+                    + " sym: " + std::to_string(sym)
+                    + " got: " + std::to_string(occ.rank(occ.size(), sym))
+                    + " expected: " + std::to_string(ct[sym]);
+                throw std::runtime_error(e);
+            }
+        }
+        if constexpr (requires(Table t) {{ t.hasValue(size_t{}) }; }) {
+            for (size_t i{0}; i < occ.size(); ++i) {
+                if (csa.value(i).has_value()) {
+                    occ.setValue(i);
+                }
+            }
+        }
+    }
+
+public:
+    /**!\brief Creates a RBiFMIndex with a specified sampling rate
+     *
+     * \param _input a list of sequences
+     * \param samplingRate rate of the sampling
+     */
+    RBiFMIndex(Sequences auto const& _input, size_t samplingRate, size_t threadNbr)
+        : occ{cereal_tag{}}
+        , csa{cereal_tag{}}
+    {
+        auto [totalSize, inputText, inputSizes] = createSequencesAndReverse(_input, samplingRate);
+
+        // create BurrowsWheelerTransform and CompressedSuffixArray
+        auto [bwt, csa] = [&, &inputText=inputText, &inputSizes=inputSizes] () {
+            auto sa  = createSA(inputText, threadNbr);
+            auto bwt = createBWT(inputText, sa);
+            auto csa = TCSA(std::move(sa), samplingRate, inputSizes);
+            return std::make_tuple(std::move(bwt), std::move(csa));
+        }();
+
+        decltype(inputText){}.swap(inputText); // inputText memory can be deleted
+
+        *this = RBiFMIndex{bwt, std::move(csa)};
+    }
+
+
+    /*!\brief Specific c'tor for serialization use
+     */
+    RBiFMIndex(cereal_tag)
+        : occ{cereal_tag{}}
+        , csa{cereal_tag{}}
+    {}
+
+    size_t memoryUsage() const requires OccTableMemoryUsage<Table> {
+        return occ.memoryUsage() + csa.memoryUsage();
+    }
+
+    size_t size() const {
+        return occ.size();
+    }
+
+    auto locate(size_t idx) const -> std::tuple<size_t, size_t> {
+        if constexpr (requires(Table t) {{ t.hasValue(size_t{}) }; }) {
+            bool v = occ.hasValue(idx);
+            uint64_t steps{};
+            while(!v) {
+                idx = occ.rank_symbol(idx);
+                steps += 1;
+                v = occ.hasValue(idx);
+            }
+            auto [chr, pos] = csa.value(idx);
+            return {chr, pos+steps};
+
+        } else {
+            auto opt = csa.value(idx);
+            uint64_t steps{};
+            while(!opt) {
+                if constexpr (requires(Table t) { { t.rank_symbol(size_t{}) }; }) {
+                    idx = occ.rank_symbol(idx);
+                } else {
+                    idx = occ.rank(idx, occ.symbol(idx));
+                }
+                steps += 1;
+                opt = csa.value(idx);
+            }
+            auto [chr, pos] = *opt;
+            return {chr, pos+steps};
+        }
+    }
+
+    auto locate(size_t idx, size_t maxSteps) const -> std::optional<std::tuple<size_t, size_t>> {
+        auto opt = csa.value(idx);
+        uint64_t steps{};
+        for (;!opt and maxSteps > 0; --maxSteps) {
+            idx = occ.rank(idx, occ.symbol(idx));
+            steps += 1;
+            opt = csa.value(idx);
+        }
+        if (opt) {
+            std::get<1>(*opt) += steps;
+        }
+        return opt;
+    }
+
+
+    auto single_locate_step(size_t idx) const -> std::optional<std::tuple<size_t, size_t>> {
+        return csa.value(idx);
+    }
+
+
+    template <typename Archive>
+    void serialize(Archive& ar) {
+        ar(occ, csa);
+    }
+};
+
+}
diff --git a/src/fmindex-collection/RBiFMIndexCursor.h b/src/fmindex-collection/RBiFMIndexCursor.h
@@ -0,0 +1,196 @@
+// -----------------------------------------------------------------------------------------------------
+// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
+// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
+// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
+// shipped with this file.
+// -----------------------------------------------------------------------------------------------------
+#pragma once
+
+#include "RBiFMIndex.h"
+
+namespace fmindex_collection {
+
+template <typename Index>
+struct LeftRBiFMIndexCursor;
+
+template <typename Index>
+struct RBiFMIndexCursor {
+    static constexpr size_t Sigma    = Index::Sigma;
+    static constexpr bool   Reversed = false;
+
+    Index const* index{};
+    size_t lb;
+    size_t lbRev;
+    size_t len{};
+    RBiFMIndexCursor() noexcept = default;
+    RBiFMIndexCursor(Index const& index) noexcept
+        : RBiFMIndexCursor{index, 0, 0, index.size()}
+    {}
+    RBiFMIndexCursor(Index const& index, size_t lb, size_t lbRev, size_t len) noexcept
+        : index{&index}
+        , lb{lb}
+        , lbRev{lbRev}
+        , len{len}
+    {}
+
+    bool operator==(RBiFMIndexCursor const& _other) const noexcept {
+        return lb == _other.lb
+               && len == _other.len;
+    }
+    bool empty() const {
+        return len == 0;
+    }
+    size_t count() const {
+        return len;
+    }
+    auto extendLeft() const -> std::array<RBiFMIndexCursor, Sigma> {
+        auto const& occ = index->occ;
+        if constexpr (OccTablePrefetch<Index>) {
+            occ.prefetch(lb+len);
+        }
+        auto [rs1, prs1] = occ.all_ranks(lb);
+        auto [rs2, prs2] = occ.all_ranks(lb+len);
+
+        auto cursors = std::array<RBiFMIndexCursor, Sigma>{};
+        cursors[0] = RBiFMIndexCursor{*index, rs1[0], lbRev, rs2[0] - rs1[0]};
+        cursors[0].prefetchLeft();
+        for (size_t i{1}; i < Sigma; ++i) {
+            cursors[i] = RBiFMIndexCursor{*index, rs1[i], lbRev + prs2[i-1] - prs1[i-1], rs2[i] - rs1[i]};
+        }
+        return cursors;
+    }
+
+    auto extendRight() const -> std::array<RBiFMIndexCursor, Sigma> {
+        auto const& occ = index->occ;
+        if constexpr (OccTablePrefetch<Index>) {
+            occ.prefetch(lbRev+len);
+        }
+        auto [rs1, prs1] = occ.all_ranks(lbRev);
+        auto [rs2, prs2] = occ.all_ranks(lbRev+len);
+
+        auto cursors = std::array<RBiFMIndexCursor, Sigma>{};
+        cursors[0] = RBiFMIndexCursor{*index, lb, rs1[0], rs2[0] - rs1[0]};
+        cursors[0].prefetchRight();
+        for (size_t i{1}; i < Sigma; ++i) {
+            cursors[i] = RBiFMIndexCursor{*index, lb + prs2[i-1] - prs1[i-1], rs1[i], rs2[i] - rs1[i]};
+        }
+        return cursors;
+    }
+    void prefetchLeft() const {
+        if constexpr (OccTablePrefetch<Index>) {
+            auto& occ = index->occ;
+            occ.prefetch(lb);
+            occ.prefetch(lb+len);
+        }
+    }
+    void prefetchRight() const {
+        if constexpr (OccTablePrefetch<Index>) {
+            auto& occ = index->occ;
+            occ.prefetch(lbRev);
+            occ.prefetch(lbRev+len);
+        }
+    }
+
+    auto extendLeft(size_t symb) const -> RBiFMIndexCursor {
+        assert(symb > 0);
+        auto& occ = index->occ;
+        size_t newLb    = occ.rank(lb, symb);
+        size_t newLbRev = lbRev + occ.prefix_rank(lb+len, symb-1) - occ.prefix_rank(lb, symb-1);
+        size_t newLen   = occ.rank(lb+len, symb) - newLb;
+        auto newCursor = RBiFMIndexCursor{*index, newLb, newLbRev, newLen};
+        newCursor.prefetchLeft();
+        return newCursor;
+    }
+    auto extendRight(size_t symb) const -> RBiFMIndexCursor {
+        assert(symb > 0);
+        auto& occ = index->occ;
+        size_t newLb    = lb + occ.prefix_rank(lbRev+len, symb-1) - occ.prefix_rank(lbRev, symb-1);
+        size_t newLbRev = occ.rank(lbRev, symb);
+        size_t newLen   = occ.rank(lbRev+len, symb) - newLbRev;
+        auto newCursor = RBiFMIndexCursor{*index, newLb, newLbRev, newLen};
+        newCursor.prefetchRight();
+        return newCursor;
+    }
+};
+
+template <typename Index>
+auto begin(RBiFMIndexCursor<Index> const& _cursor) {
+    return _cursor.lb;
+}
+template <typename Index>
+auto end(RBiFMIndexCursor<Index> const& _cursor) {
+    return _cursor.lb + _cursor.len;
+}
+
+template <typename Index>
+struct LeftRBiFMIndexCursor {
+    static constexpr size_t Sigma    = Index::Sigma;
+    static constexpr bool   Reversed = false;
+
+    Index const* index;
+    size_t lb;
+    size_t len;
+    LeftRBiFMIndexCursor(RBiFMIndexCursor<Index> const& _other)
+        : index{_other.index}
+        , lb{_other.lb}
+        , len{_other.len}
+    {}
+    LeftRBiFMIndexCursor()
+        : index{nullptr}
+    {}
+    LeftRBiFMIndexCursor(Index const& index)
+        : LeftRBiFMIndexCursor{index, 0, index.size()}
+    {}
+    LeftRBiFMIndexCursor(Index const& index, size_t lb, size_t len)
+        : index{&index}
+        , lb{lb}
+        , len{len}
+    {}
+    bool empty() const {
+        return len == 0;
+    }
+    size_t count() const {
+        return len;
+    }
+    auto extendLeft() const -> std::array<LeftRBiFMIndexCursor, Sigma> {
+        auto const& occ = index->occ;
+        auto [rs1, prs1] = occ.all_ranks(lb);
+        auto [rs2, prs2] = occ.all_ranks(lb+len);
+
+        auto cursors = std::array<LeftRBiFMIndexCursor, Sigma>{};
+        cursors[0] = LeftRBiFMIndexCursor{*index, rs1[0], rs2[0] - rs1[0]};
+        for (size_t i{1}; i < Sigma; ++i) {
+            cursors[i] = LeftRBiFMIndexCursor{*index, rs1[i], rs2[i] - rs1[i]};
+//            cursors[i].prefetchLeft();
+        }
+        return cursors;
+    }
+
+    auto extendLeft(size_t symb) const -> LeftRBiFMIndexCursor {
+        assert(symb > 0);
+        auto& occ = index->occ;
+
+        size_t newLb    = occ.rank(lb, symb);
+        size_t newLen   = occ.rank(lb+len, symb) - newLb;
+        if constexpr (OccTablePrefetch<Index>) {
+            occ.prefetch(newLb);
+            occ.prefetch(newLb + newLen);
+        }
+
+        auto newCursor = LeftRBiFMIndexCursor{*index, newLb, newLen};
+        return newCursor;
+    }
+};
+}
+
+namespace std {
+
+template <typename index_t>
+struct hash<fmindex_collection::RBiFMIndexCursor<index_t>> {
+    auto operator()(fmindex_collection::RBiFMIndexCursor<index_t> const& cursor) const -> size_t {
+        return hash<size_t>()(cursor.lb)
+            ^ hash<size_t>()(cursor.len);
+    }
+};
+
+}
diff --git a/src/fmindex-collection/fmindex-collection.h b/src/fmindex-collection/fmindex-collection.h
@@ -11,6 +11,8 @@
 #include "BiFMIndex.h"
 #include "BiFMIndex_32.h"
 #include "BiFMIndexCursor.h"
+#include "RBiFMIndex.h"
+#include "RBiFMIndexCursor.h"
 #include "ReverseFMIndex.h"
 #include "ReverseFMIndexCursor.h"
 #include "CSA.h"

diff --git a/src/fmindex-collection/search/SearchNg21.h b/src/fmindex-collection/search/SearchNg21.h
@@ -31,7 +31,7 @@ template <typename index_t, typename search_scheme_t, typename delegate_t>
 struct Search {
     constexpr static size_t Sigma = index_t::Sigma;
 
-    using cursor_t = BiFMIndexCursor<index_t>;
+    using cursor_t = select_cursor_t<index_t>;
     using BlockIter = typename search_scheme_t::const_iterator;
 
     index_t const& index;
@@ -163,7 +163,7 @@ struct Search {
 
 template <typename index_t, typename query_t, typename search_scheme_t, typename search_scheme_reordered_t, typename delegate_t>
 void search_reordered(index_t const& index, query_t&& query, search_scheme_t const& search_scheme, search_scheme_reordered_t& reordered, delegate_t&& delegate) {
-    using cursor_t = BiFMIndexCursor<index_t>;
+    using cursor_t = select_cursor_t<index_t>;
     using R = std::decay_t<decltype(delegate(std::declval<cursor_t>(), 0))>;
 
     auto internal_delegate = [&]() {