diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cc547a28..476696d8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -80,7 +80,7 @@ jobs: - name: Install Tools on Mac run: | touch ${HOME}/.activate_brew - brew update + brew update-reset brew install --force-bottle --overwrite fmt boost cmake ${{ matrix.brew_pkgs }} pkg-config if: matrix.osname == 'MacOS 11' diff --git a/src/example/utils/utils.h b/src/example/utils/utils.h index 74e35692..2927433a 100644 --- a/src/example/utils/utils.h +++ b/src/example/utils/utils.h @@ -27,7 +27,7 @@ -inline auto construct_bwt_from_sa(std::vector const& sa, std::string_view const& text) -> std::vector { +inline auto construct_bwt_from_sa(std::vector const& sa, std::string_view const& text) -> std::vector { assert(sa.size() == text.size()); std::vector bwt; bwt.resize(text.size()); diff --git a/src/fmindex-collection/CSA.h b/src/fmindex-collection/CSA.h index 8880c2f1..f5d5e563 100644 --- a/src/fmindex-collection/CSA.h +++ b/src/fmindex-collection/CSA.h @@ -44,8 +44,11 @@ struct CSA { : bv {cereal_tag{}} {} - CSA(std::span sa, size_t _samplingRate, std::span const> _inputSizes, bool reverse=false) - : samplingRate{_samplingRate} + CSA(std::vector sa, size_t _samplingRate, std::span const> _inputSizes, bool reverse=false) + : bv {sa.size(), [&](size_t idx) { + return (sa[idx] % _samplingRate) == 0; + }} + , samplingRate{_samplingRate} { size_t bitsForSeqId = std::max(size_t{1}, size_t(std::ceil(std::log2(_inputSizes.size())))); assert(bitsForSeqId < 64); @@ -62,24 +65,14 @@ struct CSA { accInputSizes.emplace_back(accInputSizes.back() + len + delCt); } - // Annotate text with labels, naming the correct sequence id - auto labels = std::vector{}; - labels.reserve(sa.size() / samplingRate); - - for (size_t i{0}, subjId{0}; i < sa.size(); i += samplingRate) { - while (i >= accInputSizes[subjId]) { - subjId += 1; - } - labels.emplace_back(subjId-1); - } - // Construct sampled suffix array - auto ssa = std::vector{}; - ssa.reserve(sa.size() / _samplingRate); + size_t ssaI{}; // Index of the ssa that is inside of sa for (size_t i{0}; i < sa.size(); ++i) { bool sample = (sa[i] % samplingRate) == 0; if (sample) { - auto subjId = labels[sa[i] / samplingRate]; + // find subject id + auto iter = std::upper_bound(accInputSizes.begin(), accInputSizes.end(), sa[i]); + size_t subjId = std::distance(accInputSizes.begin(), iter) - 1; auto subjPos = sa[i] - accInputSizes[subjId]; if (reverse) { auto [len, delCt] = _inputSizes[subjId]; @@ -89,16 +82,14 @@ struct CSA { subjPos = len+1; } } - ssa.emplace_back(subjPos | (subjId << bitsForPosition)); + sa[ssaI] = subjPos | (subjId << bitsForPosition); + ++ssaI; } } - this->ssa = std::move(ssa); - this->bv = BitvectorCompact{sa.size(), [&](size_t idx) { - return (sa[idx] % samplingRate) == 0; - }}; + sa.resize(ssaI); + ssa = std::move(sa); } - auto operator=(CSA const&) -> CSA& = delete; auto operator=(CSA&& _other) noexcept -> CSA& = default; diff --git a/src/fmindex-collection/DenseCSA.h b/src/fmindex-collection/DenseCSA.h index d2088a04..d34dc4c6 100644 --- a/src/fmindex-collection/DenseCSA.h +++ b/src/fmindex-collection/DenseCSA.h @@ -61,7 +61,7 @@ struct DenseCSA { , bv {cereal_tag{}} {} - DenseCSA(std::span sa, size_t _samplingRate, std::span const> _inputSizes, bool reverse=false) + DenseCSA(std::span sa, size_t _samplingRate, std::span const> _inputSizes, bool reverse=false) : ssaPos{cereal_tag{}} , ssaSeq{cereal_tag{}} , bv {cereal_tag{}} @@ -81,26 +81,19 @@ struct DenseCSA { largestText = std::max(largestText, len); } - // Annotate text with labels, naming the correct sequence id - auto labels = std::vector{}; - labels.reserve(sa.size() / samplingRate); - - for (size_t i{0}, subjId{0}; i < sa.size(); i += samplingRate) { - while (i >= accInputSizes[subjId]) { - subjId += 1; - } - labels.emplace_back(subjId-1); - } - // Construct sampled suffix array size_t bitsForPos = std::max(size_t{1}, size_t(std::ceil(std::log2(largestText)))); ssaPos = DenseVector(bitsForPos); ssaSeq = DenseVector(bitsForSeqId); + ssaPos.reserve(sa.size() / _samplingRate); + ssaSeq.reserve(sa.size() / _samplingRate); for (size_t i{0}; i < sa.size(); ++i) { bool sample = (sa[i] % samplingRate) == 0; if (sample) { - auto subjId = labels[sa[i] / samplingRate]; + // find subject id + auto iter = std::upper_bound(accInputSizes.begin(), accInputSizes.end(), sa[i]); + size_t subjId = std::distance(accInputSizes.begin(), iter) - 1; auto subjPos = sa[i] - accInputSizes[subjId]; if (reverse) { auto [len, delCt] = _inputSizes[subjId]; diff --git a/src/fmindex-collection/utils.h b/src/fmindex-collection/utils.h index 5fb37cb6..b3cd45bc 100644 --- a/src/fmindex-collection/utils.h +++ b/src/fmindex-collection/utils.h @@ -19,18 +19,22 @@ #include #include +#if LIBSAIS_OPENMP +# include +#endif + namespace fmindex_collection { -inline auto createSA(std::span input, size_t threadNbr) -> std::vector { - auto sa = std::vector(input.size()); +inline auto createSA(std::span input, size_t threadNbr) -> std::vector { + auto sa = std::vector(input.size()); if (input.size() == 0) { return sa; } #if LIBSAIS_OPENMP - auto r = libsais64_omp(input.data(), sa.data(), input.size(), 0, nullptr, threadNbr); + auto r = libsais64_omp(input.data(), reinterpret_cast(sa.data()), input.size(), 0, nullptr, threadNbr); #else (void)threadNbr; // Unused if no openmp is available - auto r = libsais64(input.data(), sa.data(), input.size(), 0, nullptr); + auto r = libsais64(input.data(), reinterpret_cast(sa.data()), input.size(), 0, nullptr); #endif if (r != 0) { throw std::runtime_error("something went wrong constructing the SA"); } @@ -38,7 +42,7 @@ inline auto createSA(std::span input, size_t threadNbr) -> std::v } -inline auto createBWT(std::span input, std::span sa) -> std::vector { +inline auto createBWT(std::span input, std::span sa) -> std::vector { assert(input.size() == sa.size()); auto bwt = std::vector{}; bwt.resize(input.size()); @@ -52,7 +56,7 @@ auto createSequences(Sequences auto const& _input, int samplingRate, bool revers // compute total numbers of bytes of the text including delimiters "$" size_t totalSize{}; for (auto const& l : _input) { - auto textLen = l.size(); + auto textLen = l.size(); auto delimLen = samplingRate - textLen % samplingRate; // Make sure it is always a multiple of samplingRate totalSize += textLen + delimLen; } @@ -65,16 +69,11 @@ auto createSequences(Sequences auto const& _input, int samplingRate, bool revers auto inputSizes = std::vector>{}; inputSizes.reserve(_input.size()); - for (auto const& l : _input) { auto ls = l.size(); - // number of delimiters ('$') which need to be added. It must be at least one, and it - // has to make sure the text will be a multiple of samplingRate - size_t delimCount = samplingRate - (ls % samplingRate); - inputText.resize(inputText.size() + ls + delimCount, 0); if (not reverse) { - std::ranges::copy(l, end(inputText) - ls - delimCount); + inputText.insert(inputText.end(), begin(l), end(l)); } else { //!TODO hack for clang, broken in clang 15 #if __clang__ @@ -83,10 +82,17 @@ auto createSequences(Sequences auto const& _input, int samplingRate, bool revers #else auto l2 = std::views::reverse(l); #endif - std::ranges::copy(l2, end(inputText) - ls - delimCount); + inputText.insert(inputText.end(), begin(l2), end(l2)); } - inputSizes.emplace_back(l.size(), delimCount); + // number of delimiters ('$') which need to be added. It must be at least one, and it + // has to make sure the text will be a multiple of samplingRate + size_t delimCount = samplingRate - (ls % samplingRate); + + // fill with delimiters/zeros + inputText.resize(inputText.size() + delimCount); + + inputSizes.emplace_back(ls, delimCount); } return {totalSize, inputText, inputSizes}; } diff --git a/src/test_fmindex-collection/utils.cpp b/src/test_fmindex-collection/utils.cpp index 47ded49a..8ee7f0d6 100644 --- a/src/test_fmindex-collection/utils.cpp +++ b/src/test_fmindex-collection/utils.cpp @@ -12,7 +12,7 @@ TEST_CASE("check creation of suffix array", "[createSA]") { auto input = std::vector{'H', 'a', 'l', 'l', 'o', ' ', 'W', 'e', 'l', 't', '\0', '\0'}; - auto expected = std::vector{ 11, 10, 5, 0, 6, 1, 7, 2, 3, 8, 4, 9 }; + auto expected = std::vector{ 11, 10, 5, 0, 6, 1, 7, 2, 3, 8, 4, 9 }; auto output = fmindex_collection::createSA(input, 1); CHECK(output == expected);