From 9226a8084ec928411db01721830d4818ee34d980 Mon Sep 17 00:00:00 2001 From: Simon Gene Gottlieb Date: Tue, 9 May 2023 08:57:17 +0200 Subject: [PATCH 1/7] patch: replacing label array, with binary search --- src/fmindex-collection/CSA.h | 16 +++------------- src/fmindex-collection/DenseCSA.h | 17 +++++------------ 2 files changed, 8 insertions(+), 25 deletions(-) diff --git a/src/fmindex-collection/CSA.h b/src/fmindex-collection/CSA.h index 2b46700f..ec023717 100644 --- a/src/fmindex-collection/CSA.h +++ b/src/fmindex-collection/CSA.h @@ -56,24 +56,15 @@ struct CSA { accInputSizes.emplace_back(accInputSizes.back() + len + delCt); } - // Annotate text with labels, naming the correct sequence id - auto labels = std::vector{}; - labels.reserve(sa.size() / samplingRate); - - for (size_t i{0}, subjId{0}; i < sa.size(); i += samplingRate) { - while (i >= accInputSizes[subjId]) { - subjId += 1; - } - labels.emplace_back(subjId-1); - } - // Construct sampled suffix array auto ssa = std::vector{}; ssa.reserve(sa.size() / _samplingRate); for (size_t i{0}; i < sa.size(); ++i) { bool sample = (sa[i] % samplingRate) == 0; if (sample) { - auto subjId = labels[sa[i] / samplingRate]; + // find subject id + auto iter = std::upper_bound(accInputSizes.begin(), accInputSizes.end(), sa[i]); + size_t subjId = std::distance(accInputSizes.begin(), iter) - 1; auto subjPos = sa[i] - accInputSizes[subjId]; if (reverse) { auto [len, delCt] = _inputSizes[subjId]; @@ -92,7 +83,6 @@ struct CSA { }}; } - auto operator=(CSA const&) -> CSA& = delete; auto operator=(CSA&& _other) noexcept -> CSA& = default; diff --git a/src/fmindex-collection/DenseCSA.h b/src/fmindex-collection/DenseCSA.h index 05635b52..1120d67c 100644 --- a/src/fmindex-collection/DenseCSA.h +++ b/src/fmindex-collection/DenseCSA.h @@ -75,26 +75,19 @@ struct DenseCSA { largestText = std::max(largestText, len); } - // Annotate text with labels, naming the correct sequence id - auto labels = std::vector{}; - labels.reserve(sa.size() / 
samplingRate); - - for (size_t i{0}, subjId{0}; i < sa.size(); i += samplingRate) { - while (i >= accInputSizes[subjId]) { - subjId += 1; - } - labels.emplace_back(subjId-1); - } - // Construct sampled suffix array size_t bitsForPos = std::max(size_t{1}, size_t(std::ceil(std::log2(largestText)))); ssaPos = DenseVector(bitsForPos); ssaSeq = DenseVector(bitsForSeqId); + ssaPos.reserve(sa.size() / _samplingRate); + ssaSeq.reserve(sa.size() / _samplingRate); for (size_t i{0}; i < sa.size(); ++i) { bool sample = (sa[i] % samplingRate) == 0; if (sample) { - auto subjId = labels[sa[i] / samplingRate]; + // find subject id + auto iter = std::upper_bound(accInputSizes.begin(), accInputSizes.end(), sa[i]); + size_t subjId = std::distance(accInputSizes.begin(), iter) - 1; auto subjPos = sa[i] - accInputSizes[subjId]; if (reverse) { auto [len, delCt] = _inputSizes[subjId]; From 6834ac1718d453b1e0d6c3cb7c38a09ded4d7d98 Mon Sep 17 00:00:00 2001 From: Simon Gene Gottlieb Date: Tue, 9 May 2023 10:00:01 +0200 Subject: [PATCH 2/7] patch: reformulate sequence creation --- src/fmindex-collection/utils.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/fmindex-collection/utils.h b/src/fmindex-collection/utils.h index a3cc029f..b65a885c 100644 --- a/src/fmindex-collection/utils.h +++ b/src/fmindex-collection/utils.h @@ -46,7 +46,7 @@ auto createSequences(Sequences auto const& _input, int samplingRate, bool revers // compute total numbers of bytes of the text including delimiters "$" size_t totalSize{}; for (auto const& l : _input) { - auto textLen = l.size(); + auto textLen = l.size(); auto delimLen = samplingRate - textLen % samplingRate; // Make sure it is always a multiple of samplingRate totalSize += textLen + delimLen; } @@ -59,16 +59,11 @@ auto createSequences(Sequences auto const& _input, int samplingRate, bool revers auto inputSizes = std::vector>{}; inputSizes.reserve(_input.size()); - for (auto const& l : _input) { auto ls = l.size(); 
- // number of delimiters ('$') which need to be added. It must be at least one, and it - // has to make sure the text will be a multiple of samplingRate - size_t delimCount = samplingRate - (ls % samplingRate); - inputText.resize(inputText.size() + ls + delimCount, 0); if (not reverse) { - std::ranges::copy(l, end(inputText) - ls - delimCount); + inputText.insert(inputText.end(), begin(l), end(l)); } else { //!TODO hack for clang, broken in clang 15 #if __clang__ @@ -77,10 +72,17 @@ auto createSequences(Sequences auto const& _input, int samplingRate, bool revers #else auto l2 = std::views::reverse(l); #endif - std::ranges::copy(l2, end(inputText) - ls - delimCount); + inputText.insert(inputText.end(), begin(l2), end(l2)); } - inputSizes.emplace_back(l.size(), delimCount); + // number of delimiters ('$') which need to be added. It must be at least one, and it + // has to make sure the text will be a multiple of samplingRate + size_t delimCount = samplingRate - (ls % samplingRate); + + // fill with delimiters/zeros + inputText.resize(inputText.size() + delimCount); + + inputSizes.emplace_back(ls, delimCount); } return {totalSize, inputText, inputSizes}; } From fa293afb3ed2d4c9b2259f7fb1c7427e0b942356 Mon Sep 17 00:00:00 2001 From: Simon Gene Gottlieb Date: Tue, 9 May 2023 16:28:07 +0200 Subject: [PATCH 3/7] fix: save even more memory when constructing the CSA --- src/fmindex-collection/CSA.h | 13 +++++++------ src/fmindex-collection/utils.h | 11 ++++++----- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/fmindex-collection/CSA.h b/src/fmindex-collection/CSA.h index ec023717..2ab11e8f 100644 --- a/src/fmindex-collection/CSA.h +++ b/src/fmindex-collection/CSA.h @@ -38,7 +38,7 @@ struct CSA { : bv {cereal_tag{}} {} - CSA(std::span sa, size_t _samplingRate, std::span const> _inputSizes, bool reverse=false) + CSA(std::vector sa, size_t _samplingRate, std::span const> _inputSizes, bool reverse=false) : samplingRate{_samplingRate} { size_t bitsForSeqId 
= std::max(size_t{1}, size_t(std::ceil(std::log2(_inputSizes.size())))); @@ -57,8 +57,7 @@ struct CSA { } // Construct sampled suffix array - auto ssa = std::vector{}; - ssa.reserve(sa.size() / _samplingRate); + size_t ssaI{}; // Index of the ssa that is inside of sa for (size_t i{0}; i < sa.size(); ++i) { bool sample = (sa[i] % samplingRate) == 0; if (sample) { @@ -74,11 +73,13 @@ struct CSA { subjPos = len+1; } } - ssa.emplace_back(subjPos | (subjId << bitsForPosition)); + sa[ssaI] = subjPos | (subjId << bitsForPosition); + ++ssaI; } } - this->ssa = std::move(ssa); - this->bv = BitvectorCompact{sa.size(), [&](size_t idx) { + sa.resize(ssaI); + ssa = std::move(sa); + bv = BitvectorCompact{sa.size(), [&](size_t idx) { return (sa[idx] % samplingRate) == 0; }}; } diff --git a/src/fmindex-collection/utils.h b/src/fmindex-collection/utils.h index b65a885c..28a780b1 100644 --- a/src/fmindex-collection/utils.h +++ b/src/fmindex-collection/utils.h @@ -12,19 +12,20 @@ #include #include #include +#include namespace fmindex_collection { -inline auto createSA(std::span input, size_t threadNbr) -> std::vector { - auto sa = std::vector(input.size()); +inline auto createSA(std::span input, size_t threadNbr) -> std::vector { + auto sa = std::vector(input.size()); if (input.size() == 0) { return sa; } #if LIBSAIS_OPENMP - auto r = libsais64_omp(input.data(), sa.data(), input.size(), 0, nullptr, threadNbr); + auto r = libsais64_omp(input.data(), reinterpret_cast(sa.data()), input.size(), 0, nullptr, threadNbr); #else (void)threadNbr; // Unused if no openmp is available - auto r = libsais64(input.data(), sa.data(), input.size(), 0, nullptr); + auto r = libsais64(input.data(), reinterpret_cast(sa.data()), input.size(), 0, nullptr); #endif if (r != 0) { throw std::runtime_error("something went wrong constructing the SA"); } @@ -32,7 +33,7 @@ inline auto createSA(std::span input, size_t threadNbr) -> std::v } -inline auto createBWT(std::span input, std::span sa) -> std::vector { 
+inline auto createBWT(std::span input, std::span sa) -> std::vector { assert(input.size() == sa.size()); auto bwt = std::vector{}; bwt.resize(input.size()); From 67c58905496d5fe8868eaa7c0863d91ad01f6c18 Mon Sep 17 00:00:00 2001 From: Simon Gene Gottlieb Date: Tue, 9 May 2023 16:54:27 +0200 Subject: [PATCH 4/7] fixup: Bitvector wasn't populated correctly --- src/fmindex-collection/CSA.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fmindex-collection/CSA.h b/src/fmindex-collection/CSA.h index 2ab11e8f..3c59bee4 100644 --- a/src/fmindex-collection/CSA.h +++ b/src/fmindex-collection/CSA.h @@ -39,7 +39,10 @@ struct CSA { {} CSA(std::vector sa, size_t _samplingRate, std::span const> _inputSizes, bool reverse=false) - : samplingRate{_samplingRate} + : bv {sa.size(), [&](size_t idx) { + return (sa[idx] % _samplingRate) == 0; + }} + , samplingRate{_samplingRate} { size_t bitsForSeqId = std::max(size_t{1}, size_t(std::ceil(std::log2(_inputSizes.size())))); assert(bitsForSeqId < 64); @@ -79,9 +82,6 @@ struct CSA { } sa.resize(ssaI); ssa = std::move(sa); - bv = BitvectorCompact{sa.size(), [&](size_t idx) { - return (sa[idx] % samplingRate) == 0; - }}; } auto operator=(CSA const&) -> CSA& = delete; From d13c7ead8dbdff3c798083e0412f0ef17456bae0 Mon Sep 17 00:00:00 2001 From: Simon Gene Gottlieb Date: Fri, 12 May 2023 10:18:40 +0200 Subject: [PATCH 5/7] fix: missing conversion from int64 to uint64 --- src/example/utils/utils.h | 2 +- src/fmindex-collection/DenseCSA.h | 2 +- src/test_fmindex-collection/utils.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/example/utils/utils.h b/src/example/utils/utils.h index 4537779a..c4d6021c 100644 --- a/src/example/utils/utils.h +++ b/src/example/utils/utils.h @@ -21,7 +21,7 @@ -inline auto construct_bwt_from_sa(std::vector const& sa, std::string_view const& text) -> std::vector { +inline auto construct_bwt_from_sa(std::vector const& sa, std::string_view const& text) -> 
std::vector { assert(sa.size() == text.size()); std::vector bwt; bwt.resize(text.size()); diff --git a/src/fmindex-collection/DenseCSA.h b/src/fmindex-collection/DenseCSA.h index 1120d67c..29ef4018 100644 --- a/src/fmindex-collection/DenseCSA.h +++ b/src/fmindex-collection/DenseCSA.h @@ -55,7 +55,7 @@ struct DenseCSA { , bv {cereal_tag{}} {} - DenseCSA(std::span sa, size_t _samplingRate, std::span const> _inputSizes, bool reverse=false) + DenseCSA(std::span sa, size_t _samplingRate, std::span const> _inputSizes, bool reverse=false) : ssaPos{cereal_tag{}} , ssaSeq{cereal_tag{}} , bv {cereal_tag{}} diff --git a/src/test_fmindex-collection/utils.cpp b/src/test_fmindex-collection/utils.cpp index dbbbbfb6..75f80301 100644 --- a/src/test_fmindex-collection/utils.cpp +++ b/src/test_fmindex-collection/utils.cpp @@ -6,7 +6,7 @@ TEST_CASE("check creation of suffix array", "[createSA]") { auto input = std::vector{'H', 'a', 'l', 'l', 'o', ' ', 'W', 'e', 'l', 't', '\0', '\0'}; - auto expected = std::vector{ 11, 10, 5, 0, 6, 1, 7, 2, 3, 8, 4, 9 }; + auto expected = std::vector{ 11, 10, 5, 0, 6, 1, 7, 2, 3, 8, 4, 9 }; auto output = fmindex_collection::createSA(input, 1); CHECK(output == expected); From c011979bac8304f9c0133d06b025843a25491452 Mon Sep 17 00:00:00 2001 From: Simon Gene Gottlieb Date: Thu, 29 Jun 2023 08:47:09 +0200 Subject: [PATCH 6/7] fix: add include guard for omp --- src/fmindex-collection/utils.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/fmindex-collection/utils.h b/src/fmindex-collection/utils.h index 28a780b1..2708a316 100644 --- a/src/fmindex-collection/utils.h +++ b/src/fmindex-collection/utils.h @@ -12,7 +12,10 @@ #include #include #include -#include + +#if LIBSAIS_OPENMP +# include +#endif namespace fmindex_collection { From 28407b5b6c200324c5aeed13f8962e256caf3e91 Mon Sep 17 00:00:00 2001 From: Simon Gene Gottlieb Date: Thu, 29 Jun 2023 10:21:32 +0200 Subject: [PATCH 7/7] fix: ci macos --- .github/workflows/ci.yml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fd8a4d93..75f1464b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -80,7 +80,7 @@ jobs: - name: Install Tools on Mac run: | touch ${HOME}/.activate_brew - brew update + brew update-reset brew install --force-bottle --overwrite fmt boost cmake ${{ matrix.brew_pkgs }} pkg-config if: matrix.osname == 'MacOS 11'