diff --git a/src/fmindex-collection/utils.h b/src/fmindex-collection/utils.h index 0746f90c..aeef9043 100644 --- a/src/fmindex-collection/utils.h +++ b/src/fmindex-collection/utils.h @@ -170,4 +170,73 @@ auto createSequences_32(Sequences auto const& _input, int samplingRate, bool rev } +inline auto createSA_32(std::span input, size_t threadNbr) -> std::vector { + auto sa = std::vector(input.size()); + if (input.size() == 0) { + return sa; + } +#if LIBSAIS_OPENMP + auto r = libsais_int_omp((int32_t*)input.data(), sa.data(), input.size(), 0, nullptr, threadNbr); +#else + (void)threadNbr; // Unused if no openmp is available + auto r = libsais_int((int32_t*)input.data(), sa.data(), input.size(), 65536, 0); +#endif + + if (r != 0) { throw std::runtime_error("something went wrong constructing the SA"); } + return sa; +} + + +inline auto createBWT_32(std::span input, std::span sa) -> std::vector { + assert(input.size() == sa.size()); + auto bwt = std::vector{}; + bwt.resize(input.size()); + for (size_t i{0}; i < sa.size(); ++i) { + bwt[i] = input[(sa[i] + input.size() - 1) % input.size()]; + } + return bwt; +} + +auto createSequences_32(Sequences auto const& _input, int samplingRate, bool reverse=false) -> std::tuple, std::vector>> { + // compute total numbers of bytes of the text including delimiters "$" + size_t totalSize{}; + for (auto const& l : _input) { + auto textLen = l.size(); + auto delimLen = samplingRate - textLen % samplingRate; // Make sure it is always a multiple of samplingRate + totalSize += textLen + delimLen; + } + + // our concatenated sequences with delimiters + auto inputText = std::vector{}; + inputText.reserve(totalSize); + + // list of sizes of the individual sequences + auto inputSizes = std::vector>{}; + inputSizes.reserve(_input.size()); + + + for (auto const& l : _input) { + auto ls = l.size(); + // number of delimiters ('$') which need to be added. It must be at least one, and it + // has to make sure the text will be a multiple of samplingRate + size_t delimCount = samplingRate - (ls % samplingRate); + inputText.resize(inputText.size() + ls + delimCount, 0); + + if (not reverse) { + std::ranges::copy(l, end(inputText) - ls - delimCount); + } else { +//!TODO hack for clang, broken in clang 15 +#if __clang__ + auto l2 = std::vector(l); + std::ranges::reverse(l2); +#else + auto l2 = std::views::reverse(l); +#endif + std::ranges::copy(l2, end(inputText) - ls - delimCount); + } + + inputSizes.emplace_back(l.size(), delimCount); + } + return {totalSize, inputText, inputSizes}; +} }