Skip to content

Commit

Permalink
Merge pull request #22 from SGSSGene/fix/improve_construction2
Browse files Browse the repository at this point in the history
Fix/improve construction2
  • Loading branch information
SGSSGene authored Jun 29, 2023
2 parents 6a9ceef + 28407b5 commit a419ab9
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 52 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ jobs:
- name: Install Tools on Mac
run: |
touch ${HOME}/.activate_brew
brew update
brew update-reset
brew install --force-bottle --overwrite fmt boost cmake ${{ matrix.brew_pkgs }} pkg-config
if: matrix.osname == 'MacOS 11'

Expand Down
2 changes: 1 addition & 1 deletion src/example/utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@



inline auto construct_bwt_from_sa(std::vector<int64_t> const& sa, std::string_view const& text) -> std::vector<uint8_t> {
inline auto construct_bwt_from_sa(std::vector<uint64_t> const& sa, std::string_view const& text) -> std::vector<uint8_t> {
assert(sa.size() == text.size());
std::vector<uint8_t> bwt;
bwt.resize(text.size());
Expand Down
35 changes: 13 additions & 22 deletions src/fmindex-collection/CSA.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,11 @@ struct CSA {
: bv {cereal_tag{}}
{}

CSA(std::span<int64_t const> sa, size_t _samplingRate, std::span<std::tuple<size_t, size_t> const> _inputSizes, bool reverse=false)
: samplingRate{_samplingRate}
CSA(std::vector<uint64_t> sa, size_t _samplingRate, std::span<std::tuple<size_t, size_t> const> _inputSizes, bool reverse=false)
: bv {sa.size(), [&](size_t idx) {
return (sa[idx] % _samplingRate) == 0;
}}
, samplingRate{_samplingRate}
{
size_t bitsForSeqId = std::max(size_t{1}, size_t(std::ceil(std::log2(_inputSizes.size()))));
assert(bitsForSeqId < 64);
Expand All @@ -62,24 +65,14 @@ struct CSA {
accInputSizes.emplace_back(accInputSizes.back() + len + delCt);
}

// Annotate text with labels, naming the correct sequence id
auto labels = std::vector<uint64_t>{};
labels.reserve(sa.size() / samplingRate);

for (size_t i{0}, subjId{0}; i < sa.size(); i += samplingRate) {
while (i >= accInputSizes[subjId]) {
subjId += 1;
}
labels.emplace_back(subjId-1);
}

// Construct sampled suffix array
auto ssa = std::vector<uint64_t>{};
ssa.reserve(sa.size() / _samplingRate);
size_t ssaI{}; // Index of the ssa that is inside of sa
for (size_t i{0}; i < sa.size(); ++i) {
bool sample = (sa[i] % samplingRate) == 0;
if (sample) {
auto subjId = labels[sa[i] / samplingRate];
// find subject id
auto iter = std::upper_bound(accInputSizes.begin(), accInputSizes.end(), sa[i]);
size_t subjId = std::distance(accInputSizes.begin(), iter) - 1;
auto subjPos = sa[i] - accInputSizes[subjId];
if (reverse) {
auto [len, delCt] = _inputSizes[subjId];
Expand All @@ -89,16 +82,14 @@ struct CSA {
subjPos = len+1;
}
}
ssa.emplace_back(subjPos | (subjId << bitsForPosition));
sa[ssaI] = subjPos | (subjId << bitsForPosition);
++ssaI;
}
}
this->ssa = std::move(ssa);
this->bv = BitvectorCompact{sa.size(), [&](size_t idx) {
return (sa[idx] % samplingRate) == 0;
}};
sa.resize(ssaI);
ssa = std::move(sa);
}


auto operator=(CSA const&) -> CSA& = delete;
auto operator=(CSA&& _other) noexcept -> CSA& = default;

Expand Down
19 changes: 6 additions & 13 deletions src/fmindex-collection/DenseCSA.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ struct DenseCSA {
, bv {cereal_tag{}}
{}

DenseCSA(std::span<int64_t const> sa, size_t _samplingRate, std::span<std::tuple<size_t, size_t> const> _inputSizes, bool reverse=false)
DenseCSA(std::span<uint64_t const> sa, size_t _samplingRate, std::span<std::tuple<size_t, size_t> const> _inputSizes, bool reverse=false)
: ssaPos{cereal_tag{}}
, ssaSeq{cereal_tag{}}
, bv {cereal_tag{}}
Expand All @@ -81,26 +81,19 @@ struct DenseCSA {
largestText = std::max(largestText, len);
}

// Annotate text with labels, naming the correct sequence id
auto labels = std::vector<uint64_t>{};
labels.reserve(sa.size() / samplingRate);

for (size_t i{0}, subjId{0}; i < sa.size(); i += samplingRate) {
while (i >= accInputSizes[subjId]) {
subjId += 1;
}
labels.emplace_back(subjId-1);
}

// Construct sampled suffix array
size_t bitsForPos = std::max(size_t{1}, size_t(std::ceil(std::log2(largestText))));

ssaPos = DenseVector(bitsForPos);
ssaSeq = DenseVector(bitsForSeqId);
ssaPos.reserve(sa.size() / _samplingRate);
ssaSeq.reserve(sa.size() / _samplingRate);
for (size_t i{0}; i < sa.size(); ++i) {
bool sample = (sa[i] % samplingRate) == 0;
if (sample) {
auto subjId = labels[sa[i] / samplingRate];
// find subject id
auto iter = std::upper_bound(accInputSizes.begin(), accInputSizes.end(), sa[i]);
size_t subjId = std::distance(accInputSizes.begin(), iter) - 1;
auto subjPos = sa[i] - accInputSizes[subjId];
if (reverse) {
auto [len, delCt] = _inputSizes[subjId];
Expand Down
34 changes: 20 additions & 14 deletions src/fmindex-collection/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,30 @@
#include <tuple>
#include <vector>

#if LIBSAIS_OPENMP
# include <omp.h>
#endif

namespace fmindex_collection {

inline auto createSA(std::span<uint8_t const> input, size_t threadNbr) -> std::vector<int64_t> {
auto sa = std::vector<int64_t>(input.size());
inline auto createSA(std::span<uint8_t const> input, size_t threadNbr) -> std::vector<uint64_t> {
auto sa = std::vector<uint64_t>(input.size());
if (input.size() == 0) {
return sa;
}
#if LIBSAIS_OPENMP
auto r = libsais64_omp(input.data(), sa.data(), input.size(), 0, nullptr, threadNbr);
auto r = libsais64_omp(input.data(), reinterpret_cast<int64_t*>(sa.data()), input.size(), 0, nullptr, threadNbr);
#else
(void)threadNbr; // Unused if no openmp is available
auto r = libsais64(input.data(), sa.data(), input.size(), 0, nullptr);
auto r = libsais64(input.data(), reinterpret_cast<int64_t*>(sa.data()), input.size(), 0, nullptr);
#endif

if (r != 0) { throw std::runtime_error("something went wrong constructing the SA"); }
return sa;
}


inline auto createBWT(std::span<uint8_t const> input, std::span<int64_t const> sa) -> std::vector<uint8_t> {
inline auto createBWT(std::span<uint8_t const> input, std::span<uint64_t const> sa) -> std::vector<uint8_t> {
assert(input.size() == sa.size());
auto bwt = std::vector<uint8_t>{};
bwt.resize(input.size());
Expand All @@ -52,7 +56,7 @@ auto createSequences(Sequences auto const& _input, int samplingRate, bool revers
// compute total numbers of bytes of the text including delimiters "$"
size_t totalSize{};
for (auto const& l : _input) {
auto textLen = l.size();
auto textLen = l.size();
auto delimLen = samplingRate - textLen % samplingRate; // Make sure it is always a multiple of samplingRate
totalSize += textLen + delimLen;
}
Expand All @@ -65,16 +69,11 @@ auto createSequences(Sequences auto const& _input, int samplingRate, bool revers
auto inputSizes = std::vector<std::tuple<size_t, size_t>>{};
inputSizes.reserve(_input.size());


for (auto const& l : _input) {
auto ls = l.size();
// number of delimiters ('$') which need to be added. It must be at least one, and it
// has to make sure the text will be a multiple of samplingRate
size_t delimCount = samplingRate - (ls % samplingRate);
inputText.resize(inputText.size() + ls + delimCount, 0);

if (not reverse) {
std::ranges::copy(l, end(inputText) - ls - delimCount);
inputText.insert(inputText.end(), begin(l), end(l));
} else {
//!TODO hack for clang, broken in clang 15
#if __clang__
Expand All @@ -83,10 +82,17 @@ auto createSequences(Sequences auto const& _input, int samplingRate, bool revers
#else
auto l2 = std::views::reverse(l);
#endif
std::ranges::copy(l2, end(inputText) - ls - delimCount);
inputText.insert(inputText.end(), begin(l2), end(l2));
}

inputSizes.emplace_back(l.size(), delimCount);
// number of delimiters ('$') which need to be added. It must be at least one, and it
// has to make sure the text will be a multiple of samplingRate
size_t delimCount = samplingRate - (ls % samplingRate);

// fill with delimiters/zeros
inputText.resize(inputText.size() + delimCount);

inputSizes.emplace_back(ls, delimCount);
}
return {totalSize, inputText, inputSizes};
}
Expand Down
2 changes: 1 addition & 1 deletion src/test_fmindex-collection/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

TEST_CASE("check creation of suffix array", "[createSA]") {
auto input = std::vector<uint8_t>{'H', 'a', 'l', 'l', 'o', ' ', 'W', 'e', 'l', 't', '\0', '\0'};
auto expected = std::vector<int64_t>{ 11, 10, 5, 0, 6, 1, 7, 2, 3, 8, 4, 9 };
auto expected = std::vector<uint64_t>{ 11, 10, 5, 0, 6, 1, 7, 2, 3, 8, 4, 9 };

auto output = fmindex_collection::createSA(input, 1);
CHECK(output == expected);
Expand Down

0 comments on commit a419ab9

Please sign in to comment.