Skip to content

Commit

Permalink
Merge pull request #29 from SGSSGene/fix/search_schemes
Browse files Browse the repository at this point in the history
Fix/search schemes
  • Loading branch information
SGSSGene authored Sep 14, 2023
2 parents 1b21e5e + 75712d6 commit ff6ee9f
Show file tree
Hide file tree
Showing 10 changed files with 170 additions and 119 deletions.
10 changes: 5 additions & 5 deletions src/example/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,10 @@ int main(int argc, char const* const* argv) {
throw std::runtime_error("unknown search scheme generetaror \"" + config.generator + "\"");
}
auto len = mut_queries[0].size();
auto oss = iter->second(0, k, 0, 0); //!TODO last two parameters of second are not being used
auto oss = iter->second.generator(0, k, 0, 0); //!TODO last two parameters of second are not being used
auto ess = search_schemes::expand(oss, len);
auto dss = search_schemes::expandDynamic(oss, len, 4, 3'000'000'000); //!TODO use correct Sigma and text size
fmt::print("ss diff: {} to {}, using dyn: {}\n", search_schemes::expectedNodeCount(ess, 4, 3'000'000'000), search_schemes::expectedNodeCount(dss, 4, 3'000'000'000), config.generator_dyn);
auto dss = search_schemes::expandByWNC</*Edit=*/true>(oss, len, 4, 3'000'000'000); //!TODO use correct Sigma and text size
fmt::print("ss diff: {} to {}, using dyn: {}\n", search_schemes::weightedNodeCount</*Edit=*/false>(ess, 4, 3'000'000'000), search_schemes::weightedNodeCount</*Edit=*/false>(dss, 4, 3'000'000'000), config.generator_dyn);
if (!config.generator_dyn) {
return ess;
} else {
Expand All @@ -145,9 +145,9 @@ int main(int argc, char const* const* argv) {
throw std::runtime_error("unknown search scheme generetaror \"" + config.generator + "\"");
}
auto len = mut_queries[0].size();
auto oss = iter->second(j, j, 0, 0); //!TODO last two parameters of second are not being used
auto oss = iter->second.generator(j, j, 0, 0); //!TODO last two parameters of second are not being used
auto ess = search_schemes::expand(oss, len);
auto dss = search_schemes::expandDynamic(oss, len, 4, 3'000'000'000); //!TODO use correct Sigma and text size
auto dss = search_schemes::expandByWNC</*Edit=*/true>(oss, len, 4, 3'000'000'000); //!TODO use correct Sigma and text size
if (!config.generator_dyn) {
return ess;
} else {
Expand Down
6 changes: 6 additions & 0 deletions src/fmindex-collection/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# -----------------------------------------------------------------------------------------------------
# Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
# Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
# This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
# shipped with this file.
# -----------------------------------------------------------------------------------------------------
cmake_minimum_required (VERSION 3.8)

project(fmindex-collection)
Expand Down
14 changes: 5 additions & 9 deletions src/run_search_schemes/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,7 @@
// shipped with this file.
// -----------------------------------------------------------------------------------------------------
#include <fmt/format.h>
#include <search_schemes/generator/all.h>
#include <search_schemes/nodeCount.h>
#include <search_schemes/expectedNodeCount.h>
#include <search_schemes/expand.h>
#include <search_schemes/isComplete.h>
#include <search_schemes/search_schemes.h>

int main(int argc, char** argv) {
if (argc != 4) {
Expand All @@ -27,7 +23,7 @@ int main(int argc, char** argv) {
auto K = std::stod(argv[2]);
auto gen = argv[3];
{
auto oss = search_schemes::generator::all.at(gen)(0, K, 0, 0);
auto oss = search_schemes::generator::all.at(gen).generator(0, K, 0, 0);
if (oss.size() == 0) return 0;

// for (auto s : oss) {
Expand All @@ -40,12 +36,12 @@ int main(int argc, char** argv) {
auto ss = search_schemes::expand(oss, len);

auto nc = [&](auto ss) {
return search_schemes::expectedNodeCount(ss, 4, 3'000'000'000);
return search_schemes::weightedNodeCount</*Edit=*/false>(ss, 4, 3'000'000'000);
};
auto nce = [&](auto ss) {
return search_schemes::expectedNodeCountEdit(ss, 4, 3'000'000'000);
return search_schemes::weightedNodeCount</*Edit=*/true>(ss, 4, 3'000'000'000);
};
auto dss = search_schemes::expandDynamic(oss, len, 4, 3'000'000'000);
auto dss = search_schemes::expandByWNC</*Edit=*/true>(oss, len, 4, 3'000'000'000);


// fmt::print("ess:\n");
Expand Down
35 changes: 32 additions & 3 deletions src/search_schemes/expand.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

#include "Scheme.h"
#include "isValid.h"
#include "expectedNodeCount.h"
#include "nodeCount.h"
#include "weightedNodeCount.h"

#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -174,6 +175,7 @@ inline auto expand(Search s, std::vector<size_t> parts) -> std::optional<Search>
}
return {r};
}

inline auto expand(Scheme ss, std::vector<size_t> parts) -> Scheme {
auto r = Scheme{};
for (auto const& s : ss) {
Expand All @@ -185,7 +187,34 @@ inline auto expand(Scheme ss, std::vector<size_t> parts) -> Scheme {
return r;
}

inline auto expandDynamic(Scheme ss, size_t _newLen, size_t sigma, size_t N) -> Scheme {
template <bool Edit=false>
auto expandByNC(Scheme ss, size_t _newLen, size_t sigma) -> Scheme {
if (ss.size() == 0) return {};
auto additionalPos = _newLen - ss[0].pi.size();
auto counts = std::vector<size_t>(ss[0].pi.size(), 1);

for (size_t i{0}; i<additionalPos; ++i) {
double bestVal = std::numeric_limits<double>::max();
size_t bestPos = 0;
for (size_t j{0}; j < ss[0].pi.size(); ++j) {
counts[j] += 1;
auto ess = expand(ss, counts);
counts[j] -= 1;
auto f = nodeCount<Edit>(ess, sigma);
if (f < bestVal) {
bestVal = f;
bestPos = j;
}
}
counts[bestPos] += 1;
}

return expand(ss, counts);
}

template <bool Edit=false>
auto expandByWNC(Scheme ss, size_t _newLen, size_t sigma, size_t N) -> Scheme {
if (ss.size() == 0) return {};
auto additionalPos = _newLen - ss[0].pi.size();
auto counts = std::vector<size_t>(ss[0].pi.size(), 1);

Expand All @@ -196,7 +225,7 @@ inline auto expandDynamic(Scheme ss, size_t _newLen, size_t sigma, size_t N) ->
counts[j] += 1;
auto ess = expand(ss, counts);
counts[j] -= 1;
auto f = expectedNodeCountEdit(ess, sigma, N);
auto f = weightedNodeCount<Edit>(ess, sigma, N);
if (f < bestVal) {
bestVal = f;
bestPos = j;
Expand Down
81 changes: 66 additions & 15 deletions src/search_schemes/generator/all.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,76 @@
#include <functional>
#include <string>
#include <tuple>
#include <vector>
#include <map>
#include <unordered_map>
#include <vector>

namespace search_schemes::generator {

inline auto all = std::map<std::string, std::function<Scheme(int, int, int, int)>>{
{ "backtracking", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return backtracking(1, minError, maxError); }},
{ "optimum", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return optimum(minError, maxError); }},
{ "01*0", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return zeroOnesZero_trivial(minError, maxError); }},
{ "01*0_opt", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return zeroOnesZero_opt(minError, maxError); }},
{ "pigeon", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return pigeon_trivial(minError, maxError); }},
{ "pigeon_opt", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return pigeon_opt(minError, maxError); }},
{ "suffix", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return suffixFilter(maxError+1, minError, maxError); }},
{ "kianfar", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return kianfar(maxError); }},
{ "kucherov-k1", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return kucherov(maxError+1, maxError); }},
{ "kucherov-k2", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return kucherov(maxError+2, maxError); }},
{ "h2-k1", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return h2(maxError+1, minError, maxError); }},
{ "h2-k2", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return h2(maxError+2, minError, maxError); }},
{ "h2-k3", []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return h2(maxError+3, minError, maxError); }},
struct GeneratorEntry {
std::string name;
std::string description;
std::function<Scheme(int minError, int maxError, int sigma, int refSize)> generator;
};

inline auto all = []() {
auto res = std::map<std::string, GeneratorEntry>{};
auto add = [&](GeneratorEntry entry) {
res.try_emplace(entry.name, entry);
};
add({ .name = "backtracking",
.description = "simple backtracking, not utilisying the bidirectional fm-index or search schemes",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return backtracking(1, minError, maxError); }
});
add({ .name = "optimum",
.description = "known optimim search schemes",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return optimum(minError, maxError); }
});
add({ .name = "01*0",
.description = "based on 01*0 seeds",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return zeroOnesZero_trivial(minError, maxError); }
});
add({ .name = "01*0_opt",
.description = "based on 01*0 seeds, but joining search schemes with same part order",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return zeroOnesZero_opt(minError, maxError); }
});
add({ .name = "pigeon",
.description = "based on the pigeon hole principle",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return pigeon_trivial(minError, maxError); }
});
add({ .name = "pigeon_opt",
.description = "based on the pigeon hole principle, removing duplicate paths",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return pigeon_opt(minError, maxError); }
});
add({ .name = "suffix",
.description = "based on suffix filter",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return suffixFilter(maxError+1, minError, maxError); }
});
add({ .name = "kianfar",
.description = "designed by kianfar (?)",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return kianfar(maxError); }
});
add({ .name = "kucherov-k1",
.description = "designed by kucherov, divided into k+1 pieces (?)",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return kucherov(maxError+1, maxError); }
});
add({ .name = "kucherov-k2",
.description = "designed by kucherov, divided into k+2 pieces (?)",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return kucherov(maxError+2, maxError); }
});
add({ .name = "h2-k1",
.description = "designed by gottlieb, divided into k+1 pieces",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return h2(maxError+1, minError, maxError); }
});
add({ .name = "h2-k2",
.description = "designed by gottlieb, divided into k+2 pieces",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return h2(maxError+2, minError, maxError); }
});
add({ .name = "h2-k3",
.description = "designed by gottlieb, divided into k+3 pieces",
.generator = []([[maybe_unused]] int minError, [[maybe_unused]] int maxError, [[maybe_unused]] int sigma, [[maybe_unused]] int dbSize) { return h2(maxError+3, minError, maxError); }
});
return res;
}();

}
1 change: 1 addition & 0 deletions src/search_schemes/isComplete.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ void generateErrorConfig(CB cb, size_t len, size_t minK, size_t maxK) {
*
*/
inline auto isComplete(Scheme const& ss, size_t minK, size_t maxK) -> bool {
if (ss.empty()) return false;
bool complete{true};
auto len = ss.at(0).pi.size();
generateErrorConfig([&](ErrorConfig const& config) {
Expand Down
54 changes: 16 additions & 38 deletions src/search_schemes/nodeCount.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@

namespace search_schemes {

inline long double nodeCount(Search s, size_t sigma) {
/**
* If its not edit distance, its Hamming distance
*/
template <bool Edit>
long double nodeCount(Search s, size_t sigma) {
auto n_max = s.pi.size();
auto e = *std::max_element(begin(s.u), end(s.u));

Expand All @@ -29,7 +33,11 @@ inline long double nodeCount(Search s, size_t sigma) {
if (s.l[n-1] <= i and i <= s.u[n-1]) {
newArray[i] = lastArray[i];
if (i > 0) {
newArray[i] += (sigma-1) * lastArray[i-1];
if constexpr (Edit) {
newArray[i] += (sigma-1) * lastArray[i-1] + (sigma) * lastArray[i-1] + lastArray[i-1];
} else {
newArray[i] += (sigma-1) * lastArray[i-1];
}
}
acc += newArray[i];
} else {
Expand All @@ -41,43 +49,13 @@ inline long double nodeCount(Search s, size_t sigma) {
return acc;
}

inline long double nodeCount(Scheme const& ss, size_t sigma) {
/**
* If its not edit distance, its Hamming distance
*/
template <bool Edit>
long double nodeCount(Scheme const& ss, size_t sigma) {
return std::accumulate(begin(ss), end(ss), static_cast<long double>(0.), [&](long double v, auto const& s) {
return v + nodeCount(s, sigma);
return v + nodeCount<Edit>(s, sigma);
});
}

inline long double nodeCountEdit(Search s, size_t sigma) {
auto n_max = s.pi.size();
auto e = *std::max_element(begin(s.u), end(s.u));

auto lastArray = std::vector<long double>(e+1, 0);
lastArray[0] = 1;

long double acc = 0;

auto newArray = std::vector<long double>(e+1, 0);
for (size_t n {1}; n <= n_max; ++n) {
for (size_t i{0}; i < e+1; ++i) {
if (s.l[n-1] <= i and i <= s.u[n-1]) {
newArray[i] = lastArray[i];
if (i > 0) {
newArray[i] += (sigma-1) * lastArray[i-1] + (sigma) * lastArray[i-1] + lastArray[i-1];
}
acc += newArray[i];
} else {
newArray[i] = 0;
}
}
std::swap(newArray, lastArray);
}
return acc;
}

inline long double nodeCountEdit(Scheme const& ss, int sigma) {
return std::accumulate(begin(ss), end(ss), static_cast<long double>(0.), [&](long double v, auto const& s) {
return v + nodeCountEdit(s, sigma);
});
}

}
16 changes: 16 additions & 0 deletions src/search_schemes/search_schemes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file.
// -----------------------------------------------------------------------------------------------------
#pragma once

#include "Scheme.h"
#include "Search.h"
#include "expand.h"
#include "generator/all.h"
#include "isComplete.h"
#include "isValid.h"
#include "nodeCount.h"
#include "weightedNodeCount.h"
Loading

0 comments on commit ff6ee9f

Please sign in to comment.