Skip to content

Commit

Permalink
Fixed word size issues for searching with q-gram compression unit tests, added new tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
MrAlexSee committed Jan 26, 2019
1 parent b5b0f5c commit 68c8d22
Show file tree
Hide file tree
Showing 4 changed files with 239 additions and 15 deletions.
5 changes: 4 additions & 1 deletion unit_tests/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ INCLUDE = -I$(BOOST_DIR)
TEST_FILES = catch.hpp repeat.hpp

EXE = main_tests
OBJ = main_tests.o hash_map_aligned_tests.o split_index_1_tests.o split_index_1_searching_tests.o split_index_1_comp_tests.o split_index_1_comp_triple_tests.o split_index_k_tests.o split_index_k_searching_tests.o utils_distance_tests.o utils_file_io_tests.o utils_string_utils_tests.o
OBJ = main_tests.o hash_map_aligned_tests.o split_index_1_tests.o split_index_1_searching_tests.o split_index_1_comp_searching_tests.o split_index_1_comp_tests.o split_index_1_comp_triple_tests.o split_index_k_tests.o split_index_k_searching_tests.o utils_distance_tests.o utils_file_io_tests.o utils_string_utils_tests.o

HASH_FUNCTION_LIB = hash_function.a
HASH_MAP_LIB = hash_map.a
Expand Down Expand Up @@ -44,6 +44,9 @@ split_index_1_tests.o: split_index_1_tests.cpp ../src/index/split_index.* ../src
split_index_1_searching_tests.o: split_index_1_searching_tests.cpp ../src/index/split_index.* ../src/index/split_index_1.* $(TEST_FILES)
$(CC) $(CCFLAGS) $(OPTFLAGS) $(INCLUDE) -c split_index_1_searching_tests.cpp

split_index_1_comp_searching_tests.o: split_index_1_comp_searching_tests.cpp ../src/index/split_index.* ../src/index/split_index_1.* $(TEST_FILES)
$(CC) $(CCFLAGS) $(OPTFLAGS) $(INCLUDE) -c split_index_1_comp_searching_tests.cpp

split_index_1_comp_tests.o: split_index_1_comp_tests.cpp ../src/index/split_index.* ../src/index/split_index_1.* split_index_1_comp_whitebox.hpp $(TEST_FILES)
$(CC) $(CCFLAGS) $(OPTFLAGS) $(INCLUDE) -c split_index_1_comp_tests.cpp

Expand Down
227 changes: 227 additions & 0 deletions unit_tests/split_index_1_comp_searching_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
#include "catch.hpp"
#include "repeat.hpp"

#include "../src/index/split_index_1_comp.hpp"
#include "../src/index/split_index_1_comp_triple.hpp"

using namespace split_index;
using namespace std;

namespace split_index
{

namespace
{

// Hash function used to construct every index under test.
hash_functions::HashFunctions::HashType hashType = hash_functions::HashFunctions::HashType::XxHash;
// Each search is repeated for nIter = 1..maxNIter; results must not depend on the iteration count.
constexpr int maxNIter = 10;

}

// Note that words for these tests have at least 3 characters.
// This is a prerequisite for 2-gram coding.
// 3- and 4-gram coding is handled automatically, since word length checks are required in their case anyway.

TEST_CASE("is searching compression empty patterns correct", "[split_index_1_comp_searching]")
{
    // Searching with an empty pattern list must return an empty result set,
    // regardless of the number of search iterations.
    const unordered_set<string> wordSet { "ala", "kota", "jarek", "lubi", "psy" };

    SplitIndex *indexes[] = {
        new SplitIndex1Comp(wordSet, hashType, 1.0f),
        new SplitIndex1CompTriple(wordSet, hashType, 1.0f) };

    for (SplitIndex *curIndex : indexes)
    {
        curIndex->construct();

        for (int nIter = 1; nIter <= maxNIter; ++nIter)
        {
            REQUIRE(curIndex->search({ }, nIter).empty());
        }

        delete curIndex;
    }
}

TEST_CASE("is searching compression words exact correct", "[split_index_1_comp_searching]")
{
    const vector<string> words { "ala", "kota", "jarek", "lubi", "psy" };
    const vector<string> patternsOut { "not", "this", "dict" };

    // Querying all dictionary words at once must return exactly the dictionary.
    const SplitIndex::ResultSetType expected(words.begin(), words.end());

    SplitIndex *indexes[] = {
        new SplitIndex1Comp({ words.begin(), words.end() }, hashType, 1.0f),
        new SplitIndex1CompTriple({ words.begin(), words.end() }, hashType, 1.0f) };

    for (SplitIndex *curIndex : indexes)
    {
        curIndex->construct();

        for (int nIter = 1; nIter <= maxNIter; ++nIter)
        {
            REQUIRE(curIndex->search(words, nIter) == expected);
            REQUIRE(curIndex->search(patternsOut, nIter).empty());
        }

        delete curIndex;
    }
}

TEST_CASE("is searching compression words exact one-by-one correct", "[split_index_1_comp_searching]")
{
    const unordered_set<string> wordSet { "ala", "kota", "jarek", "lubi", "psy" };
    const vector<string> patternsOut { "not", "this", "dict" };

    SplitIndex *indexes[] = {
        new SplitIndex1Comp(wordSet, hashType, 1.0f),
        new SplitIndex1CompTriple(wordSet, hashType, 1.0f) };

    for (SplitIndex *curIndex : indexes)
    {
        curIndex->construct();

        for (int nIter = 1; nIter <= maxNIter; ++nIter)
        {
            // Each dictionary word queried alone must return exactly itself.
            for (const string &dictWord : wordSet)
            {
                REQUIRE(curIndex->search({ dictWord }, nIter) == SplitIndex::ResultSetType{ dictWord });
            }

            // Out-of-dictionary patterns must not match anything.
            for (const string &outPattern : patternsOut)
            {
                REQUIRE(curIndex->search({ outPattern }, nIter).empty());
            }
        }

        delete curIndex;
    }
}

TEST_CASE("is searching compression words for k = 1 for 1 error correct", "[split_index_1_comp_searching]")
{
    const unordered_set<string> wordSet { "ala", "kota", "jarek", "psa" };

    SplitIndex *indexes[] = {
        new SplitIndex1Comp(wordSet, hashType, 1.0f),
        new SplitIndex1CompTriple(wordSet, hashType, 1.0f) };

    const int nIndexes = sizeof(indexes) / sizeof(indexes[0]);

    for (int iIndex = 0; iIndex < nIndexes; ++iIndex)
    {
        indexes[iIndex]->construct();

        // Generate the patterns afresh for each index. Previously the vector was
        // declared outside this loop and appended to inside it, so the second index
        // was searched with every pattern duplicated.
        vector<string> patternsIn;

        for (const string &word : wordSet)
        {
            // For each word, create one pattern per position with a single mismatch.
            for (size_t i = 0; i < word.size(); ++i)
            {
                string curWord = word;
                curWord[i] = 'N';

                patternsIn.push_back(move(curWord));
            }
        }

        for (int nIter = 1; nIter <= maxNIter; ++nIter)
        {
            // With k = 1, every 1-error pattern must match its source word, so the
            // combined result is exactly the whole dictionary.
            REQUIRE(indexes[iIndex]->search(patternsIn, nIter) == SplitIndex::ResultSetType(wordSet.begin(), wordSet.end()));
        }

        delete indexes[iIndex];
    }
}

TEST_CASE("is searching compression words for k = 1 for 1 error one-by-one correct", "[split_index_1_comp_searching]")
{
    const unordered_set<string> wordSet { "ala", "kota", "jarek", "psa" };

    SplitIndex *indexes[] = {
        new SplitIndex1Comp(wordSet, hashType, 1.0f),
        new SplitIndex1CompTriple(wordSet, hashType, 1.0f) };

    for (SplitIndex *curIndex : indexes)
    {
        curIndex->construct();

        for (const string &word : wordSet)
        {
            for (size_t iChar = 0; iChar < word.size(); ++iChar)
            {
                // Introduce a single mismatch at position iChar.
                string pattern = word;
                pattern[iChar] = 'N';

                for (int nIter = 1; nIter <= maxNIter; ++nIter)
                {
                    const SplitIndex::ResultSetType result = curIndex->search({ pattern }, nIter);

                    // Exactly one dictionary word is within 1 error of the pattern.
                    REQUIRE(result.size() == 1);
                    REQUIRE(wordSet.find(*result.begin()) != wordSet.end());
                }
            }
        }

        delete curIndex;
    }
}

TEST_CASE("is searching compression words for k = 1 for various number of mismatches correct", "[split_index_1_comp_searching]")
{
    const unordered_set<string> wordSet { "ala", "kota", "jarek", "psa", "bardzo", "lubie", "owoce" };

    SplitIndex *indexes[] = {
        new SplitIndex1Comp(wordSet, hashType, 1.0f),
        new SplitIndex1CompTriple(wordSet, hashType, 1.0f) };

    for (SplitIndex *curIndex : indexes)
    {
        curIndex->construct();

        for (int nIter = 1; nIter <= maxNIter; ++nIter)
        {
            // Patterns within 1 error of a dictionary word match; others do not.
            REQUIRE(curIndex->search({ "osa", "ada" }, nIter) == SplitIndex::ResultSetType{ "ala", "psa" });
            REQUIRE(curIndex->search({ "bbb", "ccc", "ddd" }, nIter).empty());

            REQUIRE(curIndex->search({ "darek" }, nIter) == SplitIndex::ResultSetType{ "jarek" });
            REQUIRE(curIndex->search({ "barek" }, nIter) == SplitIndex::ResultSetType{ "jarek" });
            REQUIRE(curIndex->search({ "darek", "japek", "jacek", "barek" }, nIter) == SplitIndex::ResultSetType{ "jarek" });
            REQUIRE(curIndex->search({ "czarek", "bapek", "kapek" }, nIter).empty());

            REQUIRE(curIndex->search({ "bardzo" }, nIter) == SplitIndex::ResultSetType{ "bardzo" });
            REQUIRE(curIndex->search({ "barzzo" }, nIter) == SplitIndex::ResultSetType{ "bardzo" });
            REQUIRE(curIndex->search({ "kardzo" }, nIter) == SplitIndex::ResultSetType{ "bardzo" });
            REQUIRE(curIndex->search({ "karzzo", "bordza" }, nIter).empty());
        }

        delete curIndex;
    }
}

TEST_CASE("is searching words equal to triple q-gram size correct", "[split_index_1_comp_searching]")
{
    const unordered_set<string> wordSet{ "ma", "ala", "tion" };

    // This assumes that there is at least one 4-gram used for encoding.

    SplitIndex1CompTriple index1(wordSet, hashType, 1.0f);
    index1.construct();

    // Each pattern (exact or with a single mismatch) must match the listed word.
    const vector<pair<string, string>> patternToMatch {
        { "ma", "ma" }, { "da", "ma" },
        { "ala", "ala" }, { "ada", "ala" },
        { "tion", "tion" }, { "twon", "tion" } };

    for (const pair<string, string> &cur : patternToMatch)
    {
        REQUIRE(index1.search({ cur.first }, 1) == SplitIndex::ResultSetType{ cur.second });
    }
}

} // namespace split_index
5 changes: 5 additions & 0 deletions unit_tests/split_index_1_comp_triple_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,4 +170,9 @@ TEST_CASE("is triple decoding to buffer correct", "[split_index_1_comp_triple]")
REQUIRE(size2 == 0); // Exceeded expected max word size.
}

TEST_CASE("is triple decoding to buffer with decreasing word length correct", "[split_index_1_comp_triple]")
{
    // TODO: implement — per the test name, this should presumably cover decoding a
    // word shorter than the previously decoded one into the same buffer; confirm the
    // intended scenario against SplitIndex1CompTriple's decoding routine.
}

} // namespace split_index
17 changes: 3 additions & 14 deletions unit_tests/split_index_1_searching_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
#include "repeat.hpp"

#include "../src/index/split_index_1.hpp"
#include "../src/index/split_index_1_comp.hpp"
#include "../src/index/split_index_1_comp_triple.hpp"
#include "../src/index/split_index_k.hpp"

using namespace split_index;
Expand All @@ -20,13 +18,14 @@ constexpr int maxNIter = 10;

}

// Note that words for these tests have at least 2 characters.
// This is a prerequisite for splitting into 2 parts.

TEST_CASE("is searching empty patterns correct", "[split_index_1_searching]")
{
const unordered_set<string> wordSet { "ala", "ma", "kota", "jarek", "lubi", "psy" };
SplitIndex *indexes[] = {
new SplitIndex1(wordSet, hashType, 1.0f),
new SplitIndex1Comp(wordSet, hashType, 1.0f),
new SplitIndex1CompTriple(wordSet, hashType, 1.0f),
new SplitIndexK<1>(wordSet, hashType, 1.0f) };

const int nIndexes = sizeof(indexes) / sizeof(indexes[0]);
Expand All @@ -51,8 +50,6 @@ TEST_CASE("is searching words exact correct", "[split_index_1_searching]")

SplitIndex *indexes[] = {
new SplitIndex1({ words.begin(), words.end() }, hashType, 1.0f),
new SplitIndex1Comp({ words.begin(), words.end() }, hashType, 1.0f),
new SplitIndex1CompTriple({ words.begin(), words.end() }, hashType, 1.0f),
new SplitIndexK<1>({ words.begin(), words.end() }, hashType, 1.0f) };

const int nIndexes = sizeof(indexes) / sizeof(indexes[0]);
Expand All @@ -78,8 +75,6 @@ TEST_CASE("is searching words exact one-by-one correct", "[split_index_1_searchi

SplitIndex *indexes[] = {
new SplitIndex1(wordSet, hashType, 1.0f),
new SplitIndex1Comp(wordSet, hashType, 1.0f),
new SplitIndex1CompTriple(wordSet, hashType, 1.0f),
new SplitIndexK<1>(wordSet, hashType, 1.0f) };

const int nIndexes = sizeof(indexes) / sizeof(indexes[0]);
Expand Down Expand Up @@ -112,8 +107,6 @@ TEST_CASE("is searching words for k = 1 for 1 error correct", "[split_index_1_se

SplitIndex *indexes[] = {
new SplitIndex1(wordSet, hashType, 1.0f),
new SplitIndex1Comp(wordSet, hashType, 1.0f),
new SplitIndex1CompTriple(wordSet, hashType, 1.0f),
new SplitIndexK<1>(wordSet, hashType, 1.0f) };

const int nIndexes = sizeof(indexes) / sizeof(indexes[0]);
Expand Down Expand Up @@ -148,8 +141,6 @@ TEST_CASE("is searching words for k = 1 for 1 error one-by-one correct", "[split

SplitIndex *indexes[] = {
new SplitIndex1(wordSet, hashType, 1.0f),
new SplitIndex1Comp(wordSet, hashType, 1.0f),
new SplitIndex1CompTriple(wordSet, hashType, 1.0f),
new SplitIndexK<1>(wordSet, hashType, 1.0f) };

const int nIndexes = sizeof(indexes) / sizeof(indexes[0]);
Expand Down Expand Up @@ -185,8 +176,6 @@ TEST_CASE("is searching words for k = 1 for various number of mismatches correct

SplitIndex *indexes[] = {
new SplitIndex1(wordSet, hashType, 1.0f),
new SplitIndex1Comp(wordSet, hashType, 1.0f),
new SplitIndex1CompTriple(wordSet, hashType, 1.0f),
new SplitIndexK<1>(wordSet, hashType, 1.0f) };

const int nIndexes = sizeof(indexes) / sizeof(indexes[0]);
Expand Down

0 comments on commit 68c8d22

Please sign in to comment.