-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fixed word-size issues for searching with q-gram compression in unit tests; added new tests.
- Loading branch information
Showing
4 changed files
with
239 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,227 @@ | ||
#include "catch.hpp"
#include "repeat.hpp"

#include <memory>

#include "../src/index/split_index_1_comp.hpp"
#include "../src/index/split_index_1_comp_triple.hpp"
|
||
using namespace split_index; | ||
using namespace std; | ||
|
||
namespace split_index | ||
{ | ||
|
||
namespace | ||
{ | ||
|
||
hash_functions::HashFunctions::HashType hashType = hash_functions::HashFunctions::HashType::XxHash; | ||
constexpr int maxNIter = 10; | ||
|
||
} | ||
|
||
// Note that words for these tests have at least 3 characters.
// This is a prerequisite for 2-gram coding.
// 3- and 4-gram coding are handled automatically, since those cases require word-length checks anyway.
|
||
TEST_CASE("is searching compression empty patterns correct", "[split_index_1_comp_searching]")
{
    const unordered_set<string> wordSet { "ala", "kota", "jarek", "lubi", "psy" };

    // unique_ptr instead of raw new/delete: a failing REQUIRE aborts the test
    // case via an exception, which previously skipped the manual deletes and
    // leaked the indexes.
    const unique_ptr<SplitIndex> indexes[] = {
        make_unique<SplitIndex1Comp>(wordSet, hashType, 1.0f),
        make_unique<SplitIndex1CompTriple>(wordSet, hashType, 1.0f) };

    for (const auto &index : indexes)
    {
        index->construct();

        // Searching an empty pattern list must yield an empty result set,
        // regardless of the number of iterations.
        for (int nIter = 1; nIter <= maxNIter; ++nIter)
        {
            REQUIRE(index->search({ }, nIter).empty());
        }
    }
}
|
||
TEST_CASE("is searching compression words exact correct", "[split_index_1_comp_searching]")
{
    const vector<string> words { "ala", "kota", "jarek", "lubi", "psy" };
    const vector<string> patternsOut { "not", "this", "dict" };

    // unique_ptr instead of raw new/delete: a failing REQUIRE aborts the test
    // case via an exception, which previously skipped the manual deletes and
    // leaked the indexes.
    const unique_ptr<SplitIndex> indexes[] = {
        make_unique<SplitIndex1Comp>(unordered_set<string>(words.begin(), words.end()), hashType, 1.0f),
        make_unique<SplitIndex1CompTriple>(unordered_set<string>(words.begin(), words.end()), hashType, 1.0f) };

    for (const auto &index : indexes)
    {
        index->construct();

        for (int nIter = 1; nIter <= maxNIter; ++nIter)
        {
            // Every dictionary word matches itself; out-of-dictionary
            // patterns match nothing.
            REQUIRE(index->search(words, nIter) == SplitIndex::ResultSetType(words.begin(), words.end()));
            REQUIRE(index->search(patternsOut, nIter).empty());
        }
    }
}
|
||
TEST_CASE("is searching compression words exact one-by-one correct", "[split_index_1_comp_searching]")
{
    const unordered_set<string> wordSet { "ala", "kota", "jarek", "lubi", "psy" };
    const vector<string> patternsOut { "not", "this", "dict" };

    // unique_ptr instead of raw new/delete: a failing REQUIRE aborts the test
    // case via an exception, which previously skipped the manual deletes and
    // leaked the indexes.
    const unique_ptr<SplitIndex> indexes[] = {
        make_unique<SplitIndex1Comp>(wordSet, hashType, 1.0f),
        make_unique<SplitIndex1CompTriple>(wordSet, hashType, 1.0f) };

    for (const auto &index : indexes)
    {
        index->construct();

        for (int nIter = 1; nIter <= maxNIter; ++nIter)
        {
            // Each dictionary word, searched alone, must match exactly itself.
            for (const string &word : wordSet)
            {
                REQUIRE(index->search({ word }, nIter) == SplitIndex::ResultSetType{ word });
            }

            // Patterns outside the dictionary must not match anything.
            for (const string &patternOut : patternsOut)
            {
                REQUIRE(index->search({ patternOut }, nIter).empty());
            }
        }
    }
}
|
||
TEST_CASE("is searching compression words for k = 1 for 1 error correct", "[split_index_1_comp_searching]") | ||
{ | ||
const unordered_set<string> wordSet { "ala", "kota", "jarek", "psa" }; | ||
vector<string> patternsIn; | ||
|
||
SplitIndex *indexes[] = { | ||
new SplitIndex1Comp(wordSet, hashType, 1.0f), | ||
new SplitIndex1CompTriple(wordSet, hashType, 1.0f) }; | ||
|
||
const int nIndexes = sizeof(indexes) / sizeof(indexes[0]); | ||
|
||
for (int iIndex = 0; iIndex < nIndexes; ++iIndex) | ||
{ | ||
indexes[iIndex]->construct(); | ||
|
||
for (const string &word : wordSet) | ||
{ | ||
for (size_t i = 0; i < word.size(); ++i) | ||
{ | ||
string curWord = word; | ||
curWord[i] = 'N'; | ||
|
||
patternsIn.push_back(move(curWord)); | ||
} | ||
} | ||
|
||
for (int nIter = 1; nIter <= maxNIter; ++nIter) | ||
{ | ||
REQUIRE(indexes[iIndex]->search(patternsIn, nIter) == SplitIndex::ResultSetType(wordSet.begin(), wordSet.end())); | ||
} | ||
|
||
delete indexes[iIndex]; | ||
} | ||
} | ||
|
||
TEST_CASE("is searching compression words for k = 1 for 1 error one-by-one correct", "[split_index_1_comp_searching]") | ||
{ | ||
const unordered_set<string> wordSet { "ala", "kota", "jarek", "psa" }; | ||
|
||
SplitIndex *indexes[] = { | ||
new SplitIndex1Comp(wordSet, hashType, 1.0f), | ||
new SplitIndex1CompTriple(wordSet, hashType, 1.0f) }; | ||
|
||
const int nIndexes = sizeof(indexes) / sizeof(indexes[0]); | ||
|
||
for (int iIndex = 0; iIndex < nIndexes; ++iIndex) | ||
{ | ||
indexes[iIndex]->construct(); | ||
|
||
for (const string &word : wordSet) | ||
{ | ||
for (size_t i = 0; i < word.size(); ++i) | ||
{ | ||
string curWord = word; | ||
curWord[i] = 'N'; | ||
|
||
for (int nIter = 1; nIter <= maxNIter; ++nIter) | ||
{ | ||
const SplitIndex::ResultSetType result = indexes[iIndex]->search({ curWord }, nIter); | ||
|
||
REQUIRE(result.size() == 1); | ||
REQUIRE(wordSet.find(*result.begin()) != wordSet.end()); | ||
} | ||
} | ||
} | ||
|
||
delete indexes[iIndex]; | ||
} | ||
} | ||
|
||
TEST_CASE("is searching compression words for k = 1 for various number of mismatches correct", "[split_index_1_comp_searching]")
{
    const unordered_set<string> wordSet { "ala", "kota", "jarek", "psa", "bardzo", "lubie", "owoce" };

    // unique_ptr instead of raw new/delete: a failing REQUIRE aborts the test
    // case via an exception, which previously skipped the manual deletes and
    // leaked the indexes.
    const unique_ptr<SplitIndex> indexes[] = {
        make_unique<SplitIndex1Comp>(wordSet, hashType, 1.0f),
        make_unique<SplitIndex1CompTriple>(wordSet, hashType, 1.0f) };

    for (const auto &index : indexes)
    {
        index->construct();

        for (int nIter = 1; nIter <= maxNIter; ++nIter)
        {
            // 1 mismatch matches; patterns with no close dictionary word do not.
            REQUIRE(index->search({ "osa", "ada" }, nIter) == SplitIndex::ResultSetType{ "ala", "psa" });
            REQUIRE(index->search({ "bbb", "ccc", "ddd" }, nIter).empty());

            // Single substitutions of "jarek" match; 2+ mismatches do not.
            REQUIRE(index->search({ "darek" }, nIter) == SplitIndex::ResultSetType{ "jarek" });
            REQUIRE(index->search({ "barek" }, nIter) == SplitIndex::ResultSetType{ "jarek" });
            REQUIRE(index->search({ "darek", "japek", "jacek", "barek" }, nIter) == SplitIndex::ResultSetType{ "jarek" });
            REQUIRE(index->search({ "czarek", "bapek", "kapek" }, nIter).empty());

            // Same checks for a longer word, "bardzo".
            REQUIRE(index->search({ "bardzo" }, nIter) == SplitIndex::ResultSetType{ "bardzo" });
            REQUIRE(index->search({ "barzzo" }, nIter) == SplitIndex::ResultSetType{ "bardzo" });
            REQUIRE(index->search({ "kardzo" }, nIter) == SplitIndex::ResultSetType{ "bardzo" });
            REQUIRE(index->search({ "karzzo", "bordza" }, nIter).empty());
        }
    }
}
|
||
TEST_CASE("is searching words equal to triple q-gram size correct", "[split_index_1_comp_searching]")
{
    const unordered_set<string> wordSet{ "ma", "ala", "tion" };

    // This assumes that there is at least one 4-gram used for encoding.
    SplitIndex1CompTriple index(wordSet, hashType, 1.0f);
    index.construct();

    // Table-driven variant: each pattern at position i must match exactly
    // the dictionary word at the same position.
    const string patterns[] = { "ma", "da", "ala", "ada", "tion", "twon" };
    const string expected[] = { "ma", "ma", "ala", "ala", "tion", "tion" };

    const size_t nPatterns = sizeof(patterns) / sizeof(patterns[0]);

    for (size_t i = 0; i < nPatterns; ++i)
    {
        REQUIRE(index.search({ patterns[i] }, 1) == SplitIndex::ResultSetType{ expected[i] });
    }
}
|
||
} // namespace split_index |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters