Skip to content

Commit

Permalink
feat: Add PrefixTree and RegisterHandler to support TDFA simulati…
Browse files Browse the repository at this point in the history
…on. (#56)

Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
  • Loading branch information
SharafMohamed and LinZhihao-723 authored Dec 6, 2024
1 parent 3f13224 commit 99b5b08
Show file tree
Hide file tree
Showing 7 changed files with 388 additions and 1 deletion.
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,15 @@ set(SOURCE_FILES
src/log_surgeon/SchemaParser.hpp
src/log_surgeon/Token.cpp
src/log_surgeon/Token.hpp
src/log_surgeon/finite_automata/PrefixTree.cpp
src/log_surgeon/finite_automata/PrefixTree.hpp
src/log_surgeon/finite_automata/RegexAST.hpp
src/log_surgeon/finite_automata/RegexDFA.hpp
src/log_surgeon/finite_automata/RegexDFA.tpp
src/log_surgeon/finite_automata/RegexNFA.hpp
src/log_surgeon/finite_automata/RegexNFAState.hpp
src/log_surgeon/finite_automata/RegexNFAStateType.hpp
src/log_surgeon/finite_automata/RegisterHandler.hpp
src/log_surgeon/finite_automata/Tag.hpp
src/log_surgeon/finite_automata/TaggedTransition.hpp
src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp
Expand Down
20 changes: 20 additions & 0 deletions src/log_surgeon/finite_automata/PrefixTree.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#include "PrefixTree.hpp"

#include <stdexcept>
#include <vector>

namespace log_surgeon::finite_automata {
// Collects the positions stored along the path from `node_id` towards the root. The first element
// is the position of `node_id` itself and the root's position is never included, so the result is
// empty when `node_id` is the root.
// Throws `std::out_of_range` if `node_id` does not refer to an existing node.
auto PrefixTree::get_reversed_positions(id_t const node_id) const -> std::vector<position_t> {
    if (node_id >= m_nodes.size()) {
        throw std::out_of_range("Prefix tree index out of range.");
    }

    std::vector<position_t> positions_towards_root;
    // Every non-root node has a parent ID, so `get_parent_id_unsafe` is only reached after the
    // loop condition has confirmed the current node is not the root.
    for (auto const* node{&m_nodes[node_id]}; false == node->is_root();
         node = &m_nodes[node->get_parent_id_unsafe()])
    {
        positions_towards_root.push_back(node->get_position());
    }
    return positions_towards_root;
}
} // namespace log_surgeon::finite_automata
91 changes: 91 additions & 0 deletions src/log_surgeon/finite_automata/PrefixTree.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#ifndef LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP
#define LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP

#include <cstddef>
#include <cstdint>
#include <optional>
#include <stdexcept>
#include <vector>

namespace log_surgeon::finite_automata {
/**
 * Represents a prefix tree to store register data during TDFA simulation. Each node in the tree
 * stores a single position in the lexed string. Each path from the root to a node corresponds to
 * a sequence of positions for an individual tag:
 * - Positive position node: Indicates the tag was matched at the position.
 * - Negative position node: Indicates the tag was unmatched. If a negative node is the entire path,
 *   it indicates the tag was never matched. If the negative node is along a path containing
 *   positive nodes, it functions as a placeholder. This can be useful for nested capture groups, to
 *   maintain a one-to-one mapping between the contained capture group and the enclosing capture
 *   group.
 *
 * Nodes are identified by their index in insertion order. The tree is constructed with a single
 * root node whose ID is `cRootId`; nodes are only ever appended.
 */
class PrefixTree {
public:
    using id_t = uint32_t;
    using position_t = int32_t;

    static constexpr id_t cRootId{0};

    /**
     * Constructs a tree containing only the root node. The root has no parent and is created with
     * a position of -1.
     */
    PrefixTree() : m_nodes{{std::nullopt, -1}} {}

    /**
     * @param parent_node_id Index of the inserted node's parent in the prefix tree.
     * @param position The position in the lexed string.
     * @return The index of the newly inserted node in the tree.
     * @throw std::out_of_range if the parent's index is out of range.
     */
    [[maybe_unused]] auto insert(id_t const parent_node_id, position_t const position) -> id_t {
        if (m_nodes.size() <= parent_node_id) {
            throw std::out_of_range("Predecessor index out of range.");
        }

        m_nodes.emplace_back(parent_node_id, position);
        // `size()` yields a `size_t`; convert to the ID type explicitly rather than relying on an
        // implicit narrowing conversion in the return statement.
        return static_cast<id_t>(m_nodes.size() - 1);
    }

    /**
     * Overwrites the position stored in an existing node. Any node may be updated, including the
     * root.
     * @param node_id The index of the node to update.
     * @param position The new position in the lexed string.
     * @throw std::out_of_range if the index is out of range.
     */
    auto set(id_t const node_id, position_t const position) -> void {
        m_nodes.at(node_id).set_position(position);
    }

    /**
     * @return The number of nodes in the tree, including the root.
     */
    [[nodiscard]] auto size() const -> size_t { return m_nodes.size(); }

    /**
     * @param node_id The index of the node.
     * @return A vector containing positions in order from the given index up to but not including
     * the root node.
     * @throw std::out_of_range if the index is out of range.
     */
    [[nodiscard]] auto get_reversed_positions(id_t node_id) const -> std::vector<position_t>;

private:
    /**
     * A single tree node: the ID of its parent (absent for the root) and the position it stores.
     */
    class Node {
    public:
        Node(std::optional<id_t> const parent_id, position_t const position)
                : m_parent_id{parent_id},
                  m_position{position} {}

        /**
         * @return Whether this node is the root, i.e., it has no parent ID.
         */
        [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); }

        /**
         * Gets the parent ID without checking if it's `std::nullopt`.
         * NOTE: This method should only be used if the caller has checked the node is not the root.
         * @return The ID of the parent node in the prefix tree.
         */
        [[nodiscard]] auto get_parent_id_unsafe() const -> id_t {
            // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
            return m_parent_id.value();
        }

        auto set_position(position_t const position) -> void { m_position = position; }

        [[nodiscard]] auto get_position() const -> position_t { return m_position; }

    private:
        std::optional<id_t> m_parent_id;
        position_t m_position;
    };

    // All nodes of the tree, indexed by node ID; element `cRootId` is the root.
    std::vector<Node> m_nodes;
};
} // namespace log_surgeon::finite_automata

#endif // LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP
52 changes: 52 additions & 0 deletions src/log_surgeon/finite_automata/RegisterHandler.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP
#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP

#include <cstddef>
#include <vector>

#include <log_surgeon/finite_automata/PrefixTree.hpp>

namespace log_surgeon::finite_automata {
/**
* The register handler maintains a prefix tree that is sufficient to represent all registers.
* The register handler also contains a vector of registers, and performs the set, copy, and append
* operations for these registers.
*
* NOTE: For efficiency, registers are not initialized when lexing a new string; instead, it is the
* DFA's responsibility to set the register values when needed.
*/
class RegisterHandler {
public:
auto add_register(
PrefixTree::id_t const prefix_tree_parent_node_id,
PrefixTree::position_t const position
) -> void {
auto const prefix_tree_node_id{m_prefix_tree.insert(prefix_tree_parent_node_id, position)};
m_registers.emplace_back(prefix_tree_node_id);
}

auto set_register(size_t const reg_id, PrefixTree::position_t const position) -> void {
m_prefix_tree.set(m_registers.at(reg_id), position);
}

auto copy_register(size_t const dest_reg_id, size_t const source_reg_id) -> void {
m_registers.at(dest_reg_id) = m_registers.at(source_reg_id);
}

auto append_position(size_t const reg_id, PrefixTree::position_t const position) -> void {
auto const node_id{m_registers.at(reg_id)};
m_registers.at(reg_id) = m_prefix_tree.insert(node_id, position);
}

[[nodiscard]] auto get_reversed_positions(size_t const reg_id
) const -> std::vector<PrefixTree::position_t> {
return m_prefix_tree.get_reversed_positions(m_registers.at(reg_id));
}

private:
PrefixTree m_prefix_tree;
std::vector<PrefixTree::id_t> m_registers;
};
} // namespace log_surgeon::finite_automata

#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP
5 changes: 4 additions & 1 deletion tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@ set(
SOURCES_LOG_SURGEON
../src/log_surgeon/FileReader.cpp
../src/log_surgeon/FileReader.hpp
../src/log_surgeon/finite_automata/PrefixTree.cpp
../src/log_surgeon/finite_automata/PrefixTree.hpp
../src/log_surgeon/finite_automata/RegexAST.hpp
../src/log_surgeon/finite_automata/RegexNFA.hpp
../src/log_surgeon/finite_automata/RegexNFAState.hpp
../src/log_surgeon/finite_automata/RegexNFAStateType.hpp
../src/log_surgeon/finite_automata/RegisterHandler.hpp
../src/log_surgeon/finite_automata/Tag.hpp
../src/log_surgeon/finite_automata/TaggedTransition.hpp
../src/log_surgeon/LALR1Parser.cpp
Expand All @@ -21,7 +24,7 @@ set(
../src/log_surgeon/Token.hpp
)

set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-tag.cpp)
set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-prefix-tree.cpp test-register-handler.cpp test-tag.cpp)

add_executable(unit-test ${SOURCES_LOG_SURGEON} ${SOURCES_TESTS})
target_link_libraries(unit-test PRIVATE Catch2::Catch2WithMain log_surgeon::log_surgeon)
Expand Down
120 changes: 120 additions & 0 deletions tests/test-prefix-tree.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#include <cstddef>
#include <limits>
#include <stdexcept>
#include <vector>

#include <catch2/catch_test_macros.hpp>

#include <log_surgeon/finite_automata/PrefixTree.hpp>

using log_surgeon::finite_automata::PrefixTree;
using id_t = PrefixTree::id_t;
using position_t = PrefixTree::position_t;

// Exercises `PrefixTree`: construction, insertion, position updates, and the `std::out_of_range`
// errors raised for invalid node IDs.
TEST_CASE("`PrefixTree` operations", "[PrefixTree]") {
    constexpr auto cRootId{PrefixTree::cRootId};
    constexpr position_t cInitialPos1{4};
    constexpr position_t cSetPos1{10};

    SECTION("Newly constructed tree works correctly") {
        PrefixTree const tree;

        // A newly constructed tree should return no positions as the root node is ignored
        REQUIRE(tree.get_reversed_positions(cRootId).empty());
    }

    SECTION("Inserting nodes into the prefix tree works correctly") {
        constexpr position_t cInitialPos2{7};
        constexpr position_t cInitialPos3{9};
        constexpr position_t cMaxPos{std::numeric_limits<position_t>::max()};
        constexpr position_t cNegativePos1{-1};
        constexpr position_t cNegativePos2{-100};
        // Expected node counts (root included). These are sizes, not positions, so they use the
        // same type as `PrefixTree::size()` to avoid a signed/unsigned comparison.
        constexpr size_t cTreeSize1{4};
        constexpr size_t cTreeSize2{8};

        PrefixTree tree;

        // Test basic insertions
        auto const node_id_1{tree.insert(cRootId, cInitialPos1)};
        auto const node_id_2{tree.insert(node_id_1, cInitialPos2)};
        auto const node_id_3{tree.insert(node_id_2, cInitialPos3)};
        REQUIRE(std::vector<position_t>{cInitialPos1} == tree.get_reversed_positions(node_id_1));
        REQUIRE(std::vector<position_t>{cInitialPos2, cInitialPos1}
                == tree.get_reversed_positions(node_id_2));
        REQUIRE(std::vector<position_t>{cInitialPos3, cInitialPos2, cInitialPos1}
                == tree.get_reversed_positions(node_id_3));
        REQUIRE(cTreeSize1 == tree.size());

        // Test insertion with large position values
        auto const node_id_4{tree.insert(cRootId, cMaxPos)};
        REQUIRE(cMaxPos == tree.get_reversed_positions(node_id_4)[0]);

        // Test insertion with negative position values
        auto const node_id_5{tree.insert(cRootId, cNegativePos1)};
        auto const node_id_6{tree.insert(node_id_5, cInitialPos1)};
        auto const node_id_7{tree.insert(node_id_6, cNegativePos2)};
        REQUIRE(std::vector<position_t>{cNegativePos1} == tree.get_reversed_positions(node_id_5));
        REQUIRE(std::vector<position_t>{cInitialPos1, cNegativePos1}
                == tree.get_reversed_positions(node_id_6));
        REQUIRE(std::vector<position_t>{cNegativePos2, cInitialPos1, cNegativePos1}
                == tree.get_reversed_positions(node_id_7));
        REQUIRE(cTreeSize2 == tree.size());
    }

    SECTION("Invalid index access throws correctly") {
        PrefixTree tree;
        // `tree.size()` is always one past the largest valid node ID
        REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range);

        tree.insert(cRootId, cInitialPos1);
        REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range);

        REQUIRE_THROWS_AS(
                tree.get_reversed_positions(std::numeric_limits<id_t>::max()),
                std::out_of_range
        );
    }

    SECTION("Set position for a valid index works correctly") {
        constexpr position_t cSetPos2{12};
        constexpr position_t cSetPos3{15};
        constexpr position_t cSetPos4{20};

        PrefixTree tree;
        // Test that you can set the root node for sanity, although this value is not used
        tree.set(cRootId, cSetPos1);

        // Test updates to different nodes
        auto const node_id_1{tree.insert(cRootId, cInitialPos1)};
        auto const node_id_2{tree.insert(node_id_1, cInitialPos1)};
        tree.set(node_id_1, cSetPos1);
        tree.set(node_id_2, cSetPos2);
        REQUIRE(std::vector<position_t>{cSetPos1} == tree.get_reversed_positions(node_id_1));
        REQUIRE(std::vector<position_t>{cSetPos2, cSetPos1}
                == tree.get_reversed_positions(node_id_2));

        // Test multiple updates to the same node
        tree.set(node_id_2, cSetPos3);
        tree.set(node_id_2, cSetPos4);
        REQUIRE(std::vector<position_t>{cSetPos4, cSetPos1}
                == tree.get_reversed_positions(node_id_2));

        // Test that updates don't affect unrelated paths
        auto const node_id_3{tree.insert(cRootId, cSetPos2)};
        tree.set(node_id_3, cSetPos3);
        REQUIRE(std::vector<position_t>{cSetPos1} == tree.get_reversed_positions(node_id_1));
        REQUIRE(std::vector<position_t>{cSetPos4, cSetPos1}
                == tree.get_reversed_positions(node_id_2));
    }

    SECTION("Set position for an invalid index throws correctly") {
        constexpr id_t cInvalidNodeId{100};

        PrefixTree tree;

        // Test setting position before any insertions
        REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos1), std::out_of_range);

        // Test setting position just beyond valid range
        auto const node_id_1{tree.insert(cRootId, cInitialPos1)};
        REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos1), std::out_of_range);
    }
}
Loading

0 comments on commit 99b5b08

Please sign in to comment.