diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 9672f6da..6b6cd51d 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,5 +1,11 @@ -# References - + # Description diff --git a/.github/workflows/pr-title-checks.yaml b/.github/workflows/pr-title-checks.yaml new file mode 100644 index 00000000..1d65f1e0 --- /dev/null +++ b/.github/workflows/pr-title-checks.yaml @@ -0,0 +1,25 @@ +name: "pr-title-checks" + +on: + pull_request_target: + types: ["edited", "opened", "reopened"] + branches: ["main"] + +permissions: {} + +concurrency: + group: "${{github.workflow}}-${{github.ref}}" + + # Cancel in-progress jobs for efficiency + cancel-in-progress: true + +jobs: + conventional-commits: + permissions: + # For amannn/action-semantic-pull-request + pull-requests: "read" + runs-on: "ubuntu-latest" + steps: + - uses: "amannn/action-semantic-pull-request@v5" + env: + GITHUB_TOKEN: "${{secrets.GITHUB_TOKEN}}" diff --git a/CMakeLists.txt b/CMakeLists.txt index d4802ca2..2627928e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,8 @@ set(SOURCE_FILES src/log_surgeon/SchemaParser.hpp src/log_surgeon/Token.cpp src/log_surgeon/Token.hpp + src/log_surgeon/finite_automata/PrefixTree.cpp + src/log_surgeon/finite_automata/PrefixTree.hpp src/log_surgeon/finite_automata/RegexAST.hpp src/log_surgeon/finite_automata/Dfa.hpp src/log_surgeon/finite_automata/DfaState.hpp @@ -101,7 +103,7 @@ set(SOURCE_FILES src/log_surgeon/finite_automata/Nfa.hpp src/log_surgeon/finite_automata/NfaState.hpp src/log_surgeon/finite_automata/NfaStateType.hpp - src/log_surgeon/finite_automata/Register.hpp + src/log_surgeon/finite_automata/RegisterHandler.hpp src/log_surgeon/finite_automata/Tag.hpp src/log_surgeon/finite_automata/TaggedTransition.hpp src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 04c2f50d..3da31cd5 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -410,11 +410,12 @@ auto Lexer::epsilon_closure(NfaStateType const* stat { stack.push(positive_tagged_start_transition.get_dest_state()); } - for (auto const& positive_tagged_end_transition : - current_state->get_positive_tagged_start_transitions()) - { - stack.push(positive_tagged_end_transition.get_dest_state()); + auto const& optional_positive_tagged_end_transition + = current_state->get_positive_tagged_end_transition(); + if (optional_positive_tagged_end_transition.has_value()) { + stack.push(optional_positive_tagged_end_transition.value().get_dest_state()); } + auto const& optional_negative_tagged_transition = current_state->get_negative_tagged_transition(); if (optional_negative_tagged_transition.has_value()) { diff --git a/src/log_surgeon/finite_automata/Nfa.hpp b/src/log_surgeon/finite_automata/Nfa.hpp index a50389c3..caf58ce4 100644 --- a/src/log_surgeon/finite_automata/Nfa.hpp +++ b/src/log_surgeon/finite_automata/Nfa.hpp @@ -31,14 +31,14 @@ class Nfa { [[nodiscard]] auto new_state() -> NfaStateType*; /** - * Creates a unique_ptr for an NFA state with a positive tagged transition and adds it to + * Creates a unique_ptr for an NFA state with a positive tagged end transition and adds it to * `m_states`. * @param tag * @param dest_state - * @return NfaStateType* + * @return A new state with a positive tagged end transition to `dest_state`. */ - [[nodiscard]] auto new_state_with_positive_tagged_transition( - Tag* tag, + [[nodiscard]] auto new_state_with_positive_tagged_end_transition( + Tag const* tag, NfaStateType const* dest_state ) -> NfaStateType*; @@ -50,10 +50,23 @@ class Nfa { * @return NfaStateType* */ [[nodiscard]] auto new_state_with_negative_tagged_transition( - std::vector tags, + std::vector tags, NfaStateType const* dest_state ) -> NfaStateType*; + /** + * Creates the start and end states for a capture group. + * @param tag The tag associated with the capture group. + * @param dest_state + * @return A pair of states: + * - A new state with a positive tagged start transition from `m_root`. + * - A new state with a positive tagged end transition to `dest_state`. + */ + [[nodiscard]] auto new_start_and_end_states_with_positive_tagged_transitions( + Tag const* tag, + NfaStateType const* dest_state + ) -> std::pair; + /** * @return A vector representing the traversal order of the NFA states using breadth-first * search (BFS). @@ -97,8 +110,8 @@ auto Nfa::new_state() -> NfaStateType* { } template -auto Nfa::new_state_with_positive_tagged_transition( - Tag* tag, +auto Nfa::new_state_with_positive_tagged_end_transition( + Tag const* tag, NfaStateType const* dest_state ) -> NfaStateType* { m_states.emplace_back(std::make_unique(tag, dest_state)); @@ -107,13 +120,25 @@ auto Nfa::new_state_with_positive_tagged_transition( template auto Nfa::new_state_with_negative_tagged_transition( - std::vector tags, + std::vector tags, NfaStateType const* dest_state ) -> NfaStateType* { m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); return m_states.back().get(); } +template +auto Nfa::new_start_and_end_states_with_positive_tagged_transitions( + Tag const* tag, + NfaStateType const* dest_state +) -> std::pair { + auto* start_state = new_state(); + m_root->add_positive_tagged_start_transition(tag, start_state); + + auto* end_state = new_state_with_positive_tagged_end_transition(tag, dest_state); + return {start_state, end_state}; +} + template auto Nfa::get_bfs_traversal_order() const -> std::vector { std::queue state_queue; @@ -148,11 +173,14 @@ auto Nfa::get_bfs_traversal_order() const -> std::vectorget_positive_tagged_end_transitions()) - { - add_to_queue_and_visited(positive_tagged_end_transition.get_dest_state()); + + auto const& optional_positive_tagged_end_transition + = current_state->get_positive_tagged_end_transition(); + if (optional_positive_tagged_end_transition.has_value()) { + add_to_queue_and_visited(optional_positive_tagged_end_transition.value().get_dest_state( + )); } + auto const& optional_negative_tagged_transition = current_state->get_negative_tagged_transition(); if (optional_negative_tagged_transition.has_value()) { diff --git a/src/log_surgeon/finite_automata/NfaState.hpp b/src/log_surgeon/finite_automata/NfaState.hpp index 5dabefc3..88f2509c 100644 --- a/src/log_surgeon/finite_automata/NfaState.hpp +++ b/src/log_surgeon/finite_automata/NfaState.hpp @@ -30,10 +30,10 @@ class NfaState { NfaState() = default; - NfaState(Tag* tag, NfaState const* dest_state) - : m_positive_tagged_end_transitions{{tag, dest_state}} {} + NfaState(Tag const* tag, NfaState const* dest_state) + : m_positive_tagged_end_transition{PositiveTaggedTransition{tag, dest_state}} {} - NfaState(std::vector tags, NfaState const* dest_state) + NfaState(std::vector tags, NfaState const* dest_state) : m_negative_tagged_transition{NegativeTaggedTransition{std::move(tags), dest_state}} {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } @@ -48,7 +48,8 @@ class NfaState { return m_matching_variable_id; } - auto add_positive_tagged_start_transition(Tag* tag, NfaState* dest_state) -> void { + auto + add_positive_tagged_start_transition(Tag const* tag, NfaState const* dest_state) -> void { m_positive_tagged_start_transitions.emplace_back(tag, dest_state); } @@ -57,9 +58,9 @@ class NfaState { return m_positive_tagged_start_transitions; } - [[nodiscard]] auto get_positive_tagged_end_transitions( - ) const -> std::vector> const& { - return m_positive_tagged_end_transitions; + [[nodiscard]] auto get_positive_tagged_end_transition( + ) const -> std::optional> const& { + return m_positive_tagged_end_transition; } [[nodiscard]] auto get_negative_tagged_transition( @@ -109,7 +110,7 @@ class NfaState { bool m_accepting{false}; uint32_t m_matching_variable_id{0}; std::vector> m_positive_tagged_start_transitions; - std::vector> m_positive_tagged_end_transitions; + std::optional> m_positive_tagged_end_transition; std::optional> m_negative_tagged_transition; std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; @@ -185,28 +186,27 @@ auto NfaState::serialize( epsilon_transitions.emplace_back(std::to_string(state_ids.at(dest_state))); } - std::vector positive_tagged_start_transition_strings; + std::vector serialized_positive_tagged_start_transitions; for (auto const& positive_tagged_start_transition : m_positive_tagged_start_transitions) { - auto const optional_serialized_positive_transition + auto const optional_serialized_positive_start_transition = positive_tagged_start_transition.serialize(state_ids); - if (false == optional_serialized_positive_transition.has_value()) { + if (false == optional_serialized_positive_start_transition.has_value()) { return std::nullopt; } - positive_tagged_start_transition_strings.emplace_back( - optional_serialized_positive_transition.value() + serialized_positive_tagged_start_transitions.emplace_back( + optional_serialized_positive_start_transition.value() ); } - std::vector positive_tagged_end_transition_strings; - for (auto const& positive_tagged_end_transition : m_positive_tagged_end_transitions) { - auto const optional_serialized_positive_transition - = positive_tagged_end_transition.serialize(state_ids); - if (false == optional_serialized_positive_transition.has_value()) { + std::string serialized_positive_tagged_end_transition; + if (m_positive_tagged_end_transition.has_value()) { + auto const optional_serialized_positive_end_transition + = m_positive_tagged_end_transition.value().serialize(state_ids); + if (false == optional_serialized_positive_end_transition.has_value()) { return std::nullopt; } - positive_tagged_end_transition_strings.emplace_back( - optional_serialized_positive_transition.value() - ); + serialized_positive_tagged_end_transition + = optional_serialized_positive_end_transition.value(); } std::string negative_tagged_transition_string; @@ -230,8 +230,8 @@ auto NfaState::serialize( accepting_tag_string, fmt::join(byte_transitions, ","), fmt::join(epsilon_transitions, ","), - fmt::join(positive_tagged_start_transition_strings, ","), - fmt::join(positive_tagged_end_transition_strings, ","), + fmt::join(serialized_positive_tagged_start_transitions, ","), + serialized_positive_tagged_end_transition, negative_tagged_transition_string ); } diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp new file mode 100644 index 00000000..bf0705c8 --- /dev/null +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -0,0 +1,20 @@ +#include "PrefixTree.hpp" + +#include +#include + +namespace log_surgeon::finite_automata { +auto PrefixTree::get_reversed_positions(id_t const node_id) const -> std::vector { + if (m_nodes.size() <= node_id) { + throw std::out_of_range("Prefix tree index out of range."); + } + + std::vector reversed_positions; + auto current_node{m_nodes[node_id]}; + while (false == current_node.is_root()) { + reversed_positions.push_back(current_node.get_position()); + current_node = m_nodes[current_node.get_parent_node_id().value()]; + } + return reversed_positions; +} +} // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp new file mode 100644 index 00000000..815c7dda --- /dev/null +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -0,0 +1,85 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP + +#include +#include +#include +#include + +namespace log_surgeon::finite_automata { +/** + * Represents a prefix tree to store register data during TDFA simulation. Each node in the tree + * stores a single position in the lexed string. Each path from the root to an index corresponds to + * a sequence of positions for an individual tag: + * - Positive position node: Indicates the tag was matched at the position. + * - Negative position node: Indicates the tag was unmatched. If a negative node is the entire path, + * it indicates the tag was never matched. If the negative tag is along a path containing positive + * nodes, it functions as a placeholder. This can be useful for nested capture groups, to maintain a + * one-to-one mapping between the contained capture group and the enclosing capture group. + */ +class PrefixTree { +public: + using id_t = uint32_t; + using position_t = int32_t; + + static constexpr id_t cRootId{0}; + + PrefixTree() : m_nodes{{std::nullopt, -1}} {} + + /** + * @param parent_node_id Index of the inserted node's parent in the prefix tree. + * @param position The position in the lexed string. + * @return The index of the newly inserted node in the tree. + * @throw std::out_of_range if the parent's index is out of range. + */ + [[maybe_unused]] auto insert(id_t const parent_node_id, position_t const position) -> id_t { + if (m_nodes.size() <= parent_node_id) { + throw std::out_of_range("Predecessor index out of range."); + } + + m_nodes.emplace_back(parent_node_id, position); + return m_nodes.size() - 1; + } + + auto set(id_t const node_id, position_t const position) -> void { + m_nodes.at(node_id).set_position(position); + } + + [[nodiscard]] auto size() const -> size_t { return m_nodes.size(); } + + /** + * @param node_id The index of the node. + * @return A vector containing positions in order from the given index up to but not including + * the root node. + * @throw std::out_of_range if the index is out of range. + */ + [[nodiscard]] auto get_reversed_positions(id_t node_id) const -> std::vector; + +private: + class Node { + public: + Node(std::optional const parent_node_id, position_t const position) + : m_parent_node_id{parent_node_id}, + m_position{position} {} + + [[nodiscard]] auto is_root() const -> bool { return false == m_parent_node_id.has_value(); } + + [[nodiscard]] auto get_parent_node_id() const -> std::optional { + return m_parent_node_id; + } + + auto set_position(position_t const position) -> void { m_position = position; } + + [[nodiscard]] auto get_position() const -> position_t { return m_position; } + + private: + std::optional m_parent_node_id; + position_t m_position; + }; + + std::vector m_nodes; +}; + +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index f1409715..e97e29a5 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -82,23 +83,23 @@ class RegexAST { */ [[nodiscard]] virtual auto serialize() const -> std::u32string = 0; - [[nodiscard]] auto get_subtree_positive_tags() const -> std::vector const& { + [[nodiscard]] auto get_subtree_positive_tags() const -> std::vector const& { return m_subtree_positive_tags; } - auto set_subtree_positive_tags(std::vector subtree_positive_tags) -> void { + auto set_subtree_positive_tags(std::vector subtree_positive_tags) -> void { m_subtree_positive_tags = std::move(subtree_positive_tags); } - auto add_subtree_positive_tags(std::vector subtree_positive_tags) -> void { + auto add_subtree_positive_tags(std::vector const& subtree_positive_tags) -> void { m_subtree_positive_tags.insert( m_subtree_positive_tags.end(), - std::make_move_iterator(subtree_positive_tags.begin()), - std::make_move_iterator(subtree_positive_tags.end()) + subtree_positive_tags.cbegin(), + subtree_positive_tags.cend() ); } - auto set_negative_tags(std::vector negative_tags) -> void { + auto set_negative_tags(std::vector negative_tags) -> void { m_negative_tags = std::move(negative_tags); } @@ -107,8 +108,8 @@ class RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa_with_negative_tags(Nfa* nfa, NfaStateType* end_state) const - -> void { + auto + add_to_nfa_with_negative_tags(Nfa* nfa, NfaStateType* end_state) const -> void { // Handle negative tags as: // root --(regex)--> state_with_negative_tagged_transition --(negative tags)--> end_state if (false == m_negative_tags.empty()) { @@ -123,8 +124,8 @@ class RegexAST { protected: RegexAST(RegexAST const& rhs) = default; auto operator=(RegexAST const& rhs) -> RegexAST& = default; - RegexAST(RegexAST&& rhs) noexcept = default; - auto operator=(RegexAST&& rhs) noexcept -> RegexAST& = default; + RegexAST(RegexAST&& rhs) noexcept = delete; + auto operator=(RegexAST&& rhs) noexcept -> RegexAST& = delete; [[nodiscard]] auto serialize_negative_tags() const -> std::u32string { if (m_negative_tags.empty()) { @@ -145,8 +146,8 @@ class RegexAST { } private: - std::vector m_subtree_positive_tags; - std::vector m_negative_tags; + std::vector m_subtree_positive_tags; + std::vector m_negative_tags; }; /** @@ -438,10 +439,6 @@ class RegexASTOr : public RegexAST { m_left(std::unique_ptr>(rhs.m_left->clone())), m_right(std::unique_ptr>(rhs.m_right->clone())) {} - auto operator=(RegexASTOr const& rhs) -> RegexASTOr& = default; - RegexASTOr(RegexASTOr&& rhs) noexcept = default; - auto operator=(RegexASTOr&& rhs) noexcept -> RegexASTOr& = default; - /** * Used for cloning a unique_pointer of type RegexASTOr * @return RegexASTOr* @@ -505,10 +502,6 @@ class RegexASTCat : public RegexAST { m_left(std::unique_ptr>(rhs.m_left->clone())), m_right(std::unique_ptr>(rhs.m_right->clone())) {} - auto operator=(RegexASTCat const& rhs) -> RegexASTCat& = default; - RegexASTCat(RegexASTCat&& rhs) noexcept = default; - auto operator=(RegexASTCat&& rhs) noexcept -> RegexASTCat& = default; - /** * Used for cloning a unique_pointer of type RegexASTCat * @return RegexASTCat* @@ -574,10 +567,6 @@ class RegexASTMultiplication : public RegexAST { m_min(rhs.m_min), m_max(rhs.m_max) {} - auto operator=(RegexASTMultiplication const& rhs) -> RegexASTMultiplication& = default; - RegexASTMultiplication(RegexASTMultiplication&& rhs) noexcept = default; - auto operator=(RegexASTMultiplication&& rhs) noexcept -> RegexASTMultiplication& = default; - /** * Used for cloning a unique_pointer of type RegexASTMultiplication * @return RegexASTMultiplication* @@ -632,17 +621,34 @@ class RegexASTMultiplication : public RegexAST { uint32_t m_max; }; +/** + * Represents a capture group AST node. + * NOTE: + * - `m_tag` is always expected to be non-null. + * - `m_group_regex_ast` is always expected to be non-null. + * @tparam NfaStateType Specifies the type of transition (bytes or UTF-8 characters). + */ template class RegexASTCapture : public RegexAST { public: ~RegexASTCapture() override = default; + /** + * @param group_regex_ast + * @param tag + * @throw std::invalid_argument if `group_regex_ast` or `tag` are `nullptr`. + */ RegexASTCapture( std::unique_ptr> group_regex_ast, std::unique_ptr tag ) - : m_group_regex_ast{std::move(group_regex_ast)}, - m_tag{std::move(tag)} { + : m_group_regex_ast{( + nullptr == group_regex_ast + ? throw std::invalid_argument("Group regex AST cannot be null") + : std::move(group_regex_ast) + )}, + m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") + : std::move(tag)} { RegexAST::set_subtree_positive_tags( m_group_regex_ast->get_subtree_positive_tags() ); @@ -654,14 +660,10 @@ class RegexASTCapture : public RegexAST { m_group_regex_ast{ std::unique_ptr>(rhs.m_group_regex_ast->clone()) }, - m_tag{rhs.m_tag ? std::make_unique(*rhs.m_tag) : nullptr} { + m_tag{std::make_unique(*rhs.m_tag)} { RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); } - auto operator=(RegexASTCapture const& rhs) -> RegexASTCapture& = default; - RegexASTCapture(RegexASTCapture&& rhs) noexcept = default; - auto operator=(RegexASTCapture&& rhs) noexcept -> RegexASTCapture& = default; - /** * Used for cloning a `unique_pointer` of type `RegexASTCapture`. * @return RegexASTCapture* @@ -691,15 +693,15 @@ class RegexASTCapture : public RegexAST { /** * Adds the needed `Nfa::states` to the passed in nfa to handle a - * `RegexASTCapture` before transitioning to an accepting `end_state`. + * `RegexASTCapture` before transitioning to a `dest_state`. * @param nfa - * @param end_state + * @param dest_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, NfaStateType* dest_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto get_group_name() const -> std::string const& { return m_tag->get_name(); } + [[nodiscard]] auto get_group_name() const -> std::string_view { return m_tag->get_name(); } [[nodiscard]] auto get_group_regex_ast( ) const -> std::unique_ptr> const& { @@ -779,8 +781,7 @@ RegexASTOr::RegexASTOr( } template -void RegexASTOr::add_to_nfa(Nfa* nfa, NfaStateType* end_state) - const { +void RegexASTOr::add_to_nfa(Nfa* nfa, NfaStateType* end_state) const { m_left->add_to_nfa_with_negative_tags(nfa, end_state); m_right->add_to_nfa_with_negative_tags(nfa, end_state); } @@ -807,8 +808,7 @@ RegexASTCat::RegexASTCat( } template -void RegexASTCat::add_to_nfa(Nfa* nfa, NfaStateType* end_state) - const { +void RegexASTCat::add_to_nfa(Nfa* nfa, NfaStateType* end_state) const { NfaStateType* saved_root = nfa->get_root(); NfaStateType* intermediate_state = nfa->new_state(); m_left->add_to_nfa_with_negative_tags(nfa, intermediate_state); @@ -890,26 +890,59 @@ template } template -void RegexASTCapture::add_to_nfa(Nfa* nfa, NfaStateType* end_state) - const { - NfaStateType* root = nfa->get_root(); - auto* capture_group_start_state = nfa->new_state(); - root->add_positive_tagged_start_transition(m_tag.get(), capture_group_start_state); - - auto* state_with_positive_tagged_transition - = nfa->new_state_with_positive_tagged_transition(m_tag.get(), end_state); - nfa->set_root(capture_group_start_state); - m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, state_with_positive_tagged_transition); - - nfa->set_root(root); +auto RegexASTCapture::add_to_nfa( + Nfa* nfa, + NfaStateType* dest_state +) const -> void { + // TODO: move this into a documentation file in the future, and reference it here. + // The NFA constructed for a capture group follows the structure below, with tagged transitions + // explicitly labeled for clarity: + // +---------------------+ + // | `m_root` | + // +---------------------+ + // | `m_tag` start + // | (positive tagged start transition) + // v + // +---------------------+ + // |`capture_start_state`| + // +---------------------+ + // | + // | (epsilon transition) + // v + // +---------------------+ + // | `m_group_regex_ast` | + // | (nested NFA) | + // +---------------------+ + // | `m_negative_tags` + // | (negative tagged transition) + // v + // +---------------------+ + // | `capture_end_state` | + // +---------------------+ + // | `m_tag` end + // | (positive tagged end transition) + // v + // +---------------------+ + // | `dest_state` | + // +---------------------+ + auto [capture_start_state, capture_end_state] + = nfa->new_start_and_end_states_with_positive_tagged_transitions( + m_tag.get(), + dest_state + ); + + auto* initial_root = nfa->get_root(); + nfa->set_root(capture_start_state); + m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, capture_end_state); + nfa->set_root(initial_root); } template [[nodiscard]] auto RegexASTCapture::serialize() const -> std::u32string { - auto const tag_name_u32 = std::u32string(m_tag->get_name().begin(), m_tag->get_name().end()); + auto const tag_name_u32 = std::u32string(m_tag->get_name().cbegin(), m_tag->get_name().cend()); return fmt::format( U"({})<{}>{}", - nullptr != m_group_regex_ast ? m_group_regex_ast->serialize() : U"null", + m_group_regex_ast->serialize(), tag_name_u32, RegexAST::serialize_negative_tags() ); diff --git a/src/log_surgeon/finite_automata/Register.hpp b/src/log_surgeon/finite_automata/Register.hpp deleted file mode 100644 index d0be4f15..00000000 --- a/src/log_surgeon/finite_automata/Register.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER -#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER - -#include - -#include - -namespace log_surgeon::finite_automata { -class Register { -public: - explicit Register(Tag* tag) : m_tag{tag} {} - - auto add_pos(uint32_t const pos) -> void { positions.push_back(pos); } - - auto update_last_position(uint32_t const pos) -> void { positions.back() = pos; } - - auto negate_last_position() -> void { positions.pop_back(); } - - auto negate_all_positions() -> void { positions.clear(); } - - [[nodiscard]] auto get_tag() const -> Tag* { return m_tag; } - - [[nodiscard]] auto get_last_position() const -> uint32_t { return positions.back(); } - - [[nodiscard]] auto get_all_positions() const -> std::vector const& { - return positions; - } - -private: - Tag* m_tag; - std::vector positions; -}; -} // namespace log_surgeon::finite_automata - -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp new file mode 100644 index 00000000..86a06f21 --- /dev/null +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -0,0 +1,52 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP + +#include +#include + +#include + +namespace log_surgeon::finite_automata { +/** + * The register handler maintains a prefix tree that is sufficient to represent all registers. + * The register handler also contains a vector of registers, and performs the set, copy, and append + * operations for these registers. + * + * NOTE: For efficiency, registers are not initialized when lexing a new string; instead, it is the + * responsibility of the DFA to set the register values when needed. + */ +class RegisterHandler { +public: + auto add_register( + PrefixTree::id_t const prefix_tree_parent_node_id, + PrefixTree::position_t const position + ) -> void { + auto const prefix_tree_node_id{m_prefix_tree.insert(prefix_tree_parent_node_id, position)}; + m_registers.emplace_back(prefix_tree_node_id); + } + + auto set_register(size_t const reg_id, PrefixTree::position_t const position) -> void { + m_prefix_tree.set(m_registers.at(reg_id), position); + } + + auto copy_register(size_t const dest_reg_id, size_t const source_reg_id) -> void { + m_registers.at(dest_reg_id) = m_registers.at(source_reg_id); + } + + auto append_position(size_t const reg_id, PrefixTree::position_t const position) -> void { + auto const node_id{m_registers.at(reg_id)}; + m_registers.at(reg_id) = m_prefix_tree.insert(node_id, position); + } + + [[nodiscard]] auto get_reversed_positions(size_t const reg_id + ) const -> std::vector { + return m_prefix_tree.get_reversed_positions(m_registers.at(reg_id)); + } + +private: + PrefixTree m_prefix_tree; + std::vector m_registers; +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp index e37fe33f..3a3b4d7f 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -1,38 +1,19 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_TAG #define LOG_SURGEON_FINITE_AUTOMATA_TAG -#include #include +#include #include -#include namespace log_surgeon::finite_automata { -/** - * This class represents a tag that is associated with matches of a capture group. If - * `m_start_positions` is empty, it indicates that the capture group was unmatched. - * - * Since capture group regex can be contained within repetition regex, - * (e.g., "((user_id=(?\d+),)+"), `m_start_positions` and `m_end_positions` are vectors that - * track the locations of each occurrence of the capture group. - */ class Tag { public: explicit Tag(std::string name) : m_name{std::move(name)} {} - auto set_start_positions(std::vector start_positions) -> void { - m_start_positions = std::move(start_positions); - } - - auto set_end_positions(std::vector end_positions) -> void { - m_end_positions = std::move(end_positions); - } - - [[nodiscard]] auto get_name() const -> std::string const& { return m_name; } + [[nodiscard]] auto get_name() const -> std::string_view { return m_name; } private: - std::string const m_name; - std::vector m_start_positions; - std::vector m_end_positions; + std::string m_name; }; } // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 28e7937b..4da9b5fa 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -11,23 +12,25 @@ #include namespace log_surgeon::finite_automata { +/** + * Represents an NFA transition indicating that a capture group has been matched. + * NOTE: `m_tag` is always expected to be non-null. + * @tparam NfaStateType Specifies the type of transition (bytes or UTF-8 characters). + */ template class PositiveTaggedTransition { public: - PositiveTaggedTransition(Tag* tag, NfaStateType const* dest_state) - : m_tag{tag}, + /** + * @param tag + * @param dest_state + * @throw std::invalid_argument if `tag` is `nullptr`. + */ + PositiveTaggedTransition(Tag const* tag, NfaStateType const* dest_state) + : m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") : tag}, m_dest_state{dest_state} {} [[nodiscard]] auto get_dest_state() const -> NfaStateType const* { return m_dest_state; } - auto set_tag_start_positions(std::vector start_positions) const -> void { - m_tag->set_start_positions(std::move(start_positions)); - } - - auto set_tag_end_positions(std::vector end_positions) const -> void { - m_tag->set_end_positions(std::move(end_positions)); - } - /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the positive tagged transition on success. @@ -36,22 +39,37 @@ class PositiveTaggedTransition { [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional { auto const state_id_it = state_ids.find(m_dest_state); - if (state_id_it == state_ids.end() || nullptr == m_tag) { + if (state_id_it == state_ids.end()) { return std::nullopt; } return fmt::format("{}[{}]", state_id_it->second, m_tag->get_name()); } private: - Tag* m_tag; + Tag const* m_tag; NfaStateType const* m_dest_state; }; +/** + * Represents an NFA transition indicating that a capture group has been unmatched. + * NOTE: All tags in `m_tags` are always expected to be non-null. + * @tparam NfaStateType Specifies the type of transition (bytes or UTF-8 characters). + */ template class NegativeTaggedTransition { public: - NegativeTaggedTransition(std::vector tags, NfaStateType const* dest_state) - : m_tags{std::move(tags)}, + /** + * @param tags + * @param dest_state + * @throw std::invalid_argument if any elements in `tags` is `nullptr`. + */ + NegativeTaggedTransition(std::vector tags, NfaStateType const* dest_state) + : m_tags{[&tags] { + if (std::ranges::any_of(tags, [](Tag const* tag) { return nullptr == tag; })) { + throw std::invalid_argument("Tags cannot contain null elements"); + } + return std::move(tags); + }()}, m_dest_state{dest_state} {} [[nodiscard]] auto get_dest_state() const -> NfaStateType const* { return m_dest_state; } @@ -68,16 +86,13 @@ class NegativeTaggedTransition { return std::nullopt; } - if (std::ranges::any_of(m_tags, [](Tag const* tag) { return tag == nullptr; })) { - return std::nullopt; - } auto const tag_names = m_tags | std::ranges::views::transform(&Tag::get_name); return fmt::format("{}[{}]", state_id_it->second, fmt::join(tag_names, ",")); } private: - std::vector const m_tags; + std::vector m_tags; NfaStateType const* m_dest_state; }; } // namespace log_surgeon::finite_automata diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8a1733e2..1e4a8363 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,11 +2,13 @@ set( SOURCES_LOG_SURGEON ../src/log_surgeon/FileReader.cpp ../src/log_surgeon/FileReader.hpp + ../src/log_surgeon/finite_automata/PrefixTree.cpp + ../src/log_surgeon/finite_automata/PrefixTree.hpp ../src/log_surgeon/finite_automata/RegexAST.hpp ../src/log_surgeon/finite_automata/Nfa.hpp ../src/log_surgeon/finite_automata/NfaState.hpp ../src/log_surgeon/finite_automata/NfaStateType.hpp - ../src/log_surgeon/finite_automata/Register.hpp + ../src/log_surgeon/finite_automata/RegisterHandler.hpp ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp ../src/log_surgeon/LALR1Parser.cpp @@ -22,7 +24,7 @@ set( ../src/log_surgeon/Token.hpp ) -set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-tag.cpp) +set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-prefix-tree.cpp test-register-handler.cpp test-tag.cpp) add_executable(unit-test ${SOURCES_LOG_SURGEON} ${SOURCES_TESTS}) target_link_libraries(unit-test PRIVATE Catch2::Catch2WithMain log_surgeon::log_surgeon) diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 77c33ef8..2b028920 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -1,7 +1,5 @@ #include -#include #include -#include #include #include #include diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp new file mode 100644 index 00000000..47262bdd --- /dev/null +++ b/tests/test-prefix-tree.cpp @@ -0,0 +1,117 @@ +#include +#include +#include + +#include + +#include + +using log_surgeon::finite_automata::PrefixTree; +using id_t = PrefixTree::id_t; +using position_t = PrefixTree::position_t; + +constexpr auto cRootId{PrefixTree::cRootId}; +constexpr id_t cInvaidNodeId{100}; +constexpr position_t cInsertPos1{4}; +constexpr position_t cInsertPos2{7}; +constexpr position_t cInsertPos3{9}; +constexpr position_t cMaxPos{std::numeric_limits::max()}; +constexpr position_t cNegativePos1{-1}; +constexpr position_t cNegativePos2{-100}; +constexpr position_t cSetPos1{10}; +constexpr position_t cSetPos2{12}; +constexpr position_t cSetPos3{15}; +constexpr position_t cSetPos4{20}; +constexpr position_t cTreeSize1{4}; +constexpr position_t cTreeSize2{8}; + +TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { + SECTION("Newly constructed tree works correctly") { + PrefixTree const tree; + + // A newly constructed tree should return no positions as the root node is ignored + REQUIRE(tree.get_reversed_positions(cRootId).empty()); + } + + SECTION("Inserting nodes into the prefix tree works correctly") { + PrefixTree tree; + + // Test basic insertions + auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInsertPos2)}; + auto const node_id_3{tree.insert(node_id_2, cInsertPos3)}; + REQUIRE(std::vector{cInsertPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cInsertPos2, cInsertPos1} + == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{cInsertPos3, cInsertPos2, cInsertPos1} + == tree.get_reversed_positions(node_id_3)); + REQUIRE(cTreeSize1 == tree.size()); + + // Test insertion with large position values + auto const node_id_4{tree.insert(cRootId, cMaxPos)}; + REQUIRE(cMaxPos == tree.get_reversed_positions(node_id_4)[0]); + + // Test insertion with negative position values + auto const node_id_5{tree.insert(cRootId, cNegativePos1)}; + auto const node_id_6{tree.insert(node_id_5, cInsertPos1)}; + auto const node_id_7{tree.insert(node_id_6, cNegativePos2)}; + REQUIRE(std::vector{cNegativePos1} == tree.get_reversed_positions(node_id_5)); + REQUIRE(std::vector{cInsertPos1, cNegativePos1} + == tree.get_reversed_positions(node_id_6)); + REQUIRE(std::vector{cNegativePos2, cInsertPos1, cNegativePos1} + == tree.get_reversed_positions(node_id_7)); + REQUIRE(cTreeSize2 == tree.size()); + } + + SECTION("Invalid index access throws correctly") { + PrefixTree tree; + REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); + + tree.insert(cRootId, cInsertPos1); + REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); + + REQUIRE_THROWS_AS( + tree.get_reversed_positions(std::numeric_limits::max()), + std::out_of_range + ); + } + + SECTION("Set position for a valid index works correctly") { + PrefixTree tree; + // Test that you can set the root node for sanity, although this value is not used + tree.set(cRootId, cSetPos1); + + // Test updates to different nodes + auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInsertPos1)}; + tree.set(node_id_1, cSetPos1); + tree.set(node_id_2, cSetPos2); + REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cSetPos2, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + + // Test multiple updates to the same node + tree.set(node_id_2, cSetPos3); + tree.set(node_id_2, cSetPos4); + REQUIRE(std::vector{cSetPos4, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + + // Test that updates don't affect unrelated paths + auto const node_id_3{tree.insert(cRootId, cSetPos2)}; + tree.set(node_id_3, cSetPos3); + REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cSetPos4, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + } + + SECTION("Set position for an invalid index throws correctly") { + PrefixTree tree; + + // Test setting position before any insertions + REQUIRE_THROWS_AS(tree.set(cInvaidNodeId, cSetPos4), std::out_of_range); + + // Test setting position just beyond valid range + auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; + REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos4), std::out_of_range); + } +} diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp new file mode 100644 index 00000000..9cec3ff5 --- /dev/null +++ b/tests/test-register-handler.cpp @@ -0,0 +1,85 @@ +#include +#include +#include + +#include + +#include +#include + +using log_surgeon::finite_automata::RegisterHandler; +using position_t = log_surgeon::finite_automata::PrefixTree::position_t; + +constexpr position_t cInitialPos{0}; +constexpr position_t cNegativePos1{-1}; +constexpr position_t cNegativePos2{-100}; +constexpr position_t cSetPos1{5}; +constexpr position_t cSetPos2{10}; +constexpr position_t cSetPos3{15}; +constexpr size_t cNumRegisters{5}; +constexpr size_t cRegId1{0}; +constexpr size_t cRegId2{1}; +constexpr size_t cRegId3{2}; +constexpr size_t cInvalidRegId{10}; + +namespace { +auto add_register_to_handler(RegisterHandler& handler) -> void { + for (size_t i{0}; i < cNumRegisters; ++i) { + handler.add_register(i, cInitialPos); + } +} +} // namespace + +TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { + RegisterHandler handler; + + SECTION("Initial state is empty") { + REQUIRE_THROWS_AS(handler.get_reversed_positions(cRegId1), std::out_of_range); + } + + add_register_to_handler(handler); + + SECTION("Set register position correctly") { + handler.set_register(cRegId1, cSetPos1); + REQUIRE(std::vector{cSetPos1} == handler.get_reversed_positions(cRegId1)); + } + + SECTION("Register relationships are maintained") { + handler.set_register(cRegId1, cSetPos1); + handler.set_register(cRegId2, cSetPos2); + handler.set_register(cRegId3, cSetPos3); + + auto positions{handler.get_reversed_positions(cRegId3)}; + REQUIRE(std::vector{cSetPos3, cSetPos2, cSetPos1} + == handler.get_reversed_positions(cRegId3)); + } + + SECTION("Copy register index correctly") { + handler.set_register(cRegId1, cSetPos1); + handler.copy_register(cRegId2, cRegId1); + REQUIRE(std::vector{cSetPos1} == handler.get_reversed_positions(cRegId2)); + } + + SECTION("`append_position` appends position correctly") { + handler.set_register(cRegId1, cSetPos1); + handler.append_position(cRegId1, cSetPos2); + REQUIRE(std::vector{cSetPos2, cSetPos1} + == handler.get_reversed_positions(cRegId1)); + } + + SECTION("Throws out of range correctly") { + REQUIRE_THROWS_AS(handler.set_register(cInvalidRegId, cSetPos1), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(cInvalidRegId, cRegId2), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(cRegId1, cInvalidRegId), std::out_of_range); + REQUIRE_THROWS_AS(handler.append_position(cInvalidRegId, cSetPos1), std::out_of_range); + REQUIRE_THROWS_AS(handler.get_reversed_positions(cInvalidRegId), std::out_of_range); + } + + SECTION("Handles negative position values correctly") { + handler.set_register(cRegId1, cNegativePos1); + handler.append_position(cRegId1, cSetPos1); + handler.append_position(cRegId1, cNegativePos2); + REQUIRE(std::vector{cNegativePos2, cSetPos1, cNegativePos1} + == handler.get_reversed_positions(cRegId1)); + } +} diff --git a/tests/test-tag.cpp b/tests/test-tag.cpp index fdfff4c1..41f8a2ef 100644 --- a/tests/test-tag.cpp +++ b/tests/test-tag.cpp @@ -4,7 +4,31 @@ using log_surgeon::finite_automata::Tag; -TEST_CASE("Test Tag class", "[Tag]") { - Tag const tag("uID"); - REQUIRE("uID" == tag.get_name()); +TEST_CASE("Tag operations", "[Tag]") { + SECTION("Basic name retrieval works correctly") { + Tag const tag{"uID"}; + REQUIRE("uID" == tag.get_name()); + } + + SECTION("Empty tag name is handled correctly") { + Tag const empty_tag{""}; + REQUIRE(empty_tag.get_name().empty()); + } + + SECTION("Special characters in tag names are preserved") { + Tag const special_tag{"user.id-123_@"}; + REQUIRE("user.id-123_@" == special_tag.get_name()); + } + + SECTION("Copy constructor works correctly") { + Tag assign_tag{"target"}; + assign_tag = Tag{"new_source"}; + REQUIRE("new_source" == assign_tag.get_name()); + } + + SECTION("Move constructor works correctly") { + Tag original_tag{"source"}; + Tag moved_tag{std::move(original_tag)}; + REQUIRE("source" == moved_tag.get_name()); + } } diff --git a/tools/deps-install/ubuntu/install-catch2.sh b/tools/deps-install/ubuntu/install-catch2.sh index bb5ebfbe..aa063d72 100755 --- a/tools/deps-install/ubuntu/install-catch2.sh +++ b/tools/deps-install/ubuntu/install-catch2.sh @@ -69,7 +69,7 @@ fi # Build cd "$extracted_dir" -cmake -B build -S . -DBUILD_TESTING=OFF +cmake -B build -S . -DBUILD_TESTING=OFF -DCMAKE_CXX_STANDARD=17 cmake --build build --parallel "$num_cpus" # Check if checkinstall is installed