From a6274ec8631033d0b75a83f8e351045f1e284f72 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 11 Sep 2024 20:07:38 -0400 Subject: [PATCH 001/323] Bug-fix for unicode array sizes --- src/log_surgeon/Constants.hpp | 1 + src/log_surgeon/LogParser.cpp | 2 +- src/log_surgeon/finite_automata/RegexAST.hpp | 18 +++++++++--------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/log_surgeon/Constants.hpp b/src/log_surgeon/Constants.hpp index f1a15853..85ae1670 100644 --- a/src/log_surgeon/Constants.hpp +++ b/src/log_surgeon/Constants.hpp @@ -6,6 +6,7 @@ namespace log_surgeon { constexpr uint32_t cUnicodeMax = 0x10'FFFF; +constexpr uint32_t cSizeOfUnicode = cUnicodeMax + 1; constexpr uint32_t cSizeOfByte = 256; constexpr uint32_t cSizeOfAllChildren = 10'000; constexpr uint32_t cNullSymbol = 10'000'000; diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index 88c168f0..9d2e91f8 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -96,7 +96,7 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters); // currently, error out if non-timestamp pattern contains a delimiter // check if regex contains a delimiter - std::array is_possible_input{}; + std::array is_possible_input{}; rule->m_regex_ptr->set_possible_inputs_to_true(is_possible_input); bool contains_delimiter = false; uint32_t delimiter_name = 0; diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 2409e0f9..a14a9c70 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -35,7 +35,7 @@ class RegexAST { * lexer rule * @param is_possible_input */ - virtual auto set_possible_inputs_to_true(std::array& is_possible_input + virtual auto set_possible_inputs_to_true(std::array& is_possible_input ) const -> void = 0; /** @@ -77,7 +77,7 @@ class RegexASTLiteral : public RegexAST 
{ * lexer rule containing RegexASTLiteral at a leaf node in its AST * @param is_possible_input */ - auto set_possible_inputs_to_true(std::array& is_possible_input + auto set_possible_inputs_to_true(std::array& is_possible_input ) const -> void override { is_possible_input[m_character] = true; } @@ -126,7 +126,7 @@ class RegexASTInteger : public RegexAST { * lexer rule containing RegexASTInteger at a leaf node in its AST * @param is_possible_input */ - auto set_possible_inputs_to_true(std::array& is_possible_input + auto set_possible_inputs_to_true(std::array& is_possible_input ) const -> void override { for (uint32_t const i : m_digits) { is_possible_input.at('0' + i) = true; @@ -196,7 +196,7 @@ class RegexASTGroup : public RegexAST { * lexer rule containing RegexASTGroup at a leaf node in its AST * @param is_possible_input */ - auto set_possible_inputs_to_true(std::array& is_possible_input + auto set_possible_inputs_to_true(std::array& is_possible_input ) const -> void override { if (!m_negate) { for (auto const& [begin, end] : m_ranges) { @@ -205,7 +205,7 @@ class RegexASTGroup : public RegexAST { } } } else { - std::vector inputs(cUnicodeMax, 1); + std::vector inputs(cSizeOfUnicode, 1); for (auto const& [begin, end] : m_ranges) { for (uint32_t i = begin; i <= end; i++) { inputs[i] = 0; @@ -321,7 +321,7 @@ class RegexASTOr : public RegexAST { * lexer rule containing RegexASTOr at a leaf node in its AST * @param is_possible_input */ - auto set_possible_inputs_to_true(std::array& is_possible_input + auto set_possible_inputs_to_true(std::array& is_possible_input ) const -> void override { m_left->set_possible_inputs_to_true(is_possible_input); m_right->set_possible_inputs_to_true(is_possible_input); @@ -381,7 +381,7 @@ class RegexASTCat : public RegexAST { * lexer rule containing RegexASTCat at a leaf node in its AST * @param is_possible_input */ - auto set_possible_inputs_to_true(std::array& is_possible_input + auto set_possible_inputs_to_true(std::array& 
is_possible_input ) const -> void override { m_left->set_possible_inputs_to_true(is_possible_input); m_right->set_possible_inputs_to_true(is_possible_input); @@ -451,7 +451,7 @@ class RegexASTMultiplication : public RegexAST { * lexer rule containing RegexASTMultiplication at a leaf node in its AST * @param is_possible_input */ - auto set_possible_inputs_to_true(std::array& is_possible_input + auto set_possible_inputs_to_true(std::array& is_possible_input ) const -> void override { m_operand->set_possible_inputs_to_true(is_possible_input); } @@ -522,7 +522,7 @@ class RegexASTCapture : public RegexAST { * lexer rule containing `RegexASTCapture` at a leaf node in its AST. * @param is_possible_input */ - auto set_possible_inputs_to_true(std::array& is_possible_input + auto set_possible_inputs_to_true(std::array& is_possible_input ) const -> void override { m_group_regex_ast->set_possible_inputs_to_true(is_possible_input); } From 4f122c6e568ff80868d5ef8067e5f7235b1acda3 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 12 Sep 2024 07:04:48 -0400 Subject: [PATCH 002/323] Move LexicalRule to its own class; Change name to variable_id; Change tag to matching_variable_id; Use full names for vars (r->rule); Clarify if states are NFA or DFA --- examples/intersect-test.cpp | 5 +- src/log_surgeon/Lexer.hpp | 53 ++++++++++++-------- src/log_surgeon/Lexer.tpp | 37 +++++++------- src/log_surgeon/LogEvent.hpp | 8 +-- src/log_surgeon/finite_automata/RegexAST.hpp | 10 ++-- src/log_surgeon/finite_automata/RegexDFA.hpp | 18 ++++--- src/log_surgeon/finite_automata/RegexDFA.tpp | 17 ++++--- src/log_surgeon/finite_automata/RegexNFA.hpp | 22 ++++---- 8 files changed, 94 insertions(+), 76 deletions(-) diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index 226afd6b..77450d1a 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -9,6 +9,7 @@ using log_surgeon::finite_automata::RegexDFAByteState; using log_surgeon::finite_automata::RegexNFA; 
using log_surgeon::finite_automata::RegexNFAByteState; using log_surgeon::lexers::ByteLexer; +using log_surgeon::LexicalRule; using log_surgeon::ParserAST; using log_surgeon::SchemaVarAST; using std::string; @@ -33,7 +34,7 @@ auto get_intersect_for_query( auto schema_ast = schema.release_schema_ast_ptr(); for (unique_ptr const& parser_ast : schema_ast->m_schema_vars) { auto* schema_var_ast = dynamic_cast(parser_ast.get()); - ByteLexer::Rule rule(0, std::move(schema_var_ast->m_regex_ptr)); + LexicalRule rule(0, std::move(schema_var_ast->m_regex_ptr)); rule.add_ast(&nfa); } auto dfa2 = ByteLexer::nfa_to_dfa(nfa); @@ -70,7 +71,7 @@ auto main() -> int { auto schema_ast = schema.release_schema_ast_ptr(); for (unique_ptr const& parser_ast : schema_ast->m_schema_vars) { auto* var_ast = dynamic_cast(parser_ast.get()); - ByteLexer::Rule rule(m_id_symbol.size(), std::move(var_ast->m_regex_ptr)); + LexicalRule rule(m_id_symbol.size(), std::move(var_ast->m_regex_ptr)); m_id_symbol[m_id_symbol.size()] = var_ast->m_name; rule.add_ast(&nfa); } diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 1a0ad137..7693fef0 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -18,6 +18,34 @@ #include namespace log_surgeon { +template +class LexicalRule { +public: + // Constructor + LexicalRule( + uint32_t const variable_id, + std::unique_ptr> regex + ) + : m_variable_id(variable_id), + m_regex(std::move(regex)) {} + + /** + * Adds AST representing the lexical rule to the NFA + * @param nfa + */ + auto add_ast(finite_automata::RegexNFA* nfa) const -> void; + + [[nodiscard]] auto get_variable_id() const -> uint32_t const& { return m_variable_id; } + + [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { + return m_regex.get(); + } + +private: + uint32_t m_variable_id; + std::unique_ptr> m_regex; +}; + template class Lexer { public: @@ -26,25 +54,6 @@ class Lexer { static inline std::vector const cTokenUncaughtStringTypes = 
{(int)SymbolID::TokenUncaughtStringID}; - /** - * A lexical rule has a name and regex pattern - */ - struct Rule { - // Constructor - Rule(uint32_t n, std::unique_ptr> r) - : m_name(n), - m_regex(std::move(r)) {} - - /** - * Adds AST representing the lexical rule to the NFA - * @param nfa - */ - auto add_ast(finite_automata::RegexNFA* nfa) const -> void; - - uint32_t m_name; - std::unique_ptr> m_regex; - }; - /** * Generate a DFA from an NFA * @param finite_automata::RegexNFA nfa @@ -69,10 +78,10 @@ class Lexer { /** * Return regex pattern for a rule name - * @param name + * @param variable_id * @return finite_automata::RegexAST* */ - auto get_rule(uint32_t const& name) -> finite_automata::RegexAST*; + auto get_rule(uint32_t const& variable_id) -> finite_automata::RegexAST*; /** * Generate DFA for lexer @@ -178,7 +187,7 @@ class Lexer { std::set m_type_ids_set; std::array m_is_delimiter{false}; std::array m_is_first_char{false}; - std::vector m_rules; + std::vector> m_rules; uint32_t m_line{0}; bool m_has_delimiters{false}; std::unique_ptr> m_dfa; diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 42b602ca..4ba1f91e 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -75,7 +75,7 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To && state->is_accepting()) { m_match = true; - m_type_ids = &(state->get_tags()); + m_type_ids = &(state->get_matching_variable_ids()); m_match_pos = prev_byte_buf_pos; m_match_line = m_line; } @@ -85,7 +85,7 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To if (m_has_delimiters && !m_match) { next = m_dfa->get_root()->next(next_char); m_match = true; - m_type_ids = &(next->get_tags()); + m_type_ids = &(next->get_matching_variable_ids()); m_start_pos = prev_byte_buf_pos; m_match_pos = input_buffer.storage().pos(); m_match_line = m_line; @@ -206,7 +206,7 @@ auto Lexer::scan_with_wildcard( && state->is_accepting()) { m_match = true; - m_type_ids = &(state->get_tags()); + m_type_ids = 
&(state->get_matching_variable_ids()); m_match_pos = prev_byte_buf_pos; m_match_line = m_line; } @@ -216,7 +216,7 @@ auto Lexer::scan_with_wildcard( if (m_has_delimiters && !m_match) { next = m_dfa->get_root()->next(next_char); m_match = true; - m_type_ids = &(next->get_tags()); + m_type_ids = &(next->get_matching_variable_ids()); m_start_pos = prev_byte_buf_pos; m_match_pos = input_buffer.storage().pos(); m_match_line = m_line; @@ -362,11 +362,11 @@ void Lexer::add_rule( } template -auto Lexer::get_rule(uint32_t const& name +auto Lexer::get_rule(uint32_t const& variable_id ) -> finite_automata::RegexAST* { - for (Rule& rule : m_rules) { - if (rule.m_name == name) { - return rule.m_regex.get(); + for (auto& rule : m_rules) { + if (rule.get_variable_id() == variable_id) { + return rule.get_regex(); } } return nullptr; @@ -375,8 +375,8 @@ auto Lexer::get_rule(uint32_t const& name template void Lexer::generate() { finite_automata::RegexNFA nfa; - for (Rule const& r : m_rules) { - r.add_ast(&nfa); + for (auto& rule : m_rules) { + rule.add_ast(&nfa); } m_dfa = nfa_to_dfa(nfa); DFAStateType const* state = m_dfa->get_root(); @@ -392,8 +392,8 @@ void Lexer::generate() { template void Lexer::generate_reverse() { finite_automata::RegexNFA nfa; - for (Rule const& r : m_rules) { - r.add_ast(&nfa); + for (auto& rule : m_rules) { + rule.add_ast(&nfa); } nfa.reverse(); m_dfa = nfa_to_dfa(nfa); @@ -407,13 +407,12 @@ void Lexer::generate_reverse() { } } -template -void Lexer::Rule::add_ast(finite_automata::RegexNFA* nfa -) const { - NFAStateType* s = nfa->new_state(); - s->set_accepting(true); - s->set_tag(m_name); - m_regex->add(nfa, s); +template +void LexicalRule::add_ast(finite_automata::RegexNFA* nfa) const { + NFAStateType* end_state = nfa->new_state(); + end_state->set_accepting(true); + end_state->set_matching_variable_id(m_variable_id); + m_regex->add(nfa, end_state); } template diff --git a/src/log_surgeon/LogEvent.hpp b/src/log_surgeon/LogEvent.hpp index 
5df70149..ec44c576 100644 --- a/src/log_surgeon/LogEvent.hpp +++ b/src/log_surgeon/LogEvent.hpp @@ -47,11 +47,11 @@ class LogEventView { * NOTE: Currently, the returned Token(s) cannot be const as calling * Token::to_string or Token::to_string_view may mutate Token (to handle the * case where a token is wraps from the end to the beginning of a buffer). - * @param var_id - * @return The tokens corresponding to var_id + * @param variable_id + * @return The tokens corresponding to variable_id */ - [[nodiscard]] auto get_variables(size_t var_id) const -> std::vector const& { - return m_log_var_occurrences[var_id]; + [[nodiscard]] auto get_variables(size_t variable_id) const -> std::vector const& { + return m_log_var_occurrences[variable_id]; } /** diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index a14a9c70..81fe217e 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -94,7 +94,7 @@ class RegexASTLiteral : public RegexAST { /** * Add the needed RegexNFA::states to the passed in nfa to handle a - * RegexASTLiteral before transitioning to a pre-tagged end_state + * RegexASTLiteral before transitioning to an accepting end_state * @param nfa * @param end_state */ @@ -145,7 +145,7 @@ class RegexASTInteger : public RegexAST { /** * Add the needed RegexNFA::states to the passed in nfa to handle a - * RegexASTInteger before transitioning to a pre-tagged end_state + * RegexASTInteger before transitioning to an accepting end_state * @param nfa * @param end_state */ @@ -251,7 +251,7 @@ class RegexASTGroup : public RegexAST { /** * Add the needed RegexNFA::states to the passed in nfa to handle a - * RegexASTGroup before transitioning to a pre-tagged end_state + * RegexASTGroup before transitioning to an accepting end_state * @param nfa * @param end_state */ @@ -339,7 +339,7 @@ class RegexASTOr : public RegexAST { /** * Add the needed RegexNFA::states to the passed 
in nfa to handle a - * RegexASTOr before transitioning to a pre-tagged end_state + * RegexASTOr before transitioning to an accepting end_state * @param nfa * @param end_state */ @@ -399,7 +399,7 @@ class RegexASTCat : public RegexAST { /** * Add the needed RegexNFA::states to the passed in nfa to handle a - * RegexASTCat before transitioning to a pre-tagged end_state + * RegexASTCat before transitioning to an accepting end_state * @param nfa * @param end_state */ diff --git a/src/log_surgeon/finite_automata/RegexDFA.hpp b/src/log_surgeon/finite_automata/RegexDFA.hpp index 20327c65..6d8a3c01 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.hpp +++ b/src/log_surgeon/finite_automata/RegexDFA.hpp @@ -23,11 +23,15 @@ class RegexDFAState { public: using Tree = UnicodeIntervalTree*>; - auto add_tag(int const& rule_name_id) -> void { m_tags.push_back(rule_name_id); } + auto add_matching_variable_id(int const& variable_id) -> void { + m_matching_variable_ids.push_back(variable_id); + } - [[nodiscard]] auto get_tags() const -> std::vector const& { return m_tags; } + [[nodiscard]] auto get_matching_variable_ids() const -> std::vector const& { + return m_matching_variable_ids; + } - [[nodiscard]] auto is_accepting() const -> bool { return !m_tags.empty(); } + [[nodiscard]] auto is_accepting() const -> bool { return !m_matching_variable_ids.empty(); } auto add_byte_transition(uint8_t const& byte, RegexDFAState* dest_state) -> void { m_bytes_transition[byte] = dest_state; @@ -42,7 +46,7 @@ class RegexDFAState { [[nodiscard]] auto next(uint32_t character) const -> RegexDFAState*; private: - std::vector m_tags; + std::vector m_matching_variable_ids; RegexDFAState* m_bytes_transition[cSizeOfByte]; // NOTE: We don't need m_tree_transitions for the `stateType == // RegexDFAStateType::Byte` case, so we use an empty class (`std::tuple<>`) @@ -89,10 +93,10 @@ class RegexDFAStatePair { } /** - * @return The tags of the first state of the pair + * @return The matching variable ids of 
the first state of the pair */ - [[nodiscard]] auto get_first_tags() const -> std::vector const& { - return m_state1->get_tags(); + [[nodiscard]] auto get_first_matching_variable_ids() const -> std::vector const& { + return m_state1->get_matching_variable_ids(); } private: diff --git a/src/log_surgeon/finite_automata/RegexDFA.tpp b/src/log_surgeon/finite_automata/RegexDFA.tpp index b4e2bda9..3bab70c7 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.tpp +++ b/src/log_surgeon/finite_automata/RegexDFA.tpp @@ -40,16 +40,17 @@ auto RegexDFAStatePair::get_reachable_pairs( template template -auto RegexDFA::new_state(std::set const& set) -> DFAStateType* { +auto RegexDFA::new_state(std::set const& nfa_state_set +) -> DFAStateType* { std::unique_ptr ptr = std::make_unique(); m_states.push_back(std::move(ptr)); - DFAStateType* state = m_states.back().get(); - for (NFAStateType const* s : set) { - if (s->is_accepting()) { - state->add_tag(s->get_tag()); + DFAStateType* dfa_state = m_states.back().get(); + for (NFAStateType const* nfa_state : nfa_state_set) { + if (nfa_state->is_accepting()) { + dfa_state->add_matching_variable_id(nfa_state->get_matching_variable_id()); } } - return state; + return dfa_state; } template @@ -63,8 +64,8 @@ auto RegexDFA::get_intersect(std::unique_ptr const& dfa_ while (false == unvisited_pairs.empty()) { auto current_pair_it = unvisited_pairs.begin(); if (current_pair_it->is_accepting()) { - auto& tags = current_pair_it->get_first_tags(); - schema_types.insert(tags.begin(), tags.end()); + auto& matching_variable_ids = current_pair_it->get_first_matching_variable_ids(); + schema_types.insert(matching_variable_ids.begin(), matching_variable_ids.end()); } visited_pairs.insert(*current_pair_it); current_pair_it->get_reachable_pairs(visited_pairs, unvisited_pairs); diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 040eb724..74fff507 100644 --- 
a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -31,9 +31,13 @@ class RegexNFAState { [[nodiscard]] auto is_accepting() const -> bool const& { return m_accepting; } - auto set_tag(int rule_name_id) -> void { m_tag = rule_name_id; } + auto set_matching_variable_id(int const variable_id) -> void { + m_matching_variable_id = variable_id; + } - [[nodiscard]] auto get_tag() const -> int const& { return m_tag; } + [[nodiscard]] auto get_matching_variable_id() const -> int const& { + return m_matching_variable_id; + } auto set_epsilon_transitions(std::vector& epsilon_transitions) -> void { m_epsilon_transitions = epsilon_transitions; @@ -78,7 +82,7 @@ class RegexNFAState { private: bool m_accepting{false}; - int m_tag{0}; + int m_matching_variable_id{0}; std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; // NOTE: We don't need m_tree_transitions for the `stateType == @@ -217,15 +221,15 @@ void RegexNFA::reverse() { } } - // propagate tag from old accepting m_states + // propagate matching_variable_id from old accepting m_states for (NFAStateType* old_accepting_state : new_end->get_epsilon_transitions()) { - int tag = old_accepting_state->get_tag(); + int matching_variable_id = old_accepting_state->get_matching_variable_id(); std::stack unvisited_states; std::set visited_states; unvisited_states.push(old_accepting_state); while (!unvisited_states.empty()) { NFAStateType* current_state = unvisited_states.top(); - current_state->set_tag(tag); + current_state->set_matching_variable_id(matching_variable_id); unvisited_states.pop(); visited_states.insert(current_state); for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { @@ -247,7 +251,7 @@ void RegexNFA::reverse() { for (int32_t i = m_states.size() - 1; i >= 0; --i) { std::unique_ptr& src_state_unique_ptr = m_states[i]; NFAStateType* src_state = src_state_unique_ptr.get(); - int tag = src_state->get_tag(); + int matching_variable_id = 
src_state->get_matching_variable_id(); for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { std::vector byte_transitions = src_state->get_byte_transitions(byte); for (int32_t j = byte_transitions.size() - 1; j >= 0; --j) { @@ -255,7 +259,7 @@ void RegexNFA::reverse() { if (dest_state == m_root) { dest_state = new_state(); assert(dest_state != nullptr); - dest_state->set_tag(tag); + dest_state->set_matching_variable_id(matching_variable_id); dest_state->set_accepting(true); } } @@ -267,7 +271,7 @@ void RegexNFA::reverse() { NFAStateType*& dest_state = epsilon_transitions[j]; if (dest_state == m_root) { dest_state = new_state(); - dest_state->set_tag(src_state->get_tag()); + dest_state->set_matching_variable_id(src_state->get_matching_variable_id()); dest_state->set_accepting(true); } } From c24f6e1eedfd6bb3c80079c2e738e33b6295d5b5 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 12 Sep 2024 09:06:28 -0400 Subject: [PATCH 003/323] Additional fix for swapping meaning of tag --- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 81fe217e..1bf51982 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -46,7 +46,7 @@ class RegexAST { /** * Add the needed RegexNFA::states to the passed in nfa to handle the - * current node before transitioning to a pre-tagged end_state + * current node before transitioning to an accepting end_state * @param nfa * @param end_state */ From 33582dae8855bdc92f59cf60b23531014ef1229f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 12 Sep 2024 09:12:22 -0400 Subject: [PATCH 004/323] Another additional fix for swapping meaning of tag --- src/log_surgeon/finite_automata/RegexAST.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp 
b/src/log_surgeon/finite_automata/RegexAST.hpp index 1bf51982..7f4db644 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -468,7 +468,7 @@ class RegexASTMultiplication : public RegexAST { /** * Add the needed RegexNFA::states to the passed in nfa to handle a - * RegexASTMultiplication before transitioning to a pre-tagged end_state + * RegexASTMultiplication before transitioning to an accepting end_state * @param nfa * @param end_state */ @@ -538,7 +538,7 @@ class RegexASTCapture : public RegexAST { /** * Adds the needed `RegexNFA::states` to the passed in nfa to handle a - * `RegexASTCapture` before transitioning to a pre-tagged `end_state`. + * `RegexASTCapture` before transitioning to an accepting `end_state`. * @param nfa * @param end_state */ From 3338ec7429a20a9b1edba412d3cb7b4cd1379c5f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 12 Sep 2024 09:17:37 -0400 Subject: [PATCH 005/323] Fix up some comments --- src/log_surgeon/LogParser.cpp | 10 ++++++++-- src/log_surgeon/finite_automata/RegexNFA.hpp | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index 9d2e91f8..602a7324 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -53,7 +53,8 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { delimiters.push_back(i); } } - // Currently, required to have delimiters + + // Required to have delimiters if (delimiters.empty()) { throw runtime_error("When using --schema-path, \"delimiters:\" line must be used."); } @@ -94,7 +95,7 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { } // transform '.' 
from any-character into any non-delimiter character rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters); - // currently, error out if non-timestamp pattern contains a delimiter + // check if regex contains a delimiter std::array is_possible_input{}; rule->m_regex_ptr->set_possible_inputs_to_true(is_possible_input); @@ -107,6 +108,8 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { break; } } + + // Error out if non-timestamp pattern contains a delimiter if (contains_delimiter) { FileReader schema_reader; ErrorCode error_code = schema_reader.try_open(schema_ast->m_file_path); @@ -141,6 +144,8 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { + arrows + "\n" ); } + + // Add delimiters as prefix to regex as variables require preceeding delimiters unique_ptr> delimiter_group = make_unique>( RegexASTGroup(delimiters) @@ -149,6 +154,7 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { std::move(delimiter_group), std::move(rule->m_regex_ptr) ); + add_rule(rule->m_name, std::move(rule->m_regex_ptr)); } } diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 74fff507..6c1e8953 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -95,6 +95,7 @@ class RegexNFAState { using RegexNFAByteState = RegexNFAState; using RegexNFAUTF8State = RegexNFAState; +// TODO: rename RegexNFA to NFA and RegexDFA to DFA template class RegexNFA { public: From 3cd3c0f250ba48f07f40ca1ab34b614c9957e01e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 12 Sep 2024 09:22:07 -0400 Subject: [PATCH 006/323] Fix comment grammar --- src/log_surgeon/LogParser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index 602a7324..aa16cc06 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -145,7 +145,7 @@ void LogParser::add_rules(std::unique_ptr 
schema_ast) { ); } - // Add delimiters as prefix to regex as variables require preceeding delimiters + // To make lexing log-specific: modify variable regex to contain a delimiter at the start. unique_ptr> delimiter_group = make_unique>( RegexASTGroup(delimiters) From e05acbb401efbf891d56f45c5838da9ba791df3b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 13 Sep 2024 10:41:53 -0400 Subject: [PATCH 007/323] Add tags to AST; Serialize AST for testing; Add unit-test for testing added tags --- src/log_surgeon/finite_automata/RegexAST.hpp | 274 +++++++++++++++++++ tests/test-lexer.cpp | 28 ++ 2 files changed, 302 insertions(+) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 7f4db644..270a7573 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -52,11 +52,46 @@ class RegexAST { */ virtual auto add(RegexNFA* nfa, NFAStateType* end_state) -> void = 0; + /** + * Serialize the AST into a string + * @param with_tags + * @return string representing the AST + */ + virtual auto serialize(bool with_tags) -> std::string = 0; + + /** + * Serialize the negative tags + * @return + */ + auto serialize_negative_tags() -> std::string { + std::string serialized_string; + for (auto const& negative_tag : m_negative_tags) { + serialized_string += "<~" + std::to_string(negative_tag) + ">"; + } + return serialized_string; + } + + /** + * Traverse the AST and add positive and negative tags. 
+ */ + virtual auto add_tags(std::vector& all_tags) -> std::vector = 0; + + auto set_negative_tags(std::vector const& negative_tags) -> void { + m_negative_tags = negative_tags; + } + + [[nodiscard]] auto get_negative_tags() const -> std::vector const& { + return m_negative_tags; + } + protected: RegexAST(RegexAST const& rhs) = default; auto operator=(RegexAST const& rhs) -> RegexAST& = default; RegexAST(RegexAST&& rhs) noexcept = default; auto operator=(RegexAST&& rhs) noexcept -> RegexAST& = default; + +private: + std::vector m_negative_tags; }; template @@ -100,6 +135,21 @@ class RegexASTLiteral : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * serialize the RegexASTLiteral into a string + * @param with_tags + * @return string representing the AST + */ + auto serialize(bool with_tags) -> std::string override; + + /** + * Do nothing as RegexASTLiteral is a leaf node that is not a capture group + */ + auto add_tags([[maybe_unused]] std::vector& all_tags + ) -> std::vector override { + return {}; + } + [[nodiscard]] auto get_character() const -> uint32_t const& { return m_character; } private: @@ -151,6 +201,21 @@ class RegexASTInteger : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * serialize the RegexASTInteger into a string + * @param with_tags + * @return string representing the AST + */ + auto serialize(bool with_tags) -> std::string override; + + /** + * Do nothing as RegexASTInteger is a leaf node that is not a capture group + */ + auto add_tags([[maybe_unused]] std::vector& all_tags + ) -> std::vector override { + return {}; + } + [[nodiscard]] auto get_digits() const -> std::vector const& { return m_digits; } [[nodiscard]] auto get_digit(uint32_t i) const -> uint32_t const& { return m_digits[i]; } @@ -257,6 +322,21 @@ class RegexASTGroup : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * serialize the 
RegexASTGroup into a string + * @param with_tags + * @return string representing the AST + */ + auto serialize(bool with_tags) -> std::string override; + + /** + * Do nothing as RegexASTGroup is a leaf node that is not a capture group + */ + auto add_tags([[maybe_unused]] std::vector& all_tags + ) -> std::vector override { + return {}; + } + auto add_range(uint32_t min, uint32_t max) -> void { m_ranges.emplace_back(min, max); } auto add_literal(uint32_t literal) -> void { m_ranges.emplace_back(literal, literal); } @@ -345,6 +425,26 @@ class RegexASTOr : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * serialize the RegexASTOr into a string + * @param with_tags + * @return string representing the AST + */ + auto serialize(bool with_tags) -> std::string override; + + /** + * Traverse the AST and add positive and negative tags. + */ + auto add_tags(std::vector& all_tags) -> std::vector override; + + [[nodiscard]] auto get_left() const -> std::unique_ptr> const& { + return m_left; + } + + [[nodiscard]] auto get_right() const -> std::unique_ptr> const& { + return m_right; + } + private: std::unique_ptr> m_left; std::unique_ptr> m_right; @@ -405,6 +505,18 @@ class RegexASTCat : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * serialize the RegexASTCat into a string + * @param with_tags + * @return string representing the AST + */ + auto serialize(bool with_tags) -> std::string override; + + /** + * Traverse the AST and add positive and negative tags. 
+ */ + auto add_tags(std::vector& all_tags) -> std::vector override; + [[nodiscard]] auto get_left() const -> std::unique_ptr> const& { return m_left; } @@ -474,6 +586,18 @@ class RegexASTMultiplication : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * serialize the RegexASTMultiplication into a string + * @param with_tags + * @return string representing the AST + */ + auto serialize(bool with_tags) -> std::string override; + + /** + * Traverse the AST and add positive and negative tags. + */ + auto add_tags(std::vector& all_tags) -> std::vector override; + [[nodiscard]] auto is_infinite() const -> bool { return this->m_max == 0; } [[nodiscard]] auto get_operand() const -> std::unique_ptr> const& { @@ -544,6 +668,18 @@ class RegexASTCapture : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * serialize the RegexASTCapture into a string + * @param with_tags + * @return string representing the AST + */ + auto serialize(bool with_tags) -> std::string override; + + /** + * Traverse the AST and add positive and negative tags. 
+ */ + auto add_tags(std::vector& all_tags) -> std::vector override; + [[nodiscard]] auto get_group_name() const -> std::string const& { return m_group_name; } [[nodiscard]] auto get_group_regex_ast( @@ -551,9 +687,12 @@ class RegexASTCapture : public RegexAST { return m_group_regex_ast; } + [[nodiscard]] auto get_tag() const -> uint32_t { return m_tag; } + private: std::string m_group_name; std::unique_ptr> m_group_regex_ast; + uint32_t m_tag; }; template @@ -564,6 +703,15 @@ void RegexASTLiteral::add(RegexNFA* nfa, NFAStateTyp nfa->add_root_interval(Interval(m_character, m_character), end_state); } +template +auto RegexASTLiteral::serialize(bool const with_tags) -> std::string { + std::string serialized_string = std::string(1, static_cast(m_character)); + if (with_tags) { + serialized_string += this->serialize_negative_tags(); + } + return serialized_string; +} + template RegexASTInteger::RegexASTInteger(uint32_t digit) { digit = digit - '0'; @@ -585,6 +733,18 @@ void RegexASTInteger::add( throw std::runtime_error("Unsupported"); } +template +auto RegexASTInteger::serialize(bool const with_tags) -> std::string { + std::string serialized_string; + for (auto const& digit : m_digits) { + serialized_string += std::to_string('0' + digit); + } + if (with_tags) { + serialized_string += this->serialize_negative_tags(); + } + return serialized_string; +} + template RegexASTOr::RegexASTOr( std::unique_ptr> left, @@ -599,6 +759,30 @@ void RegexASTOr::add(RegexNFA* nfa, NFAStateType* en m_right->add(nfa, end_state); } +template +auto RegexASTOr::add_tags(std::vector& all_tags) -> std::vector { + auto positive_left_tags = m_left->add_tags(all_tags); + auto positive_right_tags = m_right->add_tags(all_tags); + m_left->set_negative_tags(positive_right_tags); + m_right->set_negative_tags(positive_left_tags); + positive_left_tags.insert( + positive_left_tags.end(), + positive_right_tags.begin(), + positive_right_tags.end() + ); + return positive_left_tags; +} + +template +auto 
RegexASTOr::serialize(bool const with_tags) -> std::string { + std::string serialized_string + = "(" + m_left->serialize(with_tags) + ")|(" + m_right->serialize(with_tags) + ")"; + if (with_tags) { + serialized_string += this->serialize_negative_tags(); + } + return serialized_string; +} + template RegexASTCat::RegexASTCat( std::unique_ptr> left, @@ -617,6 +801,27 @@ void RegexASTCat::add(RegexNFA* nfa, NFAStateType* e nfa->set_root(saved_root); } +template +auto RegexASTCat::add_tags(std::vector& all_tags) -> std::vector { + auto positive_left_tags = m_left->add_tags(all_tags); + auto positive_right_tags = m_right->add_tags(all_tags); + positive_left_tags.insert( + positive_left_tags.end(), + positive_right_tags.begin(), + positive_right_tags.end() + ); + return positive_left_tags; +} + +template +auto RegexASTCat::serialize(bool const with_tags) -> std::string { + std::string serialized_string = m_left->serialize(with_tags) + m_right->serialize(with_tags); + if (with_tags) { + serialized_string += this->serialize_negative_tags(); + } + return serialized_string; +} + template RegexASTMultiplication::RegexASTMultiplication( std::unique_ptr> operand, @@ -663,11 +868,58 @@ void RegexASTMultiplication::add( nfa->set_root(saved_root); } +template +auto RegexASTMultiplication::add_tags(std::vector& all_tags +) -> std::vector { + return m_operand->add_tags(all_tags); +} + +template +auto RegexASTMultiplication::serialize(bool const with_tags) -> std::string { + std::string serialized_string + = m_operand->serialize(with_tags) + "{" + std::to_string(m_min) + ","; + if (is_infinite()) { + serialized_string += "inf"; + } else { + serialized_string += std::to_string(m_max); + } + serialized_string += "}"; + if (with_tags) { + serialized_string += this->serialize_negative_tags(); + } + return serialized_string; +} + template void RegexASTCapture::add(RegexNFA* nfa, NFAStateType* end_state) { m_group_regex_ast->add(nfa, end_state); } +template +auto 
RegexASTCapture::add_tags(std::vector& all_tags +) -> std::vector { + m_tag = all_tags.size(); + all_tags.push_back(m_tag); + std::vector child_tags = m_group_regex_ast->add_tags(all_tags); + std::vector new_tags; + new_tags.push_back(m_tag); + new_tags.insert(new_tags.end(), child_tags.begin(), child_tags.end()); + return new_tags; +} + +template +auto RegexASTCapture::serialize(bool const with_tags) -> std::string { + std::string serialized_string = "("; + if (false == with_tags) { + serialized_string += "?<" + m_group_name + ">"; + } + serialized_string += m_group_regex_ast->serialize(with_tags) + ")"; + if (with_tags) { + serialized_string += "<" + std::to_string(m_tag) + ">" + this->serialize_negative_tags(); + } + return serialized_string; +} + template RegexASTGroup::RegexASTGroup() = default; @@ -791,6 +1043,28 @@ void RegexASTGroup::add(RegexNFA* nfa, NFAStateType* nfa->get_root()->add_interval(Interval(begin, end), end_state); } } + +template +auto RegexASTGroup::serialize(bool const with_tags) -> std::string { + std::string serialized_string; + serialized_string += "["; + if (m_negate) { + serialized_string += "^"; + } + if (m_is_wildcard) { + serialized_string += "*"; + } else { + for (auto const& [begin, end] : m_ranges) { + serialized_string += std::string(1, static_cast(begin)) + "-" + + std::string(1, static_cast(end)); + } + } + serialized_string += "]"; + if (with_tags) { + serialized_string += this->serialize_negative_tags(); + } + return serialized_string; +} } // namespace log_surgeon::finite_automata #endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_AST_HPP diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 16b399b5..92e845b7 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -1,3 +1,7 @@ +#include +#include +#include + #include #include @@ -78,4 +82,28 @@ TEST_CASE("Test the Schema class", "[Schema]") { REQUIRE('0' == regex_ast_group_ast->get_ranges()[0].first); REQUIRE('9' == regex_ast_group_ast->get_ranges()[0].second); } 
+ + SECTION("Test AST with tags") { + schema.add_variable( + "capture", + "Z|(A(?((?(a)|(b))|(?(c)|(d))))B(?\\d+)C)", + -1 + ); + auto const schema_ast = schema.release_schema_ast_ptr(); + auto& capture_rule_ast + = dynamic_cast(*schema_ast->m_schema_vars[0]); + std::vector all_tags; + capture_rule_ast.m_regex_ptr->add_tags(all_tags); + + std::string expected_serialized_string + = "(Z)|(A(?((?(a)|(b)))|((?(c)|" + "(d))))B(?[0-9]{1,inf})C)"; + REQUIRE(capture_rule_ast.m_regex_ptr->serialize(false) == expected_serialized_string); + + std::string expected_serialized_string_with_tags + = "(Z<~0><~1><~2><~3>)|(A((((a)|(b))<1><~2>)|(((c)|(d))<2><~1>))<0>B([0-9]{1,inf})<" + "3>C)"; + REQUIRE(capture_rule_ast.m_regex_ptr->serialize(true) + == expected_serialized_string_with_tags); + } } From 5e61e83daef8363d9e9b873c15be0465a00221b7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 13 Sep 2024 10:59:01 -0400 Subject: [PATCH 008/323] Use using to condense code; Use a unique schema object for each test for clairty that nothing is shared b/w tests --- tests/test-lexer.cpp | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 92e845b7..43babedb 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -9,6 +9,9 @@ #include #include +using std::string; +using std::vector; + using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat< log_surgeon::finite_automata::RegexNFAByteState>; using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture< @@ -19,27 +22,29 @@ using RegexASTLiteralByte = log_surgeon::finite_automata::RegexASTLiteral< log_surgeon::finite_automata::RegexNFAByteState>; using RegexASTMultiplicationByte = log_surgeon::finite_automata::RegexASTMultiplication< log_surgeon::finite_automata::RegexNFAByteState>; +using RegexASTOrByte + = log_surgeon::finite_automata::RegexASTOr; +using log_surgeon::SchemaVarAST; TEST_CASE("Test the Schema 
class", "[Schema]") { - log_surgeon::Schema schema; - SECTION("Add a number variable to schema") { - schema.add_variable("myNumber", "123", -1); + log_surgeon::Schema schema; + string const var_name = "myNumber"; + schema.add_variable(var_name, "123", -1); auto const schema_ast = schema.release_schema_ast_ptr(); REQUIRE(schema_ast->m_schema_vars.size() == 1); REQUIRE(schema.release_schema_ast_ptr()->m_schema_vars.empty()); auto& schema_var_ast_ptr = schema_ast->m_schema_vars[0]; REQUIRE(nullptr != schema_var_ast_ptr); - auto& schema_var_ast = dynamic_cast(*schema_var_ast_ptr); - REQUIRE("myNumber" == schema_var_ast.m_name); + auto& schema_var_ast = dynamic_cast(*schema_var_ast_ptr); + REQUIRE(var_name == schema_var_ast.m_name); - REQUIRE_NOTHROW([&]() { - auto& regex_ast_cat = dynamic_cast(*schema_var_ast.m_regex_ptr); - }()); + REQUIRE_NOTHROW([&]() { dynamic_cast(*schema_var_ast.m_regex_ptr); }()); } SECTION("Add a capture variable to schema") { + log_surgeon::Schema schema; std::string const var_name = "capture"; schema.add_variable(var_name, "u(?[0-9]+)", -1); auto const schema_ast = schema.release_schema_ast_ptr(); @@ -48,7 +53,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { auto& schema_var_ast_ptr = schema_ast->m_schema_vars[0]; REQUIRE(nullptr != schema_var_ast_ptr); - auto& schema_var_ast = dynamic_cast(*schema_var_ast_ptr); + auto& schema_var_ast = dynamic_cast(*schema_var_ast_ptr); REQUIRE(var_name == schema_var_ast.m_name); auto* regex_ast_cat_ptr = dynamic_cast(schema_var_ast.m_regex_ptr.get()); @@ -84,23 +89,22 @@ TEST_CASE("Test the Schema class", "[Schema]") { } SECTION("Test AST with tags") { + log_surgeon::Schema schema; schema.add_variable( "capture", "Z|(A(?((?(a)|(b))|(?(c)|(d))))B(?\\d+)C)", -1 ); auto const schema_ast = schema.release_schema_ast_ptr(); - auto& capture_rule_ast - = dynamic_cast(*schema_ast->m_schema_vars[0]); - std::vector all_tags; + auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); + vector 
all_tags; capture_rule_ast.m_regex_ptr->add_tags(all_tags); - std::string expected_serialized_string - = "(Z)|(A(?((?(a)|(b)))|((?(c)|" - "(d))))B(?[0-9]{1,inf})C)"; + string expected_serialized_string = "(Z)|(A(?((?(a)|(b)))|((?(c)|" + "(d))))B(?[0-9]{1,inf})C)"; REQUIRE(capture_rule_ast.m_regex_ptr->serialize(false) == expected_serialized_string); - std::string expected_serialized_string_with_tags + string expected_serialized_string_with_tags = "(Z<~0><~1><~2><~3>)|(A((((a)|(b))<1><~2>)|(((c)|(d))<2><~1>))<0>B([0-9]{1,inf})<" "3>C)"; REQUIRE(capture_rule_ast.m_regex_ptr->serialize(true) From 082090dcf498ca0932db7d3f897da832c122da05 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 13 Sep 2024 12:07:14 -0400 Subject: [PATCH 009/323] Add has_capture_groups(); Add unit-test for has_capture_groups() --- src/log_surgeon/finite_automata/RegexAST.hpp | 63 ++++++++++++++++++++ tests/test-lexer.cpp | 11 ++++ 2 files changed, 74 insertions(+) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 270a7573..353755e6 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -52,6 +52,12 @@ class RegexAST { */ virtual auto add(RegexNFA* nfa, NFAStateType* end_state) -> void = 0; + /** + * Traverse the AST and check if it contains a capture group + * @return true if the AST contains a capture group, false otherwise + */ + virtual auto has_capture_groups() -> bool = 0; + /** * Serialize the AST into a string * @param with_tags @@ -135,6 +141,12 @@ class RegexASTLiteral : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * Return false as RegexASTLiteral is a leaf node that is not a capture group + * @return false + */ + auto has_capture_groups() -> bool override { return false; } + /** * serialize the RegexASTLiteral into a string * @param with_tags @@ -201,6 +213,12 @@ class RegexASTInteger : public RegexAST { */ auto 
add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * Return false as RegexASTInteger is a leaf node that is not a capture group + * @return false + */ + auto has_capture_groups() -> bool override { return false; } + /** * serialize the RegexASTInteger into a string * @param with_tags @@ -322,6 +340,12 @@ class RegexASTGroup : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * Return false as RegexASTGroup is a leaf node that is not a capture group + * @return false + */ + auto has_capture_groups() -> bool override { return false; } + /** * serialize the RegexASTGroup into a string * @param with_tags @@ -425,6 +449,12 @@ class RegexASTOr : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * Traverse the AST and check if RegexASTOr contains a capture group + * @return true if the AST contains a capture group, false otherwise + */ + auto has_capture_groups() -> bool override; + /** * serialize the RegexASTOr into a string * @param with_tags @@ -505,6 +535,12 @@ class RegexASTCat : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * Traverse the AST and check if it contains a capture group + * @return true if the AST contains a capture group, false otherwise + */ + auto has_capture_groups() -> bool override; + /** * serialize the RegexASTCat into a string * @param with_tags @@ -586,6 +622,12 @@ class RegexASTMultiplication : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + /** + * Traverse the AST and check if RegexASTMultiplication contains a capture group + * @return true if the AST contains a capture group, false otherwise + */ + auto has_capture_groups() -> bool override; + /** * serialize the RegexASTMultiplication into a string * @param with_tags @@ -668,6 +710,12 @@ class RegexASTCapture : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void 
override; + /** + * Return true as RegexASTCapture is a capture group + * @return true + */ + auto has_capture_groups() -> bool override { return true; } + /** * serialize the RegexASTCapture into a string * @param with_tags @@ -759,6 +807,11 @@ void RegexASTOr::add(RegexNFA* nfa, NFAStateType* en m_right->add(nfa, end_state); } +template +auto RegexASTOr::has_capture_groups() -> bool { + return m_left->has_capture_groups() || m_right->has_capture_groups(); +} + template auto RegexASTOr::add_tags(std::vector& all_tags) -> std::vector { auto positive_left_tags = m_left->add_tags(all_tags); @@ -801,6 +854,11 @@ void RegexASTCat::add(RegexNFA* nfa, NFAStateType* e nfa->set_root(saved_root); } +template +auto RegexASTCat::has_capture_groups() -> bool { + return m_left->has_capture_groups() || m_right->has_capture_groups(); +} + template auto RegexASTCat::add_tags(std::vector& all_tags) -> std::vector { auto positive_left_tags = m_left->add_tags(all_tags); @@ -868,6 +926,11 @@ void RegexASTMultiplication::add( nfa->set_root(saved_root); } +template +auto RegexASTMultiplication::has_capture_groups() -> bool { + return m_operand->has_capture_groups(); +} + template auto RegexASTMultiplication::add_tags(std::vector& all_tags ) -> std::vector { diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 43babedb..a90816b1 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -88,6 +88,17 @@ TEST_CASE("Test the Schema class", "[Schema]") { REQUIRE('9' == regex_ast_group_ast->get_ranges()[0].second); } + SECTION("Test has_capture_groups()") { + log_surgeon::Schema schema; + schema.add_variable("number", "123", -1); + schema.add_variable("capture", "user_id=(?[0-9]+)", -1); + auto const schema_ast = schema.release_schema_ast_ptr(); + auto& number_var_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); + REQUIRE(false == number_var_ast.m_regex_ptr->has_capture_groups()); + auto& capture_var_ast = dynamic_cast(*schema_ast->m_schema_vars[1]); + 
REQUIRE(capture_var_ast.m_regex_ptr->has_capture_groups()); + } + SECTION("Test AST with tags") { log_surgeon::Schema schema; schema.add_variable( From 2c6d94e533d0ec182c2a546b5f568c6f6a201180 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 13 Sep 2024 13:24:20 -0400 Subject: [PATCH 010/323] Create and use RegexASTEmpty to split RegexASTgroup with min=0 into RegexASTgroup with min = 1 OR'd with RegexASTEmpty --- src/log_surgeon/SchemaParser.cpp | 17 +++- src/log_surgeon/finite_automata/RegexAST.hpp | 82 ++++++++++++++++++++ 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index a20172c8..3b4c39ee 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -28,6 +28,8 @@ using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat< log_surgeon::finite_automata::RegexNFAByteState>; using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture< log_surgeon::finite_automata::RegexNFAByteState>; +using RegexASTEmptyByte = log_surgeon::finite_automata::RegexASTEmpty< + log_surgeon::finite_automata::RegexNFAByteState>; using std::make_unique; using std::string; @@ -186,8 +188,11 @@ static auto regex_or_rule(NonTerminal* m) -> unique_ptr { static auto regex_match_zero_or_more_rule(NonTerminal* m) -> unique_ptr { auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); - return unique_ptr(new ParserValueRegex( - unique_ptr(new RegexASTMultiplicationByte(std::move(r1), 0, 0)) + + // To handle negative tags we need to split up the min == 0 and min > 0 case + return make_unique(make_unique( + make_unique(), + make_unique(std::move(r1), 1, 0) )); } @@ -228,6 +233,14 @@ static auto regex_match_range_rule(NonTerminal* m) -> unique_ptr { max += r5_ptr->get_digit(i) * (uint32_t)pow(10, r5_size - i - 1); } auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); + + if (min == 0) { + // To handle negative tags we need to split up the min == 
0 and min > 0 case + return make_unique(make_unique( + make_unique(), + make_unique(std::move(r1), 1, max) + )); + } return unique_ptr(new ParserValueRegex( unique_ptr(new RegexASTMultiplicationByte(std::move(r1), min, max)) )); diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 353755e6..2a635003 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -100,6 +100,68 @@ class RegexAST { std::vector m_negative_tags; }; +template +class RegexASTEmpty : public RegexAST { +public: + RegexASTEmpty(); + + /** + * Used for cloning a unique_pointer of type RegexASTEmpty + * @return RegexASTEmpty* + */ + [[nodiscard]] auto clone() const -> gsl::owner override { + return new RegexASTEmpty(*this); + } + + /** + * Sets is_possible_input to specify which utf8 characters are allowed in a + * lexer rule containing RegexASTEmpty at a leaf node in its AST, which is nothing + * @param is_possible_input + */ + auto set_possible_inputs_to_true( + [[maybe_unused]] std::array& is_possible_input + ) const -> void override {} + + /** + * Transforms '.' 
to to be any non-delimiter in a lexer rule, which does + * nothing as RegexASTEmpty is a leaf node that is not a RegexASTGroup + * @param delimiters + */ + auto remove_delimiters_from_wildcard([[maybe_unused]] std::vector& delimiters + ) -> void override { + // Do nothing + } + + /** + * Add the needed RegexNFA::states to the passed in nfa to handle a + * RegexASTEmpty before transitioning to an accepting end_state + * @param nfa + * @param end_state + */ + auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + + /** + * Return false as RegexASTEmpty is a leaf node that is not a capture group + * @return false + */ + auto has_capture_groups() -> bool override { return false; } + + /** + * serialize the RegexASTEmpty into a string + * @param with_tags + * @return string representing the AST + */ + auto serialize(bool const with_tags) -> std::string; + + /** + * Do nothing as RegexASTEmpty is a leaf node that is not a capture group + */ + auto add_tags([[maybe_unused]] std::vector& all_tags + ) -> std::vector override { + return {}; + } +}; + template class RegexASTLiteral : public RegexAST { public: @@ -743,6 +805,26 @@ class RegexASTCapture : public RegexAST { uint32_t m_tag; }; +template +RegexASTEmpty::RegexASTEmpty() = default; + +template +void RegexASTEmpty::add( + [[maybe_unused]] RegexNFA* nfa, + [[maybe_unused]] NFAStateType* end_state +) { + // DO NOTHING +} + +template +auto RegexASTEmpty::serialize(bool const with_tags) -> std::string { + std::string serialized_string; + if (with_tags) { + serialized_string += this->serialize_negative_tags(); + } + return serialized_string; +} + template RegexASTLiteral::RegexASTLiteral(uint32_t character) : m_character(character) {} From 4e02f240c3ab7f6bcbdc56fdff70fb2f2a9713a7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 13 Sep 2024 13:41:05 -0400 Subject: [PATCH 011/323] Add unit-test for 0 repetition regex --- tests/test-lexer.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git 
a/tests/test-lexer.cpp b/tests/test-lexer.cpp index a90816b1..e88541a0 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -121,4 +121,16 @@ TEST_CASE("Test the Schema class", "[Schema]") { REQUIRE(capture_rule_ast.m_regex_ptr->serialize(true) == expected_serialized_string_with_tags); } + + SECTION("Test reptition regex with min=0") { + log_surgeon::Schema schema; + schema.add_variable("capture", "(?(a)){0,1}", -1); + auto const schema_ast = schema.release_schema_ast_ptr(); + auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); + vector all_tags; + capture_rule_ast.m_regex_ptr->add_tags(all_tags); + string expected_serialized_string_with_tags = "(<~0>)|((a)<0>{1,1})"; + REQUIRE(capture_rule_ast.m_regex_ptr->serialize(true) + == expected_serialized_string_with_tags); + } } From bb3c543dadcc1c1218909276d514d9e8dce03ced Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 13 Sep 2024 14:35:31 -0400 Subject: [PATCH 012/323] Add more tests for repetition regex --- tests/test-lexer.cpp | 63 ++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index e88541a0..0ea2b7d0 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -26,6 +26,21 @@ using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr; using log_surgeon::SchemaVarAST; +auto test_regex_ast( + string const& regex, + string const& expected_serialized_ast_without_tags, + string const& expected_serialized_ast_with_tags +) -> void { + log_surgeon::Schema schema; + schema.add_variable("capture", regex, -1); + auto const schema_ast = schema.release_schema_ast_ptr(); + auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); + vector all_tags; + capture_rule_ast.m_regex_ptr->add_tags(all_tags); + REQUIRE(capture_rule_ast.m_regex_ptr->serialize(false) == expected_serialized_ast_without_tags); + REQUIRE(capture_rule_ast.m_regex_ptr->serialize(true) == 
expected_serialized_ast_with_tags); +} + TEST_CASE("Test the Schema class", "[Schema]") { SECTION("Add a number variable to schema") { log_surgeon::Schema schema; @@ -100,37 +115,29 @@ TEST_CASE("Test the Schema class", "[Schema]") { } SECTION("Test AST with tags") { - log_surgeon::Schema schema; - schema.add_variable( - "capture", + test_regex_ast( "Z|(A(?((?(a)|(b))|(?(c)|(d))))B(?\\d+)C)", - -1 + "(Z)|(A(?((?(a)|(b)))|((?(c)|(d))))B(?[0-9]{" + "1,inf})C)", + "(Z<~0><~1><~2><~3>)|(A((((a)|(b))<1><~2>)|(((c)|(d))<2><~1>))<0>B([0-9]{1,inf})<3>" + "C)" ); - auto const schema_ast = schema.release_schema_ast_ptr(); - auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); - vector all_tags; - capture_rule_ast.m_regex_ptr->add_tags(all_tags); - - string expected_serialized_string = "(Z)|(A(?((?(a)|(b)))|((?(c)|" - "(d))))B(?[0-9]{1,inf})C)"; - REQUIRE(capture_rule_ast.m_regex_ptr->serialize(false) == expected_serialized_string); - - string expected_serialized_string_with_tags - = "(Z<~0><~1><~2><~3>)|(A((((a)|(b))<1><~2>)|(((c)|(d))<2><~1>))<0>B([0-9]{1,inf})<" - "3>C)"; - REQUIRE(capture_rule_ast.m_regex_ptr->serialize(true) - == expected_serialized_string_with_tags); } - SECTION("Test reptition regex with min=0") { - log_surgeon::Schema schema; - schema.add_variable("capture", "(?(a)){0,1}", -1); - auto const schema_ast = schema.release_schema_ast_ptr(); - auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); - vector all_tags; - capture_rule_ast.m_regex_ptr->add_tags(all_tags); - string expected_serialized_string_with_tags = "(<~0>)|((a)<0>{1,1})"; - REQUIRE(capture_rule_ast.m_regex_ptr->serialize(true) - == expected_serialized_string_with_tags); + SECTION("Test reptition regex") { + // Repetition without capture groups untagged and tagged AST are the same + test_regex_ast("a{0,10}","()|(a{1,10})","()|(a{1,10})"); + test_regex_ast("a{5,10}","a{5,10}","a{5,10}"); + test_regex_ast("a*","()|(a{1,inf})","()|(a{1,inf})"); + 
test_regex_ast("a+","a{1,inf}","a{1,inf}"); + + // Repetition with capture groups untagged and tagged AST are different + test_regex_ast("(?a){0,10}","()|((?a){1,10})","(<~0>)|((a)<0>{1,10})"); + test_regex_ast("(?a){5,10}","(?a){5,10}","(a)<0>{5,10}"); + test_regex_ast("(?a)*","()|((?a){1,inf})","(<~0>)|((a)<0>{1,inf})"); + test_regex_ast("(?a)+","(?a){1,inf}","(a)<0>{1,inf}"); + + // Capture group with repetition + test_regex_ast("(?a{0,10})","(?()|(a{1,10}))","(()|(a{1,10}))<0>"); } } From 54027ad89071c3d51c02c3907fa4ecc2a3afb7bb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 16 Sep 2024 08:14:32 -0400 Subject: [PATCH 013/323] Return by value in literal getters; Use const instead of const& for literal arguments; Use const& for non-literals; Use auto where possible; Use uint32_t over int for ids; replace begin() and end() with cbegin() and cend() --- src/log_surgeon/Lexer.hpp | 4 ++-- src/log_surgeon/Lexer.tpp | 10 +++++----- src/log_surgeon/finite_automata/RegexDFA.hpp | 2 +- src/log_surgeon/finite_automata/RegexDFA.tpp | 4 ++-- src/log_surgeon/finite_automata/RegexNFA.hpp | 10 +++++----- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 7693fef0..c7528181 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -35,7 +35,7 @@ class LexicalRule { */ auto add_ast(finite_automata::RegexNFA* nfa) const -> void; - [[nodiscard]] auto get_variable_id() const -> uint32_t const& { return m_variable_id; } + [[nodiscard]] auto get_variable_id() const -> uint32_t { return m_variable_id; } [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { return m_regex.get(); @@ -81,7 +81,7 @@ class Lexer { * @param variable_id * @return finite_automata::RegexAST* */ - auto get_rule(uint32_t const& variable_id) -> finite_automata::RegexAST*; + auto get_rule(uint32_t variable_id) -> finite_automata::RegexAST*; /** * Generate DFA for lexer diff --git 
a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 4ba1f91e..c7b60b1d 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -362,9 +362,9 @@ void Lexer::add_rule( } template -auto Lexer::get_rule(uint32_t const& variable_id +auto Lexer::get_rule(uint32_t const variable_id ) -> finite_automata::RegexAST* { - for (auto& rule : m_rules) { + for (auto const& rule : m_rules) { if (rule.get_variable_id() == variable_id) { return rule.get_regex(); } @@ -375,7 +375,7 @@ auto Lexer::get_rule(uint32_t const& variable_id template void Lexer::generate() { finite_automata::RegexNFA nfa; - for (auto& rule : m_rules) { + for (auto const& rule : m_rules) { rule.add_ast(&nfa); } m_dfa = nfa_to_dfa(nfa); @@ -392,7 +392,7 @@ void Lexer::generate() { template void Lexer::generate_reverse() { finite_automata::RegexNFA nfa; - for (auto& rule : m_rules) { + for (auto const& rule : m_rules) { rule.add_ast(&nfa); } nfa.reverse(); @@ -409,7 +409,7 @@ void Lexer::generate_reverse() { template void LexicalRule::add_ast(finite_automata::RegexNFA* nfa) const { - NFAStateType* end_state = nfa->new_state(); + auto* end_state = nfa->new_state(); end_state->set_accepting(true); end_state->set_matching_variable_id(m_variable_id); m_regex->add(nfa, end_state); diff --git a/src/log_surgeon/finite_automata/RegexDFA.hpp b/src/log_surgeon/finite_automata/RegexDFA.hpp index 6d8a3c01..52133d4d 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.hpp +++ b/src/log_surgeon/finite_automata/RegexDFA.hpp @@ -23,7 +23,7 @@ class RegexDFAState { public: using Tree = UnicodeIntervalTree*>; - auto add_matching_variable_id(int const& variable_id) -> void { + auto add_matching_variable_id(uint32_t const variable_id) -> void { m_matching_variable_ids.push_back(variable_id); } diff --git a/src/log_surgeon/finite_automata/RegexDFA.tpp b/src/log_surgeon/finite_automata/RegexDFA.tpp index 3bab70c7..cbf6a4db 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.tpp +++ 
b/src/log_surgeon/finite_automata/RegexDFA.tpp @@ -64,8 +64,8 @@ auto RegexDFA::get_intersect(std::unique_ptr const& dfa_ while (false == unvisited_pairs.empty()) { auto current_pair_it = unvisited_pairs.begin(); if (current_pair_it->is_accepting()) { - auto& matching_variable_ids = current_pair_it->get_first_matching_variable_ids(); - schema_types.insert(matching_variable_ids.begin(), matching_variable_ids.end()); + auto const& matching_variable_ids = current_pair_it->get_first_matching_variable_ids(); + schema_types.insert(matching_variable_ids.cbegin(), matching_variable_ids.cend()); } visited_pairs.insert(*current_pair_it); current_pair_it->get_reachable_pairs(visited_pairs, unvisited_pairs); diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 74fff507..237e9282 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -31,11 +31,11 @@ class RegexNFAState { [[nodiscard]] auto is_accepting() const -> bool const& { return m_accepting; } - auto set_matching_variable_id(int const variable_id) -> void { + auto set_matching_variable_id(uint32_t const variable_id) -> void { m_matching_variable_id = variable_id; } - [[nodiscard]] auto get_matching_variable_id() const -> int const& { + [[nodiscard]] auto get_matching_variable_id() const -> uint32_t { return m_matching_variable_id; } @@ -82,7 +82,7 @@ class RegexNFAState { private: bool m_accepting{false}; - int m_matching_variable_id{0}; + uint32_t m_matching_variable_id{0}; std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; // NOTE: We don't need m_tree_transitions for the `stateType == @@ -223,7 +223,7 @@ void RegexNFA::reverse() { // propagate matching_variable_id from old accepting m_states for (NFAStateType* old_accepting_state : new_end->get_epsilon_transitions()) { - int matching_variable_id = old_accepting_state->get_matching_variable_id(); + auto matching_variable_id = 
old_accepting_state->get_matching_variable_id(); std::stack unvisited_states; std::set visited_states; unvisited_states.push(old_accepting_state); @@ -251,7 +251,7 @@ void RegexNFA::reverse() { for (int32_t i = m_states.size() - 1; i >= 0; --i) { std::unique_ptr& src_state_unique_ptr = m_states[i]; NFAStateType* src_state = src_state_unique_ptr.get(); - int matching_variable_id = src_state->get_matching_variable_id(); + auto matching_variable_id = src_state->get_matching_variable_id(); for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { std::vector byte_transitions = src_state->get_byte_transitions(byte); for (int32_t j = byte_transitions.size() - 1; j >= 0; --j) { From e58274ffcb3743a844a72a68e57ea49600d66660 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 16 Sep 2024 08:59:11 -0400 Subject: [PATCH 014/323] Refactor new_state() --- src/log_surgeon/finite_automata/RegexDFA.hpp | 4 ++-- src/log_surgeon/finite_automata/RegexDFA.tpp | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFA.hpp b/src/log_surgeon/finite_automata/RegexDFA.hpp index 52133d4d..3e0813bd 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.hpp +++ b/src/log_surgeon/finite_automata/RegexDFA.hpp @@ -113,11 +113,11 @@ class RegexDFA { /** * Creates a new DFA state based on a set of NFA states and adds it to * m_states - * @param set + * @param nfa_state_set * @return DFAStateType* */ template - auto new_state(std::set const& set) -> DFAStateType*; + auto new_state(std::set const& nfa_state_set) -> DFAStateType*; auto get_root() const -> DFAStateType const* { return m_states.at(0).get(); } diff --git a/src/log_surgeon/finite_automata/RegexDFA.tpp b/src/log_surgeon/finite_automata/RegexDFA.tpp index cbf6a4db..891d0856 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.tpp +++ b/src/log_surgeon/finite_automata/RegexDFA.tpp @@ -42,10 +42,9 @@ template template auto RegexDFA::new_state(std::set const& nfa_state_set ) -> 
DFAStateType* { - std::unique_ptr ptr = std::make_unique(); - m_states.push_back(std::move(ptr)); + m_states.emplace_back(std::make_unique()); DFAStateType* dfa_state = m_states.back().get(); - for (NFAStateType const* nfa_state : nfa_state_set) { + for (auto const* nfa_state : nfa_state_set) { if (nfa_state->is_accepting()) { dfa_state->add_matching_variable_id(nfa_state->get_matching_variable_id()); } From 13218714a7a05630bbcfe2d09de24e9a2eef538b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 16 Sep 2024 10:38:11 -0400 Subject: [PATCH 015/323] Rename get_first_matching_variable_ids() to get_matching_variable_ids(); Add docstrign to RegexDFAStatePair --- src/log_surgeon/finite_automata/RegexDFA.hpp | 9 ++++++++- src/log_surgeon/finite_automata/RegexDFA.tpp | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFA.hpp b/src/log_surgeon/finite_automata/RegexDFA.hpp index 3e0813bd..57a1e0df 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.hpp +++ b/src/log_surgeon/finite_automata/RegexDFA.hpp @@ -54,6 +54,13 @@ class RegexDFAState { std::conditional_t> m_tree_transitions; }; +/** + * This class represents a pair of regex states. The intended use is for the two states in the pair + * to belong to unique DFAs. A pair is considered accepting if both states are accepting in + * their respective DFA. A different pair is considered reachable if both its states are reachable + * in their respective DFAs from this pair's states. The first state in the pair contains the + * variable types the pair matches. 
+ */ template class RegexDFAStatePair { public: @@ -95,7 +102,7 @@ class RegexDFAStatePair { /** * @return The matching variable ids of the first state of the pair */ - [[nodiscard]] auto get_first_matching_variable_ids() const -> std::vector const& { + [[nodiscard]] auto get_matching_variable_ids() const -> std::vector const& { return m_state1->get_matching_variable_ids(); } diff --git a/src/log_surgeon/finite_automata/RegexDFA.tpp b/src/log_surgeon/finite_automata/RegexDFA.tpp index 891d0856..74fd02f3 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.tpp +++ b/src/log_surgeon/finite_automata/RegexDFA.tpp @@ -63,7 +63,7 @@ auto RegexDFA::get_intersect(std::unique_ptr const& dfa_ while (false == unvisited_pairs.empty()) { auto current_pair_it = unvisited_pairs.begin(); if (current_pair_it->is_accepting()) { - auto const& matching_variable_ids = current_pair_it->get_first_matching_variable_ids(); + auto const& matching_variable_ids = current_pair_it->get_matching_variable_ids(); schema_types.insert(matching_variable_ids.cbegin(), matching_variable_ids.cend()); } visited_pairs.insert(*current_pair_it); From c9047551baa028103d9262ae7ad80126cff589b7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 16 Sep 2024 10:46:55 -0400 Subject: [PATCH 016/323] Remove redundant docstrings --- src/log_surgeon/finite_automata/RegexDFA.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFA.hpp b/src/log_surgeon/finite_automata/RegexDFA.hpp index 57a1e0df..e242b557 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.hpp +++ b/src/log_surgeon/finite_automata/RegexDFA.hpp @@ -92,16 +92,10 @@ class RegexDFAStatePair { std::set>& unvisited_pairs ) const -> void; - /** - * @return Whether both states are accepting - */ [[nodiscard]] auto is_accepting() const -> bool { return m_state1->is_accepting() && m_state2->is_accepting(); } - /** - * @return The matching variable ids of the first state of the pair - */ [[nodiscard]] auto 
get_matching_variable_ids() const -> std::vector const& { return m_state1->get_matching_variable_ids(); } From ffe9a0fd3acfa312242330b2d2a95cf7f3f4d3a2 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 16 Sep 2024 11:17:09 -0400 Subject: [PATCH 017/323] Remove has_capture_groups() --- src/log_surgeon/finite_automata/RegexAST.hpp | 69 -------------------- tests/test-lexer.cpp | 11 ---- 2 files changed, 80 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 2a635003..54b1fdca 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -52,12 +52,6 @@ class RegexAST { */ virtual auto add(RegexNFA* nfa, NFAStateType* end_state) -> void = 0; - /** - * Traverse the AST and check if it contains a capture group - * @return true if the AST contains a capture group, false otherwise - */ - virtual auto has_capture_groups() -> bool = 0; - /** * Serialize the AST into a string * @param with_tags @@ -140,12 +134,6 @@ class RegexASTEmpty : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; - /** - * Return false as RegexASTEmpty is a leaf node that is not a capture group - * @return false - */ - auto has_capture_groups() -> bool override { return false; } - /** * serialize the RegexASTEmpty into a string * @param with_tags @@ -203,12 +191,6 @@ class RegexASTLiteral : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; - /** - * Return false as RegexASTLiteral is a leaf node that is not a capture group - * @return false - */ - auto has_capture_groups() -> bool override { return false; } - /** * serialize the RegexASTLiteral into a string * @param with_tags @@ -275,12 +257,6 @@ class RegexASTInteger : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; - /** - * Return false as RegexASTInteger is a leaf node that is not a capture group - * @return 
false - */ - auto has_capture_groups() -> bool override { return false; } - /** * serialize the RegexASTInteger into a string * @param with_tags @@ -402,12 +378,6 @@ class RegexASTGroup : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; - /** - * Return false as RegexASTGroup is a leaf node that is not a capture group - * @return false - */ - auto has_capture_groups() -> bool override { return false; } - /** * serialize the RegexASTGroup into a string * @param with_tags @@ -511,12 +481,6 @@ class RegexASTOr : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; - /** - * Traverse the AST and check if RegexASTOr contains a capture group - * @return true if the AST contains a capture group, false otherwise - */ - auto has_capture_groups() -> bool override; - /** * serialize the RegexASTOr into a string * @param with_tags @@ -597,12 +561,6 @@ class RegexASTCat : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; - /** - * Traverse the AST and check if it contains a capture group - * @return true if the AST contains a capture group, false otherwise - */ - auto has_capture_groups() -> bool override; - /** * serialize the RegexASTCat into a string * @param with_tags @@ -684,12 +642,6 @@ class RegexASTMultiplication : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; - /** - * Traverse the AST and check if RegexASTMultiplication contains a capture group - * @return true if the AST contains a capture group, false otherwise - */ - auto has_capture_groups() -> bool override; - /** * serialize the RegexASTMultiplication into a string * @param with_tags @@ -772,12 +724,6 @@ class RegexASTCapture : public RegexAST { */ auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; - /** - * Return true as RegexASTCapture is a capture group - * @return true - */ - auto has_capture_groups() -> bool override { return true; } - /** 
* serialize the RegexASTCapture into a string * @param with_tags @@ -889,11 +835,6 @@ void RegexASTOr::add(RegexNFA* nfa, NFAStateType* en m_right->add(nfa, end_state); } -template -auto RegexASTOr::has_capture_groups() -> bool { - return m_left->has_capture_groups() || m_right->has_capture_groups(); -} - template auto RegexASTOr::add_tags(std::vector& all_tags) -> std::vector { auto positive_left_tags = m_left->add_tags(all_tags); @@ -936,11 +877,6 @@ void RegexASTCat::add(RegexNFA* nfa, NFAStateType* e nfa->set_root(saved_root); } -template -auto RegexASTCat::has_capture_groups() -> bool { - return m_left->has_capture_groups() || m_right->has_capture_groups(); -} - template auto RegexASTCat::add_tags(std::vector& all_tags) -> std::vector { auto positive_left_tags = m_left->add_tags(all_tags); @@ -1008,11 +944,6 @@ void RegexASTMultiplication::add( nfa->set_root(saved_root); } -template -auto RegexASTMultiplication::has_capture_groups() -> bool { - return m_operand->has_capture_groups(); -} - template auto RegexASTMultiplication::add_tags(std::vector& all_tags ) -> std::vector { diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 0ea2b7d0..2031ed58 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -103,17 +103,6 @@ TEST_CASE("Test the Schema class", "[Schema]") { REQUIRE('9' == regex_ast_group_ast->get_ranges()[0].second); } - SECTION("Test has_capture_groups()") { - log_surgeon::Schema schema; - schema.add_variable("number", "123", -1); - schema.add_variable("capture", "user_id=(?[0-9]+)", -1); - auto const schema_ast = schema.release_schema_ast_ptr(); - auto& number_var_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); - REQUIRE(false == number_var_ast.m_regex_ptr->has_capture_groups()); - auto& capture_var_ast = dynamic_cast(*schema_ast->m_schema_vars[1]); - REQUIRE(capture_var_ast.m_regex_ptr->has_capture_groups()); - } - SECTION("Test AST with tags") { test_regex_ast( "Z|(A(?((?(a)|(b))|(?(c)|(d))))B(?\\d+)C)", From 
913ed1a35fb0e35358665ce5bf670525454048fe Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 16 Sep 2024 13:48:57 -0400 Subject: [PATCH 018/323] Const and auto changes --- src/log_surgeon/finite_automata/RegexDFA.tpp | 2 +- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFA.tpp b/src/log_surgeon/finite_automata/RegexDFA.tpp index 74fd02f3..458a5565 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.tpp +++ b/src/log_surgeon/finite_automata/RegexDFA.tpp @@ -43,7 +43,7 @@ template auto RegexDFA::new_state(std::set const& nfa_state_set ) -> DFAStateType* { m_states.emplace_back(std::make_unique()); - DFAStateType* dfa_state = m_states.back().get(); + auto* dfa_state = m_states.back().get(); for (auto const* nfa_state : nfa_state_set) { if (nfa_state->is_accepting()) { dfa_state->add_matching_variable_id(nfa_state->get_matching_variable_id()); diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 237e9282..63b67b0f 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -223,7 +223,7 @@ void RegexNFA::reverse() { // propagate matching_variable_id from old accepting m_states for (NFAStateType* old_accepting_state : new_end->get_epsilon_transitions()) { - auto matching_variable_id = old_accepting_state->get_matching_variable_id(); + auto const matching_variable_id = old_accepting_state->get_matching_variable_id(); std::stack unvisited_states; std::set visited_states; unvisited_states.push(old_accepting_state); @@ -251,7 +251,7 @@ void RegexNFA::reverse() { for (int32_t i = m_states.size() - 1; i >= 0; --i) { std::unique_ptr& src_state_unique_ptr = m_states[i]; NFAStateType* src_state = src_state_unique_ptr.get(); - auto matching_variable_id = src_state->get_matching_variable_id(); + auto const matching_variable_id = 
src_state->get_matching_variable_id(); for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { std::vector byte_transitions = src_state->get_byte_transitions(byte); for (int32_t j = byte_transitions.size() - 1; j >= 0; --j) { From 795add3a97aac3e7de1bb0437603998bc6c50298 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 16 Sep 2024 14:44:45 -0400 Subject: [PATCH 019/323] Add tagged-nfa --- src/log_surgeon/Lexer.hpp | 4 +- src/log_surgeon/Lexer.tpp | 18 +- src/log_surgeon/LogParser.cpp | 4 - src/log_surgeon/finite_automata/RegexAST.hpp | 43 ++-- src/log_surgeon/finite_automata/RegexNFA.hpp | 43 ++++ tests/CMakeLists.txt | 2 +- tests/test-NFA.cpp | 195 +++++++++++++++++++ 7 files changed, 288 insertions(+), 21 deletions(-) create mode 100644 tests/test-NFA.cpp diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 7693fef0..533229e4 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -33,7 +33,7 @@ class LexicalRule { * Adds AST representing the lexical rule to the NFA * @param nfa */ - auto add_ast(finite_automata::RegexNFA* nfa) const -> void; + auto add_ast(finite_automata::RegexNFA* nfa) -> void; [[nodiscard]] auto get_variable_id() const -> uint32_t const& { return m_variable_id; } @@ -92,7 +92,7 @@ class Lexer { * Generate DFA for a reverse lexer matching the reverse of the words in the * original language */ - auto generate_reverse() -> void; + // auto generate_reverse() -> void; /** * Reset the lexer to start a new lexing (reset buffers, reset vars tracking diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 4ba1f91e..daa465f2 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -378,6 +378,7 @@ void Lexer::generate() { for (auto& rule : m_rules) { rule.add_ast(&nfa); } + // TODO: DFA ignores tags. 
E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" m_dfa = nfa_to_dfa(nfa); DFAStateType const* state = m_dfa->get_root(); for (uint32_t i = 0; i < cSizeOfByte; i++) { @@ -389,6 +390,8 @@ void Lexer::generate() { } } +/* +//TODO: needs to handle reversing tagged NFA template void Lexer::generate_reverse() { finite_automata::RegexNFA nfa; @@ -406,13 +409,16 @@ void Lexer::generate_reverse() { } } } +*/ template -void LexicalRule::add_ast(finite_automata::RegexNFA* nfa) const { +void LexicalRule::add_ast(finite_automata::RegexNFA* nfa) { + std::vector all_tags; + m_regex->add_tags(all_tags); NFAStateType* end_state = nfa->new_state(); end_state->set_accepting(true); end_state->set_matching_variable_id(m_variable_id); - m_regex->add(nfa, end_state); + m_regex->add_with_negative_tags(nfa, end_state); } template @@ -428,6 +434,14 @@ auto Lexer::epsilon_closure(NFAStateType const* stat for (NFAStateType* const u : t->get_epsilon_transitions()) { stack.push(u); } + + // TODO: currently treat tagged transitions as epsilon transitions + for (auto const& positive_tagged_transition : t->get_positive_tagged_transitions()) { + stack.push(positive_tagged_transition.state); + } + for (auto const& negative_tagged_transition : t->get_negative_tagged_transitions()) { + stack.push(negative_tagged_transition.state); + } } } return closure_set; diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index aa16cc06..7187c39c 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -89,10 +89,6 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { // prevent timestamps from going into the dictionary continue; } - // currently capture groups are not yet supported - if ("capture" == rule->m_name) { - continue; - } // transform '.' 
from any-character into any non-delimiter character rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters); diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 54b1fdca..e609873b 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -71,6 +71,23 @@ class RegexAST { return serialized_string; } + /** + * Handles the addition of an intermediate state with negative transitions if needed. + * @param nfa + * @param end_state + */ + void add_with_negative_tags(RegexNFA* nfa, NFAStateType* end_state) { + // Handle negative tags as: + // root --(regex transitions)--> intermediate_state --(negative tags)--> end_state + if (false == m_negative_tags.empty()) { + NFAStateType* intermediate_state = nfa->new_state(); + add(nfa, intermediate_state); + intermediate_state->add_negative_tagged_transition(m_negative_tags, end_state); + } else { + add(nfa, end_state); + } + } + /** * Traverse the AST and add positive and negative tags. 
*/ @@ -831,8 +848,8 @@ RegexASTOr::RegexASTOr( template void RegexASTOr::add(RegexNFA* nfa, NFAStateType* end_state) { - m_left->add(nfa, end_state); - m_right->add(nfa, end_state); + m_left->add_with_negative_tags(nfa, end_state); + m_right->add_with_negative_tags(nfa, end_state); } template @@ -871,9 +888,9 @@ template void RegexASTCat::add(RegexNFA* nfa, NFAStateType* end_state) { NFAStateType* saved_root = nfa->get_root(); NFAStateType* intermediate_state = nfa->new_state(); - m_left->add(nfa, intermediate_state); + m_left->add_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); - m_right->add(nfa, end_state); + m_right->add_with_negative_tags(nfa, end_state); nfa->set_root(saved_root); } @@ -919,27 +936,27 @@ void RegexASTMultiplication::add( } else { for (uint32_t i = 1; i < this->m_min; i++) { NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add(nfa, intermediate_state); + m_operand->add_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } - m_operand->add(nfa, end_state); + m_operand->add_with_negative_tags(nfa, end_state); } if (this->is_infinite()) { nfa->set_root(end_state); - m_operand->add(nfa, end_state); + m_operand->add_with_negative_tags(nfa, end_state); } else if (this->m_max > this->m_min) { if (this->m_min != 0) { NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add(nfa, intermediate_state); + m_operand->add_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } for (uint32_t i = this->m_min + 1; i < this->m_max; ++i) { - m_operand->add(nfa, end_state); + m_operand->add_with_negative_tags(nfa, end_state); NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add(nfa, intermediate_state); + m_operand->add_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } - m_operand->add(nfa, end_state); + m_operand->add_with_negative_tags(nfa, end_state); } nfa->set_root(saved_root); } @@ -968,7 +985,9 
@@ auto RegexASTMultiplication::serialize(bool const with_tags) -> st template void RegexASTCapture::add(RegexNFA* nfa, NFAStateType* end_state) { - m_group_regex_ast->add(nfa, end_state); + NFAStateType* intermediate_state = nfa->new_state(); + m_group_regex_ast->add_with_negative_tags(nfa, intermediate_state); + intermediate_state->add_positive_tagged_transition(m_tag, end_state); } template diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 6c1e8953..9b110591 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -22,6 +22,21 @@ enum class RegexNFAStateType : uint8_t { UTF8 }; +template +class RegexNFAState; + +template +struct PositiveTaggedTransition { + uint32_t tag{}; + RegexNFAState const* state{}; +}; + +template +struct NegativeTaggedTransition { + std::vector tags; + RegexNFAState const* state{}; +}; + template class RegexNFAState { public: @@ -39,6 +54,32 @@ class RegexNFAState { return m_matching_variable_id; } + auto + add_positive_tagged_transition(uint32_t const tag, RegexNFAState const* dest_state) -> void { + m_positive_tagged_transitions.push_back( + PositiveTaggedTransition(tag, dest_state) + ); + } + + [[nodiscard]] auto get_positive_tagged_transitions( + ) const -> std::vector> const& { + return m_positive_tagged_transitions; + } + + auto add_negative_tagged_transition( + std::vector const& tags, + RegexNFAState const* dest_state + ) -> void { + m_negative_tagged_transitions.push_back( + NegativeTaggedTransition(tags, dest_state) + ); + } + + [[nodiscard]] auto get_negative_tagged_transitions( + ) const -> std::vector> const& { + return m_negative_tagged_transitions; + } + auto set_epsilon_transitions(std::vector& epsilon_transitions) -> void { m_epsilon_transitions = epsilon_transitions; } @@ -83,6 +124,8 @@ class RegexNFAState { private: bool m_accepting{false}; int m_matching_variable_id{0}; + std::vector> 
m_positive_tagged_transitions; + std::vector> m_negative_tagged_transitions; std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; // NOTE: We don't need m_tree_transitions for the `stateType == diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 66ff605d..8c45b07b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -17,7 +17,7 @@ set( ../src/log_surgeon/Token.hpp ) -set(SOURCES_TESTS test-lexer.cpp) +set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp) add_executable(unit-test ${SOURCES_LOG_SURGEON} ${SOURCES_TESTS}) target_link_libraries(unit-test PRIVATE Catch2::Catch2WithMain log_surgeon::log_surgeon) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp new file mode 100644 index 00000000..be6adf18 --- /dev/null +++ b/tests/test-NFA.cpp @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +using log_surgeon::cSizeOfByte; +using log_surgeon::finite_automata::RegexNFAByteState; +using log_surgeon::Schema; +using log_surgeon::SchemaVarAST; +using std::string; + +using ByteLexicalRule = log_surgeon::LexicalRule; +using ByteNFA = log_surgeon::finite_automata::RegexNFA; +using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat; +using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture; +using RegexASTGroupByte = log_surgeon::finite_automata::RegexASTGroup; +using RegexASTLiteralByte = log_surgeon::finite_automata::RegexASTLiteral; +using RegexASTMultiplicationByte + = log_surgeon::finite_automata::RegexASTMultiplication; +using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr; + +TEST_CASE("Test NFA", "[NFA]") { + Schema schema; + string const var_name = "capture"; + schema.add_variable( + var_name, + "Z|(A(?((?(a)|(b))|(?(c)|(d))))B(?\\d+)C)", + -1 + ); + auto const schema_ast = schema.release_schema_ast_ptr(); + auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); + 
ByteNFA nfa; + ByteLexicalRule rule(0, std::move(capture_rule_ast.m_regex_ptr)); + rule.add_ast(&nfa); + + // Add helper for updating state_queue and visited_states + std::queue state_queue; + std::unordered_set visited_states; + auto add_to_queue = [&](auto const* dest_state) { + if (visited_states.insert(dest_state).second) { + state_queue.push(dest_state); + } + }; + + // Assigne state ids + std::map state_ids; + auto const* root = nfa.get_root(); + state_queue.push(root); + visited_states.insert(root); + while (false == state_queue.empty()) { + auto const* current_state = state_queue.front(); + state_queue.pop(); + state_ids.insert({current_state, state_ids.size()}); + for (uint32_t idx = 0; idx < cSizeOfByte; idx++) { + for (auto const* dest_state : current_state->get_byte_transitions(idx)) { + add_to_queue(dest_state); + } + } + for (auto const* dest_state : current_state->get_epsilon_transitions()) { + add_to_queue(dest_state); + } + for (auto const& [tag, dest_state] : current_state->get_positive_tagged_transitions()) { + add_to_queue(dest_state); + } + for (auto const& [tags, dest_state] : current_state->get_negative_tagged_transitions()) { + add_to_queue(dest_state); + } + } + + // Serialize NFA + std::string serialized_nfa; + visited_states.clear(); + state_queue.push(root); + visited_states.insert(root); + while (false == state_queue.empty()) { + auto const* current_state = state_queue.front(); + state_queue.pop(); + serialized_nfa += std::to_string(state_ids.find(current_state)->second) += ":"; + if (current_state->is_accepting()) { + serialized_nfa += "accepting_tag=" + + std::to_string(current_state->get_matching_variable_id()) + ","; + } + serialized_nfa += "byte_transitions={"; + for (uint32_t idx = 0; idx < cSizeOfByte; idx++) { + for (auto const* dest_state : current_state->get_byte_transitions(idx)) { + serialized_nfa += std::string(1, static_cast(idx)) + "-->" + + std::to_string(state_ids.find(dest_state)->second) + ","; + 
add_to_queue(dest_state); + } + } + serialized_nfa += "},epsilon_transitions={"; + for (auto const* dest_state : current_state->get_epsilon_transitions()) { + serialized_nfa += std::to_string(state_ids.find(dest_state)->second) + ","; + add_to_queue(dest_state); + } + serialized_nfa += "},positive_tagged_transitions={"; + for (auto const& [tag, dest_state] : current_state->get_positive_tagged_transitions()) { + serialized_nfa += std::to_string(state_ids.find(dest_state)->second); + serialized_nfa += "[" + std::to_string(tag) + "],"; + add_to_queue(dest_state); + } + serialized_nfa += "},negative_tagged_transitions={"; + for (auto const& [tags, dest_state] : current_state->get_negative_tagged_transitions()) { + serialized_nfa += std::to_string(state_ids.find(dest_state)->second); + serialized_nfa += "["; + for (auto const& tag : tags) { + serialized_nfa += std::to_string(tag) + ","; + } + serialized_nfa += "],"; + add_to_queue(dest_state); + } + serialized_nfa += "}"; + serialized_nfa += "\n"; + } + + // Compare against expected output + std::string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2,}," + "epsilon_transitions={}," + "positive_tagged_transitions={}," + "negative_tagged_transitions={}\n"; + expected_serialized_nfa += "1:byte_transitions={a-->3,b-->3,c-->4,d-->4,}," + "epsilon_transitions={}," + "positive_tagged_transitions={}," + "negative_tagged_transitions={}\n"; + expected_serialized_nfa += "2:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_transitions={}," + "negative_tagged_transitions={5[0,1,2,3,],}\n"; + expected_serialized_nfa += "3:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_transitions={6[1],}," + "negative_tagged_transitions={}\n"; + expected_serialized_nfa += "4:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_transitions={7[2],}," + "negative_tagged_transitions={}\n"; + expected_serialized_nfa += "5:accepting_tag=0,byte_transitions={}," + "epsilon_transitions={}," + 
"positive_tagged_transitions={}," + "negative_tagged_transitions={}\n"; + expected_serialized_nfa += "6:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_transitions={}," + "negative_tagged_transitions={8[2,],}\n"; + expected_serialized_nfa += "7:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_transitions={}," + "negative_tagged_transitions={8[1,],}\n"; + expected_serialized_nfa += "8:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_transitions={9[0],}," + "negative_tagged_transitions={}\n"; + expected_serialized_nfa += "9:byte_transitions={B-->10,}," + "epsilon_transitions={}," + "positive_tagged_transitions={}," + "negative_tagged_transitions={}\n"; + expected_serialized_nfa += "10:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" + "11,7-->11,8-->11,9-->11,}," + "epsilon_transitions={}," + "positive_tagged_transitions={}," + "negative_tagged_transitions={}\n"; + expected_serialized_nfa += "11:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" + "11,7-->11,8-->11,9-->11,}," + "epsilon_transitions={}," + "positive_tagged_transitions={12[3],}," + "negative_tagged_transitions={}\n"; + expected_serialized_nfa += "12:byte_transitions={C-->5,}," + "epsilon_transitions={}," + "positive_tagged_transitions={}," + "negative_tagged_transitions={}\n"; + + // Compare expected and actual line-by-line + std::stringstream ss_actual(serialized_nfa); + std::stringstream ss_expected(expected_serialized_nfa); + std::string actual_line; + std::string expected_line; + while (std::getline(ss_actual, actual_line) && std::getline(ss_expected, expected_line)) { + REQUIRE(actual_line == expected_line); + } + std::getline(ss_actual, actual_line); + REQUIRE(actual_line.empty()); + std::getline(ss_expected, expected_line); + REQUIRE(expected_line.empty()); +} From 6e45657dfee324675dc4d887a56e1f29398daef4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 17 Sep 2024 11:35:02 -0400 Subject: [PATCH 
020/323] Clarify that the add functions are adding to the nfa; Make add to nfa functions const --- src/log_surgeon/Lexer.hpp | 10 ++- src/log_surgeon/Lexer.tpp | 9 +-- src/log_surgeon/finite_automata/RegexAST.hpp | 82 +++++++++++--------- tests/test-NFA.cpp | 2 +- 4 files changed, 59 insertions(+), 44 deletions(-) diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 533229e4..ae883aad 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -29,11 +29,19 @@ class LexicalRule { : m_variable_id(variable_id), m_regex(std::move(regex)) {} + /** + * Adds positive and negative tags needed by capture groups to the AST nodes. + */ + auto add_tags() -> void { + std::vector all_tags; + m_regex->add_tags(all_tags); + } + /** * Adds AST representing the lexical rule to the NFA * @param nfa */ - auto add_ast(finite_automata::RegexNFA* nfa) -> void; + auto add_to_nfa(finite_automata::RegexNFA* nfa) const -> void; [[nodiscard]] auto get_variable_id() const -> uint32_t const& { return m_variable_id; } diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index daa465f2..0f8ccfc3 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -376,7 +376,8 @@ template void Lexer::generate() { finite_automata::RegexNFA nfa; for (auto& rule : m_rules) { - rule.add_ast(&nfa); + rule.add_tags(); + rule.add_to_nfa(&nfa); } // TODO: DFA ignores tags. 
E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" m_dfa = nfa_to_dfa(nfa); @@ -412,13 +413,11 @@ void Lexer::generate_reverse() { */ template -void LexicalRule::add_ast(finite_automata::RegexNFA* nfa) { - std::vector all_tags; - m_regex->add_tags(all_tags); +void LexicalRule::add_to_nfa(finite_automata::RegexNFA* nfa) const { NFAStateType* end_state = nfa->new_state(); end_state->set_accepting(true); end_state->set_matching_variable_id(m_variable_id); - m_regex->add_with_negative_tags(nfa, end_state); + m_regex->add_to_nfa_with_negative_tags(nfa, end_state); } template diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index e609873b..53fd039d 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -50,7 +50,7 @@ class RegexAST { * @param nfa * @param end_state */ - virtual auto add(RegexNFA* nfa, NFAStateType* end_state) -> void = 0; + virtual auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void = 0; /** * Serialize the AST into a string @@ -76,15 +76,15 @@ class RegexAST { * @param nfa * @param end_state */ - void add_with_negative_tags(RegexNFA* nfa, NFAStateType* end_state) { + void add_to_nfa_with_negative_tags(RegexNFA* nfa, NFAStateType* end_state) { // Handle negative tags as: // root --(regex transitions)--> intermediate_state --(negative tags)--> end_state if (false == m_negative_tags.empty()) { NFAStateType* intermediate_state = nfa->new_state(); - add(nfa, intermediate_state); + add_to_nfa(nfa, intermediate_state); intermediate_state->add_negative_tagged_transition(m_negative_tags, end_state); } else { - add(nfa, end_state); + add_to_nfa(nfa, end_state); } } @@ -149,7 +149,7 @@ class RegexASTEmpty : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; /** * serialize the 
RegexASTEmpty into a string @@ -206,7 +206,7 @@ class RegexASTLiteral : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; /** * serialize the RegexASTLiteral into a string @@ -272,7 +272,7 @@ class RegexASTInteger : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; /** * serialize the RegexASTInteger into a string @@ -393,7 +393,7 @@ class RegexASTGroup : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; /** * serialize the RegexASTGroup into a string @@ -496,7 +496,7 @@ class RegexASTOr : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; /** * serialize the RegexASTOr into a string @@ -576,7 +576,7 @@ class RegexASTCat : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; /** * serialize the RegexASTCat into a string @@ -657,7 +657,7 @@ class RegexASTMultiplication : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; /** * serialize the RegexASTMultiplication into a string @@ -739,7 +739,7 @@ class RegexASTCapture : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto 
add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; /** * serialize the RegexASTCapture into a string @@ -772,10 +772,10 @@ template RegexASTEmpty::RegexASTEmpty() = default; template -void RegexASTEmpty::add( +void RegexASTEmpty::add_to_nfa( [[maybe_unused]] RegexNFA* nfa, [[maybe_unused]] NFAStateType* end_state -) { +) const { // DO NOTHING } @@ -792,7 +792,8 @@ template RegexASTLiteral::RegexASTLiteral(uint32_t character) : m_character(character) {} template -void RegexASTLiteral::add(RegexNFA* nfa, NFAStateType* end_state) { +void RegexASTLiteral::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) + const { nfa->add_root_interval(Interval(m_character, m_character), end_state); } @@ -819,10 +820,10 @@ RegexASTInteger::RegexASTInteger(RegexASTInteger* left, uint32_t d } template -void RegexASTInteger::add( +void RegexASTInteger::add_to_nfa( [[maybe_unused]] RegexNFA* nfa, [[maybe_unused]] NFAStateType* end_state -) { +) const { throw std::runtime_error("Unsupported"); } @@ -847,9 +848,10 @@ RegexASTOr::RegexASTOr( m_right(std::move(right)) {} template -void RegexASTOr::add(RegexNFA* nfa, NFAStateType* end_state) { - m_left->add_with_negative_tags(nfa, end_state); - m_right->add_with_negative_tags(nfa, end_state); +void RegexASTOr::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) + const { + m_left->add_to_nfa_with_negative_tags(nfa, end_state); + m_right->add_to_nfa_with_negative_tags(nfa, end_state); } template @@ -885,12 +887,13 @@ RegexASTCat::RegexASTCat( m_right(std::move(right)) {} template -void RegexASTCat::add(RegexNFA* nfa, NFAStateType* end_state) { +void RegexASTCat::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) + const { NFAStateType* saved_root = nfa->get_root(); NFAStateType* intermediate_state = nfa->new_state(); - m_left->add_with_negative_tags(nfa, intermediate_state); + m_left->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); - m_right->add_with_negative_tags(nfa, 
end_state); + m_right->add_to_nfa_with_negative_tags(nfa, end_state); nfa->set_root(saved_root); } @@ -926,37 +929,37 @@ RegexASTMultiplication::RegexASTMultiplication( m_max(max) {} template -void RegexASTMultiplication::add( +void RegexASTMultiplication::add_to_nfa( RegexNFA* nfa, NFAStateType* end_state -) { +) const { NFAStateType* saved_root = nfa->get_root(); if (this->m_min == 0) { nfa->get_root()->add_epsilon_transition(end_state); } else { for (uint32_t i = 1; i < this->m_min; i++) { NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add_with_negative_tags(nfa, intermediate_state); + m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } - m_operand->add_with_negative_tags(nfa, end_state); + m_operand->add_to_nfa_with_negative_tags(nfa, end_state); } if (this->is_infinite()) { nfa->set_root(end_state); - m_operand->add_with_negative_tags(nfa, end_state); + m_operand->add_to_nfa_with_negative_tags(nfa, end_state); } else if (this->m_max > this->m_min) { if (this->m_min != 0) { NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add_with_negative_tags(nfa, intermediate_state); + m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } for (uint32_t i = this->m_min + 1; i < this->m_max; ++i) { - m_operand->add_with_negative_tags(nfa, end_state); + m_operand->add_to_nfa_with_negative_tags(nfa, end_state); NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add_with_negative_tags(nfa, intermediate_state); + m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } - m_operand->add_with_negative_tags(nfa, end_state); + m_operand->add_to_nfa_with_negative_tags(nfa, end_state); } nfa->set_root(saved_root); } @@ -984,9 +987,10 @@ auto RegexASTMultiplication::serialize(bool const with_tags) -> st } template -void RegexASTCapture::add(RegexNFA* nfa, NFAStateType* end_state) { +void 
RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) + const { NFAStateType* intermediate_state = nfa->new_state(); - m_group_regex_ast->add_with_negative_tags(nfa, intermediate_state); + m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, intermediate_state); intermediate_state->add_positive_tagged_transition(m_tag, end_state); } @@ -1128,9 +1132,13 @@ auto RegexASTGroup::complement(std::vector const& ranges } template -void RegexASTGroup::add(RegexNFA* nfa, NFAStateType* end_state) { - std::sort(this->m_ranges.begin(), this->m_ranges.end()); - std::vector merged_ranges = RegexASTGroup::merge(this->m_ranges); +void RegexASTGroup::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) + const { + // TODO: there should be a better way to do this with a set and keep m_ranges sorted, but we + // have to consider removing overlap + taking the compliment. + std::vector merged_ranges = m_ranges; + std::sort(merged_ranges.begin(), merged_ranges.end()); + merged_ranges = merge(merged_ranges); if (this->m_negate) { merged_ranges = complement(merged_ranges); } diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index be6adf18..5f3f1e80 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -42,7 +42,7 @@ TEST_CASE("Test NFA", "[NFA]") { auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); ByteNFA nfa; ByteLexicalRule rule(0, std::move(capture_rule_ast.m_regex_ptr)); - rule.add_ast(&nfa); + rule.add_to_nfa(&nfa); // Add helper for updating state_queue and visited_states std::queue state_queue; From 7aa8a9238f80c5c5010f443446a99095c7a77c2f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 17 Sep 2024 11:56:18 -0400 Subject: [PATCH 021/323] Changed AST add functions to indicate the AST are being added to the NFA; Made add to nfa functions const --- src/log_surgeon/Lexer.hpp | 2 +- src/log_surgeon/Lexer.tpp | 6 +- src/log_surgeon/finite_automata/RegexAST.hpp | 70 +++++++++++--------- 3 files changed, 43 insertions(+), 35 deletions(-) 
diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index c7528181..2b580a96 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -33,7 +33,7 @@ class LexicalRule { * Adds AST representing the lexical rule to the NFA * @param nfa */ - auto add_ast(finite_automata::RegexNFA* nfa) const -> void; + auto add_to_nfa(finite_automata::RegexNFA* nfa) const -> void; [[nodiscard]] auto get_variable_id() const -> uint32_t { return m_variable_id; } diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index c7b60b1d..cd83ae01 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -376,7 +376,7 @@ template void Lexer::generate() { finite_automata::RegexNFA nfa; for (auto const& rule : m_rules) { - rule.add_ast(&nfa); + rule.add_to_nfa(&nfa); } m_dfa = nfa_to_dfa(nfa); DFAStateType const* state = m_dfa->get_root(); @@ -408,11 +408,11 @@ void Lexer::generate_reverse() { } template -void LexicalRule::add_ast(finite_automata::RegexNFA* nfa) const { +void LexicalRule::add_to_nfa(finite_automata::RegexNFA* nfa) const { auto* end_state = nfa->new_state(); end_state->set_accepting(true); end_state->set_matching_variable_id(m_variable_id); - m_regex->add(nfa, end_state); + m_regex->add_to_nfa(nfa, end_state); } template diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 7f4db644..8c124cb0 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -50,7 +50,7 @@ class RegexAST { * @param nfa * @param end_state */ - virtual auto add(RegexNFA* nfa, NFAStateType* end_state) -> void = 0; + virtual auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void = 0; protected: RegexAST(RegexAST const& rhs) = default; @@ -98,7 +98,7 @@ class RegexASTLiteral : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* 
nfa, NFAStateType* end_state) const -> void override; [[nodiscard]] auto get_character() const -> uint32_t const& { return m_character; } @@ -149,7 +149,7 @@ class RegexASTInteger : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; [[nodiscard]] auto get_digits() const -> std::vector const& { return m_digits; } @@ -255,7 +255,7 @@ class RegexASTGroup : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; auto add_range(uint32_t min, uint32_t max) -> void { m_ranges.emplace_back(min, max); } @@ -343,7 +343,7 @@ class RegexASTOr : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; private: std::unique_ptr> m_left; @@ -403,7 +403,7 @@ class RegexASTCat : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; [[nodiscard]] auto get_left() const -> std::unique_ptr> const& { return m_left; @@ -472,7 +472,7 @@ class RegexASTMultiplication : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; [[nodiscard]] auto is_infinite() const -> bool { return this->m_max == 0; } @@ -542,7 +542,7 @@ class RegexASTCapture : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; [[nodiscard]] auto 
get_group_name() const -> std::string const& { return m_group_name; } @@ -560,7 +560,8 @@ template RegexASTLiteral::RegexASTLiteral(uint32_t character) : m_character(character) {} template -void RegexASTLiteral::add(RegexNFA* nfa, NFAStateType* end_state) { +void RegexASTLiteral::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) + const { nfa->add_root_interval(Interval(m_character, m_character), end_state); } @@ -578,10 +579,10 @@ RegexASTInteger::RegexASTInteger(RegexASTInteger* left, uint32_t d } template -void RegexASTInteger::add( +void RegexASTInteger::add_to_nfa( [[maybe_unused]] RegexNFA* nfa, [[maybe_unused]] NFAStateType* end_state -) { +) const { throw std::runtime_error("Unsupported"); } @@ -594,9 +595,10 @@ RegexASTOr::RegexASTOr( m_right(std::move(right)) {} template -void RegexASTOr::add(RegexNFA* nfa, NFAStateType* end_state) { - m_left->add(nfa, end_state); - m_right->add(nfa, end_state); +void RegexASTOr::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) + const { + m_left->add_to_nfa(nfa, end_state); + m_right->add_to_nfa(nfa, end_state); } template @@ -608,12 +610,13 @@ RegexASTCat::RegexASTCat( m_right(std::move(right)) {} template -void RegexASTCat::add(RegexNFA* nfa, NFAStateType* end_state) { +void RegexASTCat::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) + const { NFAStateType* saved_root = nfa->get_root(); NFAStateType* intermediate_state = nfa->new_state(); - m_left->add(nfa, intermediate_state); + m_left->add_to_nfa(nfa, intermediate_state); nfa->set_root(intermediate_state); - m_right->add(nfa, end_state); + m_right->add_to_nfa(nfa, end_state); nfa->set_root(saved_root); } @@ -628,44 +631,45 @@ RegexASTMultiplication::RegexASTMultiplication( m_max(max) {} template -void RegexASTMultiplication::add( +void RegexASTMultiplication::add_to_nfa( RegexNFA* nfa, NFAStateType* end_state -) { +) const { NFAStateType* saved_root = nfa->get_root(); if (this->m_min == 0) { nfa->get_root()->add_epsilon_transition(end_state); } else { for 
(uint32_t i = 1; i < this->m_min; i++) { NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add(nfa, intermediate_state); + m_operand->add_to_nfa(nfa, intermediate_state); nfa->set_root(intermediate_state); } - m_operand->add(nfa, end_state); + m_operand->add_to_nfa(nfa, end_state); } if (this->is_infinite()) { nfa->set_root(end_state); - m_operand->add(nfa, end_state); + m_operand->add_to_nfa(nfa, end_state); } else if (this->m_max > this->m_min) { if (this->m_min != 0) { NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add(nfa, intermediate_state); + m_operand->add_to_nfa(nfa, intermediate_state); nfa->set_root(intermediate_state); } for (uint32_t i = this->m_min + 1; i < this->m_max; ++i) { - m_operand->add(nfa, end_state); + m_operand->add_to_nfa(nfa, end_state); NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add(nfa, intermediate_state); + m_operand->add_to_nfa(nfa, intermediate_state); nfa->set_root(intermediate_state); } - m_operand->add(nfa, end_state); + m_operand->add_to_nfa(nfa, end_state); } nfa->set_root(saved_root); } template -void RegexASTCapture::add(RegexNFA* nfa, NFAStateType* end_state) { - m_group_regex_ast->add(nfa, end_state); +void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) + const { + m_group_regex_ast->add_to_nfa(nfa, end_state); } template @@ -781,9 +785,13 @@ auto RegexASTGroup::complement(std::vector const& ranges } template -void RegexASTGroup::add(RegexNFA* nfa, NFAStateType* end_state) { - std::sort(this->m_ranges.begin(), this->m_ranges.end()); - std::vector merged_ranges = RegexASTGroup::merge(this->m_ranges); +void RegexASTGroup::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) + const { + // TODO: there should be a better way to do this with a set and keep m_ranges sorted, but we + // have to consider removing overlap + taking the compliment. 
+ std::vector merged_ranges = m_ranges; + std::sort(merged_ranges.begin(), merged_ranges.end()); + merged_ranges = merge(merged_ranges); if (this->m_negate) { merged_ranges = complement(merged_ranges); } From bedad75cd2ad8514bd74847f7f0b3755689f605f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 17 Sep 2024 12:59:06 -0400 Subject: [PATCH 022/323] Change add in RegexASTEmpty to add_to_nfa --- src/log_surgeon/finite_automata/RegexAST.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 2bd62b3b..e4fa5e9d 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -132,7 +132,7 @@ class RegexASTEmpty : public RegexAST { * @param nfa * @param end_state */ - auto add(RegexNFA* nfa, NFAStateType* end_state) -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; /** * serialize the RegexASTEmpty into a string @@ -755,10 +755,10 @@ template RegexASTEmpty::RegexASTEmpty() = default; template -void RegexASTEmpty::add( +void RegexASTEmpty::add_to_nfa( [[maybe_unused]] RegexNFA* nfa, [[maybe_unused]] NFAStateType* end_state -) { +) const { // DO NOTHING } From cd54e6458cd69040b03d595dbc9cfc3cfbdbe088 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 17 Sep 2024 14:08:01 -0400 Subject: [PATCH 023/323] Fix and refactor NFA unit-test --- tests/test-NFA.cpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 5f3f1e80..74470c4b 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -42,6 +42,7 @@ TEST_CASE("Test NFA", "[NFA]") { auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); ByteNFA nfa; ByteLexicalRule rule(0, std::move(capture_rule_ast.m_regex_ptr)); + rule.add_tags(); rule.add_to_nfa(&nfa); // Add helper for updating state_queue and visited_states 
@@ -181,15 +182,5 @@ TEST_CASE("Test NFA", "[NFA]") { "negative_tagged_transitions={}\n"; // Compare expected and actual line-by-line - std::stringstream ss_actual(serialized_nfa); - std::stringstream ss_expected(expected_serialized_nfa); - std::string actual_line; - std::string expected_line; - while (std::getline(ss_actual, actual_line) && std::getline(ss_expected, expected_line)) { - REQUIRE(actual_line == expected_line); - } - std::getline(ss_actual, actual_line); - REQUIRE(actual_line.empty()); - std::getline(ss_expected, expected_line); - REQUIRE(expected_line.empty()); + REQUIRE(serialized_nfa == expected_serialized_nfa); } From 38ab6fe0b417d179db3b114747536829f31b4fae Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 05:16:09 -0400 Subject: [PATCH 024/323] Fix compiler error. --- src/log_surgeon/LALR1Parser.tpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/LALR1Parser.tpp b/src/log_surgeon/LALR1Parser.tpp index b5de8cab..e6b25c72 100644 --- a/src/log_surgeon/LALR1Parser.tpp +++ b/src/log_surgeon/LALR1Parser.tpp @@ -682,9 +682,9 @@ auto LALR1Parser::get_next_symbol() -> Token { template auto LALR1Parser::parse_advance(Token& next_token, bool* accept) -> bool { - for (int const& type : *(next_token.m_type_ids_ptr)) { + for (auto const type : *next_token.m_type_ids_ptr) { if (parse_symbol(type, next_token, accept)) { - return (*accept); + return *accept; } } assert(*accept == false); From 4c6b9c6bc07cb37b21911801004424d22a58a46a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 05:37:47 -0400 Subject: [PATCH 025/323] Fix compiler error where macos considers a struct default constructor to be explicit. 
--- src/log_surgeon/Lexer.tpp | 4 ++-- src/log_surgeon/finite_automata/RegexNFA.hpp | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index ed297628..165e8a1a 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -435,10 +435,10 @@ auto Lexer::epsilon_closure(NFAStateType const* stat // TODO: currently treat tagged transitions as epsilon transitions for (auto const& positive_tagged_transition : t->get_positive_tagged_transitions()) { - stack.push(positive_tagged_transition.state); + stack.push(positive_tagged_transition.m_dest_state); } for (auto const& negative_tagged_transition : t->get_negative_tagged_transitions()) { - stack.push(negative_tagged_transition.state); + stack.push(negative_tagged_transition.m_dest_state); } } } diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 0133b8ef..42d64a1d 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -27,14 +27,25 @@ class RegexNFAState; template struct PositiveTaggedTransition { - uint32_t tag{}; - RegexNFAState const* state{}; + PositiveTaggedTransition(uint32_t const tag, RegexNFAState const* dest_state) + : m_tag(tag), + m_dest_state(dest_state) {} + + uint32_t m_tag{}; + RegexNFAState const* m_dest_state{}; }; template struct NegativeTaggedTransition { - std::set tags; - RegexNFAState const* state{}; + NegativeTaggedTransition( + std::set const& tags, + RegexNFAState const* dest_state + ) + : m_tags(tags), + m_dest_state(dest_state) {} + + std::set m_tags; + RegexNFAState const* m_dest_state{}; }; template From 2e71aaaac9c656b991120e116240d1d507ae7e10 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 05:42:33 -0400 Subject: [PATCH 026/323] Add state_type explicitly. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 42d64a1d..75310714 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -79,7 +79,7 @@ class RegexNFAState { auto add_negative_tagged_transition( std::set const& tags, - RegexNFAState const* dest_state + RegexNFAState const* dest_state ) -> void { m_negative_tagged_transitions.push_back( NegativeTaggedTransition(tags, dest_state) From c062a2c9cd21735fb038ccd979059f30f6ddabde Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 05:43:30 -0400 Subject: [PATCH 027/323] Add state_type explicitly. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 75310714..1e12d595 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -65,8 +65,10 @@ class RegexNFAState { return m_matching_variable_id; } - auto - add_positive_tagged_transition(uint32_t const tag, RegexNFAState const* dest_state) -> void { + auto add_positive_tagged_transition( + uint32_t const tag, + RegexNFAState const* dest_state + ) -> void { m_positive_tagged_transitions.push_back( PositiveTaggedTransition(tag, dest_state) ); From eaa56741e749d81a5bf6957744f65e0ea4f39184 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 05:46:01 -0400 Subject: [PATCH 028/323] Remove commented out code. 
--- src/log_surgeon/Lexer.hpp | 6 ------ src/log_surgeon/Lexer.tpp | 21 --------------------- 2 files changed, 27 deletions(-) diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 10ca0891..db08f52f 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -88,12 +88,6 @@ class Lexer { */ auto generate() -> void; - /** - * Generate DFA for a reverse lexer matching the reverse of the words in the - * original language - */ - // auto generate_reverse() -> void; - /** * Reset the lexer to start a new lexing (reset buffers, reset vars tracking * positions) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 165e8a1a..d51e44ac 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -390,27 +390,6 @@ void Lexer::generate() { } } -/* -//TODO: needs to handle reversing tagged NFA -template -void Lexer::generate_reverse() { - finite_automata::RegexNFA nfa; - for (auto const& rule : m_rules) { - rule.add_to_nfa(&nfa); - } - nfa.reverse(); - m_dfa = nfa_to_dfa(nfa); - DFAStateType const* state = m_dfa->get_root(); - for (uint32_t i = 0; i < cSizeOfByte; i++) { - if (state->next(i) != nullptr) { - m_is_first_char[i] = true; - } else { - m_is_first_char[i] = false; - } - } -} -*/ - template void LexicalRule::add_to_nfa(finite_automata::RegexNFA* nfa) const { auto* end_state = nfa->new_state(); end_state->set_accepting(true); end_state->set_matching_variable_id(m_variable_id); From f15047409dabedb4bc4e2419c0e012aa3ade6569 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 05:49:32 -0400 Subject: [PATCH 029/323] Remove errant +=. 
--- tests/test-NFA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index f651b11f..ea4f60ec 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -86,7 +86,7 @@ TEST_CASE("Test NFA", "[NFA]") { while (false == state_queue.empty()) { auto const* current_state = state_queue.front(); state_queue.pop(); - serialized_nfa += std::to_string(state_ids.find(current_state)->second) += ":"; + serialized_nfa += std::to_string(state_ids.find(current_state)->second) + ":"; if (current_state->is_accepting()) { serialized_nfa += "accepting_tag=" + std::to_string(current_state->get_matching_variable_id()) + ","; From bdafe104818fe8f4c47bd1a3429b28d82c8526e6 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 05:52:05 -0400 Subject: [PATCH 030/323] Replace constructors with aggregate initialization. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 1e12d595..c441cba9 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -27,23 +27,12 @@ class RegexNFAState; template struct PositiveTaggedTransition { - PositiveTaggedTransition(uint32_t const tag, RegexNFAState const* dest_state) - : m_tag(tag), - m_dest_state(dest_state) {} - uint32_t m_tag{}; RegexNFAState const* m_dest_state{}; }; template struct NegativeTaggedTransition { - NegativeTaggedTransition( - std::set const& tags, - RegexNFAState const* dest_state - ) - : m_tags(tags), - m_dest_state(dest_state) {} - std::set m_tags; RegexNFAState const* m_dest_state{}; }; @@ -70,7 +59,7 @@ class RegexNFAState { RegexNFAState const* dest_state ) -> void { m_positive_tagged_transitions.push_back( - PositiveTaggedTransition(tag, dest_state) + PositiveTaggedTransition{tag, dest_state} ); } @@ -84,7 +73,7 @@ class RegexNFAState 
{ RegexNFAState const* dest_state ) -> void { m_negative_tagged_transitions.push_back( - NegativeTaggedTransition(tags, dest_state) + NegativeTaggedTransition{tags, dest_state} ); } From 335bb343098ba7583a09ec32d6cd35717f22eded Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 05:57:19 -0400 Subject: [PATCH 031/323] Replace static inline with static constexpr. --- src/log_surgeon/Lexer.hpp | 5 ++--- tests/test-lexer.cpp | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index db08f52f..fcaf1881 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -49,9 +49,8 @@ class LexicalRule { template class Lexer { public: - // std::vector can be declared as constexpr in c++20 - static inline std::vector const cTokenEndTypes = {(uint32_t)SymbolID::TokenEndID}; - static inline std::vector const cTokenUncaughtStringTypes + static constexpr std::vector cTokenEndTypes = {(uint32_t)SymbolID::TokenEndID}; + static constexpr std::vector cTokenUncaughtStringTypes = {(uint32_t)SymbolID::TokenUncaughtStringID}; /** diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 10484dfc..abfa460e 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -77,6 +77,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { string const var_name = "myNumber"; string const var_schema = var_name + string(":") + string("123"); schema.add_variable(string_view(var_schema), -1); + auto const schema_ast = schema.release_schema_ast_ptr(); REQUIRE(schema_ast->m_schema_vars.size() == 1); REQUIRE(schema.release_schema_ast_ptr()->m_schema_vars.empty()); From 84463908cf547eb6a8818f01b70c09865c098ab2 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 05:59:17 -0400 Subject: [PATCH 032/323] Undo last commit. 
--- src/log_surgeon/Lexer.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index fcaf1881..db08f52f 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -49,8 +49,9 @@ class LexicalRule { template class Lexer { public: - static constexpr std::vector cTokenEndTypes = {(uint32_t)SymbolID::TokenEndID}; - static constexpr std::vector cTokenUncaughtStringTypes + // std::vector can be declared as constexpr in c++20 + static inline std::vector const cTokenEndTypes = {(uint32_t)SymbolID::TokenEndID}; + static inline std::vector const cTokenUncaughtStringTypes = {(uint32_t)SymbolID::TokenUncaughtStringID}; /** From 73d8e4662953d82215ecd4c4fcaa9db077a8463f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 06:01:32 -0400 Subject: [PATCH 033/323] Fix comment. --- tests/test-NFA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index ea4f60ec..1bfa43fa 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -53,7 +53,7 @@ TEST_CASE("Test NFA", "[NFA]") { } }; - // Assigne state ids + // Assign state IDs std::map state_ids; auto const* root = nfa.get_root(); state_queue.push(root); From 7871f80d741bbc750f57d886ed0449c28da69bfa Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 06:12:35 -0400 Subject: [PATCH 034/323] Finish changes of int to uint32_t for SymbolID. 
--- src/log_surgeon/Constants.hpp | 2 +- src/log_surgeon/LALR1Parser.tpp | 20 +++++++++---------- src/log_surgeon/LogEvent.cpp | 4 ++-- src/log_surgeon/LogParser.cpp | 8 ++++---- src/log_surgeon/Parser.tpp | 34 ++++++++++++++++----------------- 5 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/log_surgeon/Constants.hpp b/src/log_surgeon/Constants.hpp index 85ae1670..3e6818aa 100644 --- a/src/log_surgeon/Constants.hpp +++ b/src/log_surgeon/Constants.hpp @@ -23,7 +23,7 @@ enum class ErrorCode { Truncated, }; -enum class SymbolID { +enum class SymbolID : uint32_t { TokenEndID, TokenUncaughtStringID, TokenIntId, diff --git a/src/log_surgeon/LALR1Parser.tpp b/src/log_surgeon/LALR1Parser.tpp index e6b25c72..263d56e0 100644 --- a/src/log_surgeon/LALR1Parser.tpp +++ b/src/log_surgeon/LALR1Parser.tpp @@ -55,14 +55,14 @@ namespace { template LALR1Parser::LALR1Parser() { - m_terminals.insert((int)SymbolID::TokenEndID); - m_terminals.insert((int)SymbolID::TokenUncaughtStringID); - m_terminals.insert((int)SymbolID::TokenIntId); - m_terminals.insert((int)SymbolID::TokenFloatId); - m_terminals.insert((int)SymbolID::TokenHexId); - m_terminals.insert((int)SymbolID::TokenFirstTimestampId); - m_terminals.insert((int)SymbolID::TokenNewlineTimestampId); - m_terminals.insert((int)SymbolID::TokenNewlineId); + m_terminals.insert((uint32_t)SymbolID::TokenEndID); + m_terminals.insert((uint32_t)SymbolID::TokenUncaughtStringID); + m_terminals.insert((uint32_t)SymbolID::TokenIntId); + m_terminals.insert((uint32_t)SymbolID::TokenFloatId); + m_terminals.insert((uint32_t)SymbolID::TokenHexId); + m_terminals.insert((uint32_t)SymbolID::TokenFirstTimestampId); + m_terminals.insert((uint32_t)SymbolID::TokenNewlineTimestampId); + m_terminals.insert((uint32_t)SymbolID::TokenNewlineId); } template @@ -327,7 +327,7 @@ void LALR1Parser::generate_lr1_item_sets() { m_spontaneous_map[l0_item.m_production].end() ); if (l0_item.m_production == m_productions[m_root_production_id].get()) { - 
lookaheads[l0_item].insert((int)SymbolID::TokenEndID); + lookaheads[l0_item].insert((uint32_t)SymbolID::TokenEndID); } } } @@ -480,7 +480,7 @@ void LALR1Parser::generate_lalr1_action() { if (item.has_dot_at_end()) { if (item.m_production == m_productions[m_root_production_id].get()) { Action action = true; - item_set_ptr->m_actions[(int)SymbolID::TokenEndID] = action; + item_set_ptr->m_actions[(uint32_t)SymbolID::TokenEndID] = action; } else { Action& action = item_set_ptr->m_actions[item.m_lookahead]; if (!std::holds_alternative(action)) { diff --git a/src/log_surgeon/LogEvent.cpp b/src/log_surgeon/LogEvent.cpp index abe11608..02d66a71 100644 --- a/src/log_surgeon/LogEvent.cpp +++ b/src/log_surgeon/LogEvent.cpp @@ -52,10 +52,10 @@ auto LogEventView::get_logtype() const -> std::string { std::string logtype; for (uint32_t i = 1; i < m_log_output_buffer->pos(); i++) { Token& token = m_log_output_buffer->get_mutable_token(i); - if (token.m_type_ids_ptr->at(0) == (uint32_t)log_surgeon::SymbolID::TokenUncaughtStringID) { + if (token.m_type_ids_ptr->at(0) == (uint32_t)SymbolID::TokenUncaughtStringID) { logtype += token.to_string_view(); } else { - if ((uint32_t)log_surgeon::SymbolID::TokenNewlineId != token.m_type_ids_ptr->at(0)) { + if ((uint32_t)SymbolID::TokenNewlineId != token.m_type_ids_ptr->at(0)) { logtype += token.get_delimiter(); } logtype += "<"; diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index af258361..e1781ef4 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -235,14 +235,14 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { int token_type = next_token.m_type_ids_ptr->at(0); bool found_start_of_next_message = (output_buffer->has_timestamp() - && token_type == (int)SymbolID::TokenNewlineTimestampId) + && token_type == (uint32_t)SymbolID::TokenNewlineTimestampId) || (!output_buffer->has_timestamp() && next_token.get_char(0) == '\n' - && token_type != 
(int)SymbolID::TokenNewlineId); - if (token_type == (int)SymbolID::TokenEndID) { + && token_type != (uint32_t)SymbolID::TokenNewlineId); + if (token_type == (uint32_t)SymbolID::TokenEndID) { parsing_action = ParsingAction::CompressAndFinish; return ErrorCode::Success; } - if (false == output_buffer->has_timestamp() && token_type == (int)SymbolID::TokenNewlineId) + if (false == output_buffer->has_timestamp() && token_type == (uint32_t)SymbolID::TokenNewlineId) { m_input_buffer.set_consumed_pos(output_buffer->get_curr_token().m_end_pos); output_buffer->advance_to_next_token(); diff --git a/src/log_surgeon/Parser.tpp b/src/log_surgeon/Parser.tpp index 69c8a5ea..b923320e 100644 --- a/src/log_surgeon/Parser.tpp +++ b/src/log_surgeon/Parser.tpp @@ -10,23 +10,23 @@ namespace log_surgeon { template Parser::Parser() { // TODO move clp-reserved symbols out of the parser - m_lexer.m_symbol_id[cTokenEnd] = (int)SymbolID::TokenEndID; - m_lexer.m_symbol_id[cTokenUncaughtString] = (int)SymbolID::TokenUncaughtStringID; - m_lexer.m_symbol_id[cTokenInt] = (int)SymbolID::TokenIntId; - m_lexer.m_symbol_id[cTokenFloat] = (int)SymbolID::TokenFloatId; - m_lexer.m_symbol_id[cTokenHex] = (int)SymbolID::TokenHexId; - m_lexer.m_symbol_id[cTokenFirstTimestamp] = (int)SymbolID::TokenFirstTimestampId; - m_lexer.m_symbol_id[cTokenNewlineTimestamp] = (int)SymbolID::TokenNewlineTimestampId; - m_lexer.m_symbol_id[cTokenNewline] = (int)SymbolID::TokenNewlineId; - - m_lexer.m_id_symbol[(int)SymbolID::TokenEndID] = cTokenEnd; - m_lexer.m_id_symbol[(int)SymbolID::TokenUncaughtStringID] = cTokenUncaughtString; - m_lexer.m_id_symbol[(int)SymbolID::TokenIntId] = cTokenInt; - m_lexer.m_id_symbol[(int)SymbolID::TokenFloatId] = cTokenFloat; - m_lexer.m_id_symbol[(int)SymbolID::TokenHexId] = cTokenHex; - m_lexer.m_id_symbol[(int)SymbolID::TokenFirstTimestampId] = cTokenFirstTimestamp; - m_lexer.m_id_symbol[(int)SymbolID::TokenNewlineTimestampId] = cTokenNewlineTimestamp; - 
m_lexer.m_id_symbol[(int)SymbolID::TokenNewlineId] = cTokenNewline; + m_lexer.m_symbol_id[cTokenEnd] = (uint32_t)SymbolID::TokenEndID; + m_lexer.m_symbol_id[cTokenUncaughtString] = (uint32_t)SymbolID::TokenUncaughtStringID; + m_lexer.m_symbol_id[cTokenInt] = (uint32_t)SymbolID::TokenIntId; + m_lexer.m_symbol_id[cTokenFloat] = (uint32_t)SymbolID::TokenFloatId; + m_lexer.m_symbol_id[cTokenHex] = (uint32_t)SymbolID::TokenHexId; + m_lexer.m_symbol_id[cTokenFirstTimestamp] = (uint32_t)SymbolID::TokenFirstTimestampId; + m_lexer.m_symbol_id[cTokenNewlineTimestamp] = (uint32_t)SymbolID::TokenNewlineTimestampId; + m_lexer.m_symbol_id[cTokenNewline] = (uint32_t)SymbolID::TokenNewlineId; + + m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenEndID] = cTokenEnd; + m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenUncaughtStringID] = cTokenUncaughtString; + m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenIntId] = cTokenInt; + m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenFloatId] = cTokenFloat; + m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenHexId] = cTokenHex; + m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenFirstTimestampId] = cTokenFirstTimestamp; + m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenNewlineTimestampId] = cTokenNewlineTimestamp; + m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenNewlineId] = cTokenNewline; } template From 56483c908bc837bd5230c6fee0035e112eeefb9c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 06:19:41 -0400 Subject: [PATCH 035/323] Added comment explaining use of uint32_t for SymbolID. --- src/log_surgeon/Constants.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/Constants.hpp b/src/log_surgeon/Constants.hpp index 3e6818aa..b63b34c9 100644 --- a/src/log_surgeon/Constants.hpp +++ b/src/log_surgeon/Constants.hpp @@ -23,6 +23,7 @@ enum class ErrorCode { Truncated, }; +// Using uint32_t as the underlying type to ensure consistency with token identifier types. 
enum class SymbolID : uint32_t { TokenEndID, TokenUncaughtStringID, From cafa9730ee55848dcf540acd46fd9b4e01f7d960 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 08:33:10 -0400 Subject: [PATCH 036/323] Finish removing ints that should be uint32_t. --- src/log_surgeon/LogParser.cpp | 4 ++-- src/log_surgeon/SchemaParser.cpp | 2 +- src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp | 2 +- src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp | 4 ++-- src/log_surgeon/utils.hpp | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index e1781ef4..6035f8a3 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -122,7 +122,7 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { for (uint32_t i = 0; i <= rule->m_line_num; i++) { schema_reader.try_read_to_delimiter('\n', false, false, line); } - int colon_pos = 0; + uint32_t colon_pos = 0; for (char i : line) { colon_pos++; if (i == ':') { @@ -232,7 +232,7 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { return err; } output_buffer->set_curr_token(next_token); - int token_type = next_token.m_type_ids_ptr->at(0); + auto token_type = next_token.m_type_ids_ptr->at(0); bool found_start_of_next_message = (output_buffer->has_timestamp() && token_type == (uint32_t)SymbolID::TokenNewlineTimestampId) diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index 3b1c9c45..6fd33029 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -64,7 +64,7 @@ auto SchemaParser::try_schema_file(string const& schema_file_path) -> unique_ptr strfmt("Failed to read '%s', errno=%d", schema_file_path.c_str(), errno) ); } - int code{static_cast>(error_code)}; + auto code{static_cast>(error_code)}; throw std::runtime_error( strfmt("Failed to read '%s', error_code=%d", schema_file_path.c_str(), code) ); diff --git 
a/src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp b/src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp index 9c0ccdd2..3aee485d 100644 --- a/src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp +++ b/src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp @@ -163,7 +163,7 @@ class UnicodeIntervalTree { T m_value; uint32_t m_lower{}; uint32_t m_upper{}; - int m_height{}; + uint32_t m_height{}; std::unique_ptr m_left; std::unique_ptr m_right; }; diff --git a/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp b/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp index 10606fc4..7e1b0763 100644 --- a/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp +++ b/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp @@ -170,11 +170,11 @@ auto UnicodeIntervalTree::Node::balance_factor() -> int { template auto UnicodeIntervalTree::Node::balance(std::unique_ptr node ) -> std::unique_ptr::Node> { - int factor = node->balance_factor(); + auto factor = node->balance_factor(); if (factor * factor <= 1) { return node; } - int sub_factor + auto sub_factor = (factor < 0) ? node->m_left->balance_factor() : node->m_right->balance_factor(); if (factor * sub_factor > 0) { return Node::rotate(std::move(node), factor); diff --git a/src/log_surgeon/utils.hpp b/src/log_surgeon/utils.hpp index bb3140f0..976eb207 100644 --- a/src/log_surgeon/utils.hpp +++ b/src/log_surgeon/utils.hpp @@ -8,7 +8,7 @@ template auto strfmt(std::string const& fmt, Args... args) -> std::string { - int size = std::snprintf(nullptr, 0, fmt.c_str(), args...); + auto size = std::snprintf(nullptr, 0, fmt.c_str(), args...); if (size <= 0) { throw std::runtime_error("Error during formatting."); } From a2b1bfd58ed67e858d35c20e7ae5dac48e3807f3 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 8 Oct 2024 08:35:26 -0400 Subject: [PATCH 037/323] Fix formatting. 
--- src/log_surgeon/LogParser.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index 6035f8a3..6f779194 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -242,7 +242,8 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { parsing_action = ParsingAction::CompressAndFinish; return ErrorCode::Success; } - if (false == output_buffer->has_timestamp() && token_type == (uint32_t)SymbolID::TokenNewlineId) + if (false == output_buffer->has_timestamp() + && token_type == (uint32_t)SymbolID::TokenNewlineId) { m_input_buffer.set_consumed_pos(output_buffer->get_curr_token().m_end_pos); output_buffer->advance_to_next_token(); From 2fb4831712cd581b75e0fa0e961eb6e831a1912e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 10 Oct 2024 13:49:01 -0400 Subject: [PATCH 038/323] Rename SymbolID to SymbolId; Remove redundant ID for SymbolIds enum values. --- src/log_surgeon/Constants.hpp | 18 ++++++++--------- src/log_surgeon/LALR1Parser.tpp | 26 ++++++++++++------------- src/log_surgeon/Lexer.hpp | 4 ++-- src/log_surgeon/LogEvent.cpp | 4 ++-- src/log_surgeon/LogParser.cpp | 16 ++++++++-------- src/log_surgeon/Parser.tpp | 34 ++++++++++++++++----------------- 6 files changed, 51 insertions(+), 51 deletions(-) diff --git a/src/log_surgeon/Constants.hpp b/src/log_surgeon/Constants.hpp index b63b34c9..e7ee8c00 100644 --- a/src/log_surgeon/Constants.hpp +++ b/src/log_surgeon/Constants.hpp @@ -24,15 +24,15 @@ enum class ErrorCode { }; // Using uint32_t as the underlying type to ensure consistency with token identifier types. 
-enum class SymbolID : uint32_t { - TokenEndID, - TokenUncaughtStringID, - TokenIntId, - TokenFloatId, - TokenHexId, - TokenFirstTimestampId, - TokenNewlineTimestampId, - TokenNewlineId +enum class SymbolId : uint32_t { + TokenEnd, + TokenUncaughtString, + TokenInt, + TokenFloat, + TokenHex, + TokenFirstTimestamp, + TokenNewlineTimestamp, + TokenNewline }; constexpr char cTokenEnd[] = "$end"; diff --git a/src/log_surgeon/LALR1Parser.tpp b/src/log_surgeon/LALR1Parser.tpp index 263d56e0..2edfa29a 100644 --- a/src/log_surgeon/LALR1Parser.tpp +++ b/src/log_surgeon/LALR1Parser.tpp @@ -55,14 +55,14 @@ namespace { template LALR1Parser::LALR1Parser() { - m_terminals.insert((uint32_t)SymbolID::TokenEndID); - m_terminals.insert((uint32_t)SymbolID::TokenUncaughtStringID); - m_terminals.insert((uint32_t)SymbolID::TokenIntId); - m_terminals.insert((uint32_t)SymbolID::TokenFloatId); - m_terminals.insert((uint32_t)SymbolID::TokenHexId); - m_terminals.insert((uint32_t)SymbolID::TokenFirstTimestampId); - m_terminals.insert((uint32_t)SymbolID::TokenNewlineTimestampId); - m_terminals.insert((uint32_t)SymbolID::TokenNewlineId); + m_terminals.insert((uint32_t)SymbolId::TokenEnd); + m_terminals.insert((uint32_t)SymbolId::TokenUncaughtString); + m_terminals.insert((uint32_t)SymbolId::TokenInt); + m_terminals.insert((uint32_t)SymbolId::TokenFloat); + m_terminals.insert((uint32_t)SymbolId::TokenHex); + m_terminals.insert((uint32_t)SymbolId::TokenFirstTimestamp); + m_terminals.insert((uint32_t)SymbolId::TokenNewlineTimestamp); + m_terminals.insert((uint32_t)SymbolId::TokenNewline); } template @@ -327,7 +327,7 @@ void LALR1Parser::generate_lr1_item_sets() { m_spontaneous_map[l0_item.m_production].end() ); if (l0_item.m_production == m_productions[m_root_production_id].get()) { - lookaheads[l0_item].insert((uint32_t)SymbolID::TokenEndID); + lookaheads[l0_item].insert((uint32_t)SymbolId::TokenEnd); } } } @@ -480,7 +480,7 @@ void LALR1Parser::generate_lalr1_action() { if (item.has_dot_at_end()) 
{ if (item.m_production == m_productions[m_root_production_id].get()) { Action action = true; - item_set_ptr->m_actions[(uint32_t)SymbolID::TokenEndID] = action; + item_set_ptr->m_actions[(uint32_t)SymbolId::TokenEnd] = action; } else { Action& action = item_set_ptr->m_actions[item.m_lookahead]; if (!std::holds_alternative(action)) { @@ -561,7 +561,7 @@ template auto LALR1Parser::get_input_until_next_newline(Token* error_token ) -> std::string { std::string rest_of_line; - bool next_is_end_token = (error_token->m_type_ids_ptr->at(0) == (uint32_t)SymbolID::TokenEndID); + bool next_is_end_token = (error_token->m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd); bool next_has_newline = (error_token->to_string().find('\n') != std::string::npos) || (error_token->to_string().find('\r') != std::string::npos); while (!next_has_newline && !next_is_end_token) { @@ -570,7 +570,7 @@ auto LALR1Parser::get_input_until_next_newline(Token || (token.to_string().find('\r') != std::string::npos); if (!next_has_newline) { rest_of_line += token.to_string(); - next_is_end_token = (token.m_type_ids_ptr->at(0) == (uint32_t)SymbolID::TokenEndID); + next_is_end_token = (token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd); } } rest_of_line += "\n"; @@ -594,7 +594,7 @@ auto LALR1Parser::report_error() -> std::string { error_indicator += " "; } error_indicator += "^\n"; - if (token.m_type_ids_ptr->at(0) == (uint32_t)SymbolID::TokenEndID && consumed_input.empty()) { + if (token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd && consumed_input.empty()) { error_type = "empty file"; error_indicator = "^\n"; } else { diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index db08f52f..3d46a08c 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -50,9 +50,9 @@ template class Lexer { public: // std::vector can be declared as constexpr in c++20 - static inline std::vector const cTokenEndTypes = {(uint32_t)SymbolID::TokenEndID}; + static inline 
std::vector const cTokenEndTypes = {(uint32_t)SymbolId::TokenEnd}; static inline std::vector const cTokenUncaughtStringTypes - = {(uint32_t)SymbolID::TokenUncaughtStringID}; + = {(uint32_t)SymbolId::TokenUncaughtString}; /** * Generate a DFA from an NFA diff --git a/src/log_surgeon/LogEvent.cpp b/src/log_surgeon/LogEvent.cpp index 02d66a71..bfb06987 100644 --- a/src/log_surgeon/LogEvent.cpp +++ b/src/log_surgeon/LogEvent.cpp @@ -52,10 +52,10 @@ auto LogEventView::get_logtype() const -> std::string { std::string logtype; for (uint32_t i = 1; i < m_log_output_buffer->pos(); i++) { Token& token = m_log_output_buffer->get_mutable_token(i); - if (token.m_type_ids_ptr->at(0) == (uint32_t)SymbolID::TokenUncaughtStringID) { + if (token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenUncaughtString) { logtype += token.to_string_view(); } else { - if ((uint32_t)SymbolID::TokenNewlineId != token.m_type_ids_ptr->at(0)) { + if ((uint32_t)SymbolId::TokenNewline != token.m_type_ids_ptr->at(0)) { logtype += token.get_delimiter(); } logtype += "<"; diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index 6f779194..b98e7121 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -181,7 +181,7 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { return err; } if (false == output_buffer->has_timestamp() - && next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolID::TokenNewlineTimestampId) + && next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenNewlineTimestamp) { // TODO: combine the below with found_start_of_next_message // into 1 function @@ -206,14 +206,14 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { return ErrorCode::Success; } } - if (next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolID::TokenEndID) { + if (next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd) { output_buffer->set_token(0, next_token); output_buffer->set_pos(1); 
parsing_action = ParsingAction::CompressAndFinish; return ErrorCode::Success; } - if (next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolID::TokenFirstTimestampId - || next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolID::TokenNewlineTimestampId) + if (next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenFirstTimestamp + || next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenNewlineTimestamp) { output_buffer->set_has_timestamp(true); output_buffer->set_token(0, next_token); @@ -235,15 +235,15 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { auto token_type = next_token.m_type_ids_ptr->at(0); bool found_start_of_next_message = (output_buffer->has_timestamp() - && token_type == (uint32_t)SymbolID::TokenNewlineTimestampId) + && token_type == (uint32_t)SymbolId::TokenNewlineTimestamp) || (!output_buffer->has_timestamp() && next_token.get_char(0) == '\n' - && token_type != (uint32_t)SymbolID::TokenNewlineId); - if (token_type == (uint32_t)SymbolID::TokenEndID) { + && token_type != (uint32_t)SymbolId::TokenNewline); + if (token_type == (uint32_t)SymbolId::TokenEnd) { parsing_action = ParsingAction::CompressAndFinish; return ErrorCode::Success; } if (false == output_buffer->has_timestamp() - && token_type == (uint32_t)SymbolID::TokenNewlineId) + && token_type == (uint32_t)SymbolId::TokenNewline) { m_input_buffer.set_consumed_pos(output_buffer->get_curr_token().m_end_pos); output_buffer->advance_to_next_token(); diff --git a/src/log_surgeon/Parser.tpp b/src/log_surgeon/Parser.tpp index b923320e..be307af3 100644 --- a/src/log_surgeon/Parser.tpp +++ b/src/log_surgeon/Parser.tpp @@ -10,23 +10,23 @@ namespace log_surgeon { template Parser::Parser() { // TODO move clp-reserved symbols out of the parser - m_lexer.m_symbol_id[cTokenEnd] = (uint32_t)SymbolID::TokenEndID; - m_lexer.m_symbol_id[cTokenUncaughtString] = (uint32_t)SymbolID::TokenUncaughtStringID; - m_lexer.m_symbol_id[cTokenInt] = (uint32_t)SymbolID::TokenIntId; - 
m_lexer.m_symbol_id[cTokenFloat] = (uint32_t)SymbolID::TokenFloatId; - m_lexer.m_symbol_id[cTokenHex] = (uint32_t)SymbolID::TokenHexId; - m_lexer.m_symbol_id[cTokenFirstTimestamp] = (uint32_t)SymbolID::TokenFirstTimestampId; - m_lexer.m_symbol_id[cTokenNewlineTimestamp] = (uint32_t)SymbolID::TokenNewlineTimestampId; - m_lexer.m_symbol_id[cTokenNewline] = (uint32_t)SymbolID::TokenNewlineId; - - m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenEndID] = cTokenEnd; - m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenUncaughtStringID] = cTokenUncaughtString; - m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenIntId] = cTokenInt; - m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenFloatId] = cTokenFloat; - m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenHexId] = cTokenHex; - m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenFirstTimestampId] = cTokenFirstTimestamp; - m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenNewlineTimestampId] = cTokenNewlineTimestamp; - m_lexer.m_id_symbol[(uint32_t)SymbolID::TokenNewlineId] = cTokenNewline; + m_lexer.m_symbol_id[cTokenEnd] = (uint32_t)SymbolId::TokenEnd; + m_lexer.m_symbol_id[cTokenUncaughtString] = (uint32_t)SymbolId::TokenUncaughtString; + m_lexer.m_symbol_id[cTokenInt] = (uint32_t)SymbolId::TokenInt; + m_lexer.m_symbol_id[cTokenFloat] = (uint32_t)SymbolId::TokenFloat; + m_lexer.m_symbol_id[cTokenHex] = (uint32_t)SymbolId::TokenHex; + m_lexer.m_symbol_id[cTokenFirstTimestamp] = (uint32_t)SymbolId::TokenFirstTimestamp; + m_lexer.m_symbol_id[cTokenNewlineTimestamp] = (uint32_t)SymbolId::TokenNewlineTimestamp; + m_lexer.m_symbol_id[cTokenNewline] = (uint32_t)SymbolId::TokenNewline; + + m_lexer.m_id_symbol[(uint32_t)SymbolId::TokenEnd] = cTokenEnd; + m_lexer.m_id_symbol[(uint32_t)SymbolId::TokenUncaughtString] = cTokenUncaughtString; + m_lexer.m_id_symbol[(uint32_t)SymbolId::TokenInt] = cTokenInt; + m_lexer.m_id_symbol[(uint32_t)SymbolId::TokenFloat] = cTokenFloat; + m_lexer.m_id_symbol[(uint32_t)SymbolId::TokenHex] = cTokenHex; + 
m_lexer.m_id_symbol[(uint32_t)SymbolId::TokenFirstTimestamp] = cTokenFirstTimestamp; + m_lexer.m_id_symbol[(uint32_t)SymbolId::TokenNewlineTimestamp] = cTokenNewlineTimestamp; + m_lexer.m_id_symbol[(uint32_t)SymbolId::TokenNewline] = cTokenNewline; } template From 79482b11adff38bfb7545d3141ff0652bfc9824e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 10 Oct 2024 13:52:59 -0400 Subject: [PATCH 039/323] Use docstring instead of inline comment. --- src/log_surgeon/Constants.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/Constants.hpp b/src/log_surgeon/Constants.hpp index e7ee8c00..17f03723 100644 --- a/src/log_surgeon/Constants.hpp +++ b/src/log_surgeon/Constants.hpp @@ -23,7 +23,10 @@ enum class ErrorCode { Truncated, }; -// Using uint32_t as the underlying type to ensure consistency with token identifier types. +/** + * Enum for token IDs. + * NOTE: we use `uint32_t` as the underlying type to ensure consistency with token identifier types. + */ enum class SymbolId : uint32_t { TokenEnd, TokenUncaughtString, From 02378542ff4502755468ffc2f7fc1de2d7e36df1 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 10 Oct 2024 13:57:27 -0400 Subject: [PATCH 040/323] Use `auto`. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/LogEvent.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/LogEvent.cpp b/src/log_surgeon/LogEvent.cpp index bfb06987..709e62ce 100644 --- a/src/log_surgeon/LogEvent.cpp +++ b/src/log_surgeon/LogEvent.cpp @@ -104,7 +104,7 @@ LogEvent::LogEvent(LogEventView const& src) : LogEventView{src.get_log_parser()} } for (uint32_t i = 0; i < get_log_output_buffer()->pos(); i++) { Token& token = get_log_output_buffer()->get_mutable_token(i); - std::vector const& token_types = *token.m_type_ids_ptr; + auto const& token_types = *token.m_type_ids_ptr; add_token(token_types[0], &token); } } From c935af50465f65ad6fc4339eb99ae90a907304eb Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 10 Oct 2024 14:00:14 -0400 Subject: [PATCH 041/323] Use `const` for error code. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/SchemaParser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index 6fd33029..d74167fa 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -64,7 +64,7 @@ auto SchemaParser::try_schema_file(string const& schema_file_path) -> unique_ptr strfmt("Failed to read '%s', errno=%d", schema_file_path.c_str(), errno) ); } - auto code{static_cast>(error_code)}; + auto const code{static_cast>(error_code)}; throw std::runtime_error( strfmt("Failed to read '%s', error_code=%d", schema_file_path.c_str(), code) ); From 91b5e78b127ebfe00e1175231b7ba2ef202759cb Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 10 Oct 2024 14:01:26 -0400 Subject: [PATCH 042/323] Use `auto` and `const` for `add_to_nfa_with_negative_tags`. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexAST.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 194eae54..0733aa3e 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -100,7 +100,8 @@ class RegexAST { * @param nfa * @param end_state */ - void add_to_nfa_with_negative_tags(RegexNFA* nfa, NFAStateType* end_state) { + auto add_to_nfa_with_negative_tags(RegexNFA* nfa, NFAStateType* end_state) const + -> void { // Handle negative tags as: // root --(regex transitions)--> intermediate_state --(negative tags)--> end_state if (false == m_negative_tags.empty()) { From f6c86ec330e2e8854b2a670f167daf63eaa3c7d3 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 10 Oct 2024 14:12:05 -0400 Subject: [PATCH 043/323] Use 'auto' for `intermediate_state`. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 0733aa3e..17c76249 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -105,7 +105,7 @@ class RegexAST { // Handle negative tags as: // root --(regex transitions)--> intermediate_state --(negative tags)--> end_state if (false == m_negative_tags.empty()) { - NFAStateType* intermediate_state = nfa->new_state(); + auto* intermediate_state = nfa->new_state(); add_to_nfa(nfa, intermediate_state); intermediate_state->add_negative_tagged_transition(m_negative_tags, end_state); } else { From 8fd70d7ada5015781c5ba4ef5f0d4864dd788966 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 10 Oct 2024 14:13:06 -0400 Subject: [PATCH 044/323] Replace `find` with `at`. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- tests/test-NFA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 1bfa43fa..23002a8c 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -86,7 +86,7 @@ TEST_CASE("Test NFA", "[NFA]") { while (false == state_queue.empty()) { auto const* current_state = state_queue.front(); state_queue.pop(); - serialized_nfa += std::to_string(state_ids.find(current_state)->second) + ":"; + serialized_nfa += std::to_string(state_ids.at(current_state)) + ":"; if (current_state->is_accepting()) { serialized_nfa += "accepting_tag=" + std::to_string(current_state->get_matching_variable_id()) + ","; From dd03a3522386afc3e9a7e224766c92e39c70d060 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 10 Oct 2024 14:26:58 -0400 Subject: [PATCH 045/323] Use `auto` for `intermediate_state`. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 17c76249..57dec64b 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -891,7 +891,7 @@ template template void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const { - NFAStateType* intermediate_state = nfa->new_state(); + auto* intermediate_state = nfa->new_state(); m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, intermediate_state); intermediate_state->add_positive_tagged_transition(m_tag, end_state); } From fd6bb020e0b2087ab9362f50bd580d5573be5744 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 10 Oct 2024 14:31:56 -0400 Subject: [PATCH 046/323] Added constructors for tagged transition classes. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index c441cba9..5f255b01 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -26,13 +26,24 @@ template class RegexNFAState; template -struct PositiveTaggedTransition { +class PositiveTaggedTransition { + PositiveTaggedTransition(uint32_t const tag, RegexNFAState const* dest_state) + : m_tag(tag), + m_dest_state(dest_state) {} + uint32_t m_tag{}; RegexNFAState const* m_dest_state{}; }; template -struct NegativeTaggedTransition { +class NegativeTaggedTransition { + NegativeTaggedTransition( + std::set const& tags, + RegexNFAState const* dest_state + ) + : m_tags(tags), + m_dest_state(dest_state) {} + std::set m_tags; RegexNFAState const* m_dest_state{}; }; From 65861c3a7f03baaabce8e42426b46ff00bf1e6b8 Mon Sep 17 
00:00:00 2001 From: SharafMohamed Date: Thu, 10 Oct 2024 14:48:03 -0400 Subject: [PATCH 047/323] Add getters to tagged transition classes. --- src/log_surgeon/Lexer.tpp | 4 +-- src/log_surgeon/finite_automata/RegexNFA.hpp | 16 +++++++++ tests/test-NFA.cpp | 36 +++++++++++++------- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index d51e44ac..425c4759 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -414,10 +414,10 @@ auto Lexer::epsilon_closure(NFAStateType const* stat // TODO: currently treat tagged transitions as epsilon transitions for (auto const& positive_tagged_transition : t->get_positive_tagged_transitions()) { - stack.push(positive_tagged_transition.m_dest_state); + stack.push(positive_tagged_transition.get_dest_state()); } for (auto const& negative_tagged_transition : t->get_negative_tagged_transitions()) { - stack.push(negative_tagged_transition.m_dest_state); + stack.push(negative_tagged_transition.get_dest_state()); } } } diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 5f255b01..77738ab6 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -27,16 +27,25 @@ class RegexNFAState; template class PositiveTaggedTransition { +public: PositiveTaggedTransition(uint32_t const tag, RegexNFAState const* dest_state) : m_tag(tag), m_dest_state(dest_state) {} + [[nodiscard]] auto get_tag() const -> uint32_t { return m_tag; } + + [[nodiscard]] auto get_dest_state() const -> RegexNFAState const* { + return m_dest_state; + } + +private: uint32_t m_tag{}; RegexNFAState const* m_dest_state{}; }; template class NegativeTaggedTransition { +public: NegativeTaggedTransition( std::set const& tags, RegexNFAState const* dest_state @@ -44,6 +53,13 @@ class NegativeTaggedTransition { : m_tags(tags), m_dest_state(dest_state) {} + [[nodiscard]] auto get_tags() const 
-> std::set const& { return m_tags; } + + [[nodiscard]] auto get_dest_state() const -> RegexNFAState const* { + return m_dest_state; + } + +private: std::set m_tags; RegexNFAState const* m_dest_state{}; }; diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 23002a8c..e57600b2 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -70,11 +70,15 @@ TEST_CASE("Test NFA", "[NFA]") { for (auto const* dest_state : current_state->get_epsilon_transitions()) { add_to_queue(dest_state); } - for (auto const& [tag, dest_state] : current_state->get_positive_tagged_transitions()) { - add_to_queue(dest_state); + for (auto const& positive_tagged_transition : + current_state->get_positive_tagged_transitions()) + { + add_to_queue(positive_tagged_transition.get_dest_state()); } - for (auto const& [tags, dest_state] : current_state->get_negative_tagged_transitions()) { - add_to_queue(dest_state); + for (auto const& negative_tagged_transition : + current_state->get_negative_tagged_transitions()) + { + add_to_queue(negative_tagged_transition.get_dest_state()); } } @@ -101,24 +105,30 @@ TEST_CASE("Test NFA", "[NFA]") { } serialized_nfa += "},epsilon_transitions={"; for (auto const* dest_state : current_state->get_epsilon_transitions()) { - serialized_nfa += std::to_string(state_ids.find(dest_state)->second) + ","; + serialized_nfa += std::to_string(state_ids.at(dest_state)) + ","; add_to_queue(dest_state); } serialized_nfa += "},positive_tagged_transitions={"; - for (auto const& [tag, dest_state] : current_state->get_positive_tagged_transitions()) { - serialized_nfa += std::to_string(state_ids.find(dest_state)->second); - serialized_nfa += "[" + std::to_string(tag) + "],"; - add_to_queue(dest_state); + for (auto const& positive_tagged_transition : + current_state->get_positive_tagged_transitions()) + { + serialized_nfa + += std::to_string(state_ids.at(positive_tagged_transition.get_dest_state())); + serialized_nfa += "[" + std::to_string(positive_tagged_transition.get_tag()) 
+ "],"; + add_to_queue(positive_tagged_transition.get_dest_state()); } serialized_nfa += "},negative_tagged_transitions={"; - for (auto const& [tags, dest_state] : current_state->get_negative_tagged_transitions()) { - serialized_nfa += std::to_string(state_ids.find(dest_state)->second); + for (auto const& negative_tagged_transition : + current_state->get_negative_tagged_transitions()) + { + serialized_nfa + += std::to_string(state_ids.at(negative_tagged_transition.get_dest_state())); serialized_nfa += "["; - for (auto const& tag : tags) { + for (auto const& tag : negative_tagged_transition.get_tags()) { serialized_nfa += std::to_string(tag) + ","; } serialized_nfa += "],"; - add_to_queue(dest_state); + add_to_queue(negative_tagged_transition.get_dest_state()); } serialized_nfa += "}"; serialized_nfa += "\n"; From dfb7dcf3f5ae298a2c14a82dd4b0263d5450851d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 10 Oct 2024 14:49:20 -0400 Subject: [PATCH 048/323] Use emplace_back instead of push_back for tagged transitions. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 77738ab6..21a515f4 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -85,9 +85,7 @@ class RegexNFAState { uint32_t const tag, RegexNFAState const* dest_state ) -> void { - m_positive_tagged_transitions.push_back( - PositiveTaggedTransition{tag, dest_state} - ); + m_positive_tagged_transitions.emplace_back(tag, dest_state); } [[nodiscard]] auto get_positive_tagged_transitions( @@ -99,9 +97,7 @@ class RegexNFAState { std::set const& tags, RegexNFAState const* dest_state ) -> void { - m_negative_tagged_transitions.push_back( - NegativeTaggedTransition{tags, dest_state} - ); + m_negative_tagged_transitions.emplace_back(tags, dest_state); } [[nodiscard]] auto get_negative_tagged_transitions( From 473787ef2b97f46c9fd35d3f12b16be1e0cfd106 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 10 Oct 2024 14:52:23 -0400 Subject: [PATCH 049/323] Use `const` for `factor`. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp b/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp index 7e1b0763..a3646856 100644 --- a/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp +++ b/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp @@ -170,7 +170,7 @@ auto UnicodeIntervalTree::Node::balance_factor() -> int { template auto UnicodeIntervalTree::Node::balance(std::unique_ptr node ) -> std::unique_ptr::Node> { - auto factor = node->balance_factor(); + auto const factor = node->balance_factor(); if (factor * factor <= 1) { return node; } From 2a08121da61a4523ca88ce94175b9c5ab1f5382f Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 10 Oct 2024 14:54:14 -0400 Subject: [PATCH 050/323] Use `const` for `sub_factor`. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp b/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp index a3646856..e5b2413f 100644 --- a/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp +++ b/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp @@ -174,7 +174,7 @@ auto UnicodeIntervalTree::Node::balance(std::unique_ptr node if (factor * factor <= 1) { return node; } - auto sub_factor + auto const sub_factor = (factor < 0) ? node->m_left->balance_factor() : node->m_right->balance_factor(); if (factor * sub_factor > 0) { return Node::rotate(std::move(node), factor); From 0b7e38b1a1d2b250c0240f64d6598d73a8596bf1 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 10 Oct 2024 14:55:27 -0400 Subject: [PATCH 051/323] Use list initialization for `rule`. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- tests/test-NFA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index e57600b2..514fa2b6 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -41,7 +41,7 @@ TEST_CASE("Test NFA", "[NFA]") { auto const schema_ast = schema.release_schema_ast_ptr(); auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); ByteNFA nfa; - ByteLexicalRule rule(0, std::move(capture_rule_ast.m_regex_ptr)); + ByteLexicalRule rule{0, std::move(capture_rule_ast.m_regex_ptr)}; rule.add_to_nfa(&nfa); // Add helper for updating state_queue and visited_states From f3b0f6a2440d7688c7fd20b89d03de0fd6017c3b Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 10 Oct 2024 15:08:27 -0400 Subject: [PATCH 052/323] Use list initialization for `var_schema`. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- tests/test-NFA.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 514fa2b6..4108b21f 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -32,10 +32,12 @@ using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr((?(a)|(b))|(?(c)|(d))))B(?" - "\\d+)C)"); + string const var_name{"capture"}; + string const var_schema{ + var_name + ":" + + "Z|(A(?((?(a)|(b))|(?(c)|(d))))B(?" + "\\d+)C)" + }; schema.add_variable(var_schema, -1); auto const schema_ast = schema.release_schema_ast_ptr(); From 74793b33260359cba607ead5de91116b97e3092d Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 10 Oct 2024 15:09:38 -0400 Subject: [PATCH 053/323] Group `visited_states` modifications together. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- tests/test-NFA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 4108b21f..522e36ef 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -86,8 +86,8 @@ TEST_CASE("Test NFA", "[NFA]") { // Serialize NFA std::string serialized_nfa; - visited_states.clear(); state_queue.push(root); + visited_states.clear(); visited_states.insert(root); while (false == state_queue.empty()) { auto const* current_state = state_queue.front(); From d2f38fa04c3c2de7908f614758e6d5710b68305d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 10 Oct 2024 15:24:31 -0400 Subject: [PATCH 054/323] Use unordered_map instead of map for state_ids. --- tests/test-NFA.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 522e36ef..6148e00e 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -19,6 +19,7 @@ using log_surgeon::finite_automata::RegexNFAByteState; using log_surgeon::Schema; using log_surgeon::SchemaVarAST; using std::string; +using std::unordered_map; using ByteLexicalRule = log_surgeon::LexicalRule; using ByteNFA = log_surgeon::finite_automata::RegexNFA; @@ -56,7 +57,7 @@ TEST_CASE("Test NFA", "[NFA]") { }; // Assign state IDs - std::map state_ids; + unordered_map state_ids; auto const* root = nfa.get_root(); state_queue.push(root); visited_states.insert(root); From a7f7a14df7201b7c6b048bbe1aa0c1ceba04e1ef Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 10 Oct 2024 15:35:44 -0400 Subject: [PATCH 055/323] Make add_to_queue lambda a helper called add_to_queue_and_visited. 
--- tests/test-NFA.cpp | 64 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 6148e00e..1c853c44 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -18,8 +18,10 @@ using log_surgeon::cSizeOfByte; using log_surgeon::finite_automata::RegexNFAByteState; using log_surgeon::Schema; using log_surgeon::SchemaVarAST; +using std::queue; using std::string; using std::unordered_map; +using std::unordered_set; using ByteLexicalRule = log_surgeon::LexicalRule; using ByteNFA = log_surgeon::finite_automata::RegexNFA; @@ -31,6 +33,30 @@ using RegexASTMultiplicationByte = log_surgeon::finite_automata::RegexASTMultiplication; using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr; +namespace { +/** + * Add a destination state to the queue and set of visited states if it has not yet been visited. + * @param dest_state + * @param visited_states + * @param state_queue + */ +auto add_to_queue_and_visited( + RegexNFAByteState const* dest_state, + queue& state_queue, + unordered_set& visited_states +) -> void; + +auto add_to_queue_and_visited( + RegexNFAByteState const* dest_state, + queue& state_queue, + unordered_set& visited_states +) -> void { + if (visited_states.insert(dest_state).second) { + state_queue.push(dest_state); + } +} +} // namespace + TEST_CASE("Test NFA", "[NFA]") { Schema schema; string const var_name{"capture"}; @@ -47,14 +73,8 @@ TEST_CASE("Test NFA", "[NFA]") { ByteLexicalRule rule{0, std::move(capture_rule_ast.m_regex_ptr)}; rule.add_to_nfa(&nfa); - // Add helper for updating state_queue and visited_states std::queue state_queue; std::unordered_set visited_states; - auto add_to_queue = [&](auto const* dest_state) { - if (visited_states.insert(dest_state).second) { - state_queue.push(dest_state); - } - }; // Assign state IDs unordered_map state_ids; @@ -67,21 +87,29 @@ TEST_CASE("Test NFA", "[NFA]") { state_ids.insert({current_state, 
state_ids.size()}); for (uint32_t idx = 0; idx < cSizeOfByte; idx++) { for (auto const* dest_state : current_state->get_byte_transitions(idx)) { - add_to_queue(dest_state); + add_to_queue_and_visited(dest_state, state_queue, visited_states); } } for (auto const* dest_state : current_state->get_epsilon_transitions()) { - add_to_queue(dest_state); + add_to_queue_and_visited(dest_state, state_queue, visited_states); } for (auto const& positive_tagged_transition : current_state->get_positive_tagged_transitions()) { - add_to_queue(positive_tagged_transition.get_dest_state()); + add_to_queue_and_visited( + positive_tagged_transition.get_dest_state(), + state_queue, + visited_states + ); } for (auto const& negative_tagged_transition : current_state->get_negative_tagged_transitions()) { - add_to_queue(negative_tagged_transition.get_dest_state()); + add_to_queue_and_visited( + negative_tagged_transition.get_dest_state(), + state_queue, + visited_states + ); } } @@ -103,13 +131,13 @@ TEST_CASE("Test NFA", "[NFA]") { for (auto const* dest_state : current_state->get_byte_transitions(idx)) { serialized_nfa += std::string(1, static_cast(idx)) + "-->" + std::to_string(state_ids.find(dest_state)->second) + ","; - add_to_queue(dest_state); + add_to_queue_and_visited(dest_state, state_queue, visited_states); } } serialized_nfa += "},epsilon_transitions={"; for (auto const* dest_state : current_state->get_epsilon_transitions()) { serialized_nfa += std::to_string(state_ids.at(dest_state)) + ","; - add_to_queue(dest_state); + add_to_queue_and_visited(dest_state, state_queue, visited_states); } serialized_nfa += "},positive_tagged_transitions={"; for (auto const& positive_tagged_transition : @@ -118,7 +146,11 @@ TEST_CASE("Test NFA", "[NFA]") { serialized_nfa += std::to_string(state_ids.at(positive_tagged_transition.get_dest_state())); serialized_nfa += "[" + std::to_string(positive_tagged_transition.get_tag()) + "],"; - add_to_queue(positive_tagged_transition.get_dest_state()); + 
add_to_queue_and_visited( + positive_tagged_transition.get_dest_state(), + state_queue, + visited_states + ); } serialized_nfa += "},negative_tagged_transitions={"; for (auto const& negative_tagged_transition : @@ -131,7 +163,11 @@ TEST_CASE("Test NFA", "[NFA]") { serialized_nfa += std::to_string(tag) + ","; } serialized_nfa += "],"; - add_to_queue(negative_tagged_transition.get_dest_state()); + add_to_queue_and_visited( + negative_tagged_transition.get_dest_state(), + state_queue, + visited_states + ); } serialized_nfa += "}"; serialized_nfa += "\n"; From d244a809563166762a7016fdd633c20daa08203c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 10 Oct 2024 15:45:49 -0400 Subject: [PATCH 056/323] Replace const& with std::move when dealing with negative_tags. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 21a515f4..e678fa67 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -47,10 +47,10 @@ template class NegativeTaggedTransition { public: NegativeTaggedTransition( - std::set const& tags, + std::set tags, RegexNFAState const* dest_state ) - : m_tags(tags), + : m_tags(std::move(tags)), m_dest_state(dest_state) {} [[nodiscard]] auto get_tags() const -> std::set const& { return m_tags; } @@ -94,10 +94,10 @@ class RegexNFAState { } auto add_negative_tagged_transition( - std::set const& tags, + std::set tags, RegexNFAState const* dest_state ) -> void { - m_negative_tagged_transitions.emplace_back(tags, dest_state); + m_negative_tagged_transitions.emplace_back(std::move(tags), dest_state); } [[nodiscard]] auto get_negative_tagged_transitions( From 158df37a47101cc63d3e6541f677be042c3514dc Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 10 Oct 2024 16:11:57 -0400 Subject: [PATCH 057/323] Run auto-formatter. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index e678fa67..19d497cc 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -46,10 +46,7 @@ class PositiveTaggedTransition { template class NegativeTaggedTransition { public: - NegativeTaggedTransition( - std::set tags, - RegexNFAState const* dest_state - ) + NegativeTaggedTransition(std::set tags, RegexNFAState const* dest_state) : m_tags(std::move(tags)), m_dest_state(dest_state) {} From f82b46fff75810302167c3ec214de48890905f43 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 16 Oct 2024 15:09:14 -0400 Subject: [PATCH 058/323] Remove incorrect comment. --- src/log_surgeon/Lexer.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 3d46a08c..020b1599 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -49,7 +49,6 @@ class LexicalRule { template class Lexer { public: - // std::vector can be declared as constexpr in c++20 static inline std::vector const cTokenEndTypes = {(uint32_t)SymbolId::TokenEnd}; static inline std::vector const cTokenUncaughtStringTypes = {(uint32_t)SymbolId::TokenUncaughtString}; From c87caf9cdbb998ed498ff281a8eb9d2515026722 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 20 Oct 2024 05:34:56 -0400 Subject: [PATCH 059/323] Move LexicalRule to its own class; Pass rules into NFA construction; Use more using in test-NFA.cpp. 
--- CMakeLists.txt | 1 + src/log_surgeon/Lexer.hpp | 29 +----------- src/log_surgeon/Lexer.tpp | 13 +----- src/log_surgeon/LexicalRule.hpp | 46 ++++++++++++++++++ src/log_surgeon/finite_automata/RegexAST.hpp | 4 +- src/log_surgeon/finite_automata/RegexNFA.hpp | 11 ++++- tests/test-NFA.cpp | 49 +++++++++++--------- 7 files changed, 88 insertions(+), 65 deletions(-) create mode 100644 src/log_surgeon/LexicalRule.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f7d78ef..8a9916a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,6 +73,7 @@ set(SOURCE_FILES src/log_surgeon/LALR1Parser.tpp src/log_surgeon/Lexer.hpp src/log_surgeon/Lexer.tpp + src/log_surgeon/LexicalRule.hpp src/log_surgeon/LogEvent.cpp src/log_surgeon/LogEvent.hpp src/log_surgeon/LogParser.cpp diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 020b1599..ddb12cfa 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -14,38 +14,11 @@ #include #include #include +#include #include #include namespace log_surgeon { -template -class LexicalRule { -public: - // Constructor - LexicalRule( - uint32_t const variable_id, - std::unique_ptr> regex - ) - : m_variable_id(variable_id), - m_regex(std::move(regex)) {} - - /** - * Adds AST representing the lexical rule to the NFA - * @param nfa - */ - auto add_to_nfa(finite_automata::RegexNFA* nfa) const -> void; - - [[nodiscard]] auto get_variable_id() const -> uint32_t { return m_variable_id; } - - [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { - return m_regex.get(); - } - -private: - uint32_t m_variable_id; - std::unique_ptr> m_regex; -}; - template class Lexer { public: diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 425c4759..9e51dfdf 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -374,10 +374,7 @@ auto Lexer::get_rule(uint32_t const variable_id template void Lexer::generate() { - finite_automata::RegexNFA nfa; - for (auto const& rule : 
m_rules) { - rule.add_to_nfa(&nfa); - } + finite_automata::RegexNFA nfa(m_rules); // TODO: DFA ignores tags. E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" m_dfa = nfa_to_dfa(nfa); DFAStateType const* state = m_dfa->get_root(); @@ -390,14 +387,6 @@ void Lexer::generate() { } } -template -void LexicalRule::add_to_nfa(finite_automata::RegexNFA* nfa) const { - auto* end_state = nfa->new_state(); - end_state->set_accepting(true); - end_state->set_matching_variable_id(m_variable_id); - m_regex->add_to_nfa_with_negative_tags(nfa, end_state); -} - template auto Lexer::epsilon_closure(NFAStateType const* state_ptr ) -> std::set { diff --git a/src/log_surgeon/LexicalRule.hpp b/src/log_surgeon/LexicalRule.hpp new file mode 100644 index 00000000..9e038e7c --- /dev/null +++ b/src/log_surgeon/LexicalRule.hpp @@ -0,0 +1,46 @@ +#ifndef LOG_SURGEON_LEXICAL_RULE_HPP +#define LOG_SURGEON_LEXICAL_RULE_HPP +#include +#include + +#include + +namespace log_surgeon { +template +class LexicalRule { +public: + // Constructor + LexicalRule( + uint32_t const variable_id, + std::unique_ptr> regex + ) + : m_variable_id(variable_id), + m_regex(std::move(regex)) {} + + /** + * Adds AST representing the lexical rule to the NFA + * @param nfa + */ + auto add_to_nfa(finite_automata::RegexNFA* nfa) const -> void; + + [[nodiscard]] auto get_variable_id() const -> uint32_t { return m_variable_id; } + + [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { + return m_regex.get(); + } + +private: + uint32_t m_variable_id; + std::unique_ptr> m_regex; +}; + +template +void LexicalRule::add_to_nfa(finite_automata::RegexNFA* nfa) const { + auto* end_state = nfa->new_state(); + end_state->set_accepting(true); + end_state->set_matching_variable_id(m_variable_id); + m_regex->add_to_nfa_with_negative_tags(nfa, end_state); +} +} // namespace log_surgeon + +#endif // LOG_SURGEON_LEXER_HPP diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp 
b/src/log_surgeon/finite_automata/RegexAST.hpp index 57dec64b..edfa7b0e 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -19,9 +19,11 @@ #include #include -#include +#include namespace log_surgeon::finite_automata { +template +class RegexNFA; // TODO: rename `RegexAST` to `RegexASTNode` /** diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 19d497cc..ba537fa9 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -15,6 +15,7 @@ #include #include +#include namespace log_surgeon::finite_automata { enum class RegexNFAStateType : uint8_t { @@ -166,7 +167,7 @@ class RegexNFA { public: using StateVec = std::vector; - RegexNFA() : m_root{new_state()} {} + explicit RegexNFA(std::vector> const& m_rules); /** * Create a unique_ptr for an NFA state and add it to m_states @@ -239,6 +240,14 @@ void RegexNFAState::add_interval(Interval interval, RegexNFAState* d } } +template +RegexNFA::RegexNFA(std::vector> const& m_rules) + : m_root{new_state()} { + for (auto const& rule : m_rules) { + rule.add_to_nfa(this); + } +} + template void RegexNFA::reverse() { // add new end with all accepting pointing to it diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 1c853c44..73b5495e 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -20,8 +20,11 @@ using log_surgeon::Schema; using log_surgeon::SchemaVarAST; using std::queue; using std::string; +using std::stringstream; +using std::to_string; using std::unordered_map; using std::unordered_set; +using std::vector; using ByteLexicalRule = log_surgeon::LexicalRule; using ByteNFA = log_surgeon::finite_automata::RegexNFA; @@ -69,12 +72,12 @@ TEST_CASE("Test NFA", "[NFA]") { auto const schema_ast = schema.release_schema_ast_ptr(); auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); - ByteNFA nfa; - ByteLexicalRule rule{0, 
std::move(capture_rule_ast.m_regex_ptr)}; - rule.add_to_nfa(&nfa); + vector rules; + rules.emplace_back(0, move(capture_rule_ast.m_regex_ptr)); + ByteNFA nfa(rules); - std::queue state_queue; - std::unordered_set visited_states; + queue state_queue; + unordered_set visited_states; // Assign state IDs unordered_map state_ids; @@ -114,29 +117,29 @@ TEST_CASE("Test NFA", "[NFA]") { } // Serialize NFA - std::string serialized_nfa; + string serialized_nfa; state_queue.push(root); visited_states.clear(); visited_states.insert(root); while (false == state_queue.empty()) { auto const* current_state = state_queue.front(); state_queue.pop(); - serialized_nfa += std::to_string(state_ids.at(current_state)) + ":"; + serialized_nfa += to_string(state_ids.at(current_state)) + ":"; if (current_state->is_accepting()) { serialized_nfa += "accepting_tag=" - + std::to_string(current_state->get_matching_variable_id()) + ","; + + to_string(current_state->get_matching_variable_id()) + ","; } serialized_nfa += "byte_transitions={"; for (uint32_t idx = 0; idx < cSizeOfByte; idx++) { for (auto const* dest_state : current_state->get_byte_transitions(idx)) { - serialized_nfa += std::string(1, static_cast(idx)) + "-->" - + std::to_string(state_ids.find(dest_state)->second) + ","; + serialized_nfa += string(1, static_cast(idx)) + "-->" + + to_string(state_ids.find(dest_state)->second) + ","; add_to_queue_and_visited(dest_state, state_queue, visited_states); } } serialized_nfa += "},epsilon_transitions={"; for (auto const* dest_state : current_state->get_epsilon_transitions()) { - serialized_nfa += std::to_string(state_ids.at(dest_state)) + ","; + serialized_nfa += to_string(state_ids.at(dest_state)) + ","; add_to_queue_and_visited(dest_state, state_queue, visited_states); } serialized_nfa += "},positive_tagged_transitions={"; @@ -144,8 +147,8 @@ TEST_CASE("Test NFA", "[NFA]") { current_state->get_positive_tagged_transitions()) { serialized_nfa - += 
std::to_string(state_ids.at(positive_tagged_transition.get_dest_state())); - serialized_nfa += "[" + std::to_string(positive_tagged_transition.get_tag()) + "],"; + += to_string(state_ids.at(positive_tagged_transition.get_dest_state())); + serialized_nfa += "[" + to_string(positive_tagged_transition.get_tag()) + "],"; add_to_queue_and_visited( positive_tagged_transition.get_dest_state(), state_queue, @@ -157,10 +160,10 @@ TEST_CASE("Test NFA", "[NFA]") { current_state->get_negative_tagged_transitions()) { serialized_nfa - += std::to_string(state_ids.at(negative_tagged_transition.get_dest_state())); + += to_string(state_ids.at(negative_tagged_transition.get_dest_state())); serialized_nfa += "["; for (auto const& tag : negative_tagged_transition.get_tags()) { - serialized_nfa += std::to_string(tag) + ","; + serialized_nfa += to_string(tag) + ","; } serialized_nfa += "],"; add_to_queue_and_visited( @@ -174,7 +177,7 @@ TEST_CASE("Test NFA", "[NFA]") { } // Compare against expected output - std::string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2,}," + string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2,}," "epsilon_transitions={}," "positive_tagged_transitions={}," "negative_tagged_transitions={}\n"; @@ -230,15 +233,15 @@ TEST_CASE("Test NFA", "[NFA]") { "negative_tagged_transitions={}\n"; // Compare expected and actual line-by-line - std::stringstream ss_actual(serialized_nfa); - std::stringstream ss_expected(expected_serialized_nfa); - std::string actual_line; - std::string expected_line; - while (std::getline(ss_actual, actual_line) && std::getline(ss_expected, expected_line)) { + stringstream ss_actual(serialized_nfa); + stringstream ss_expected(expected_serialized_nfa); + string actual_line; + string expected_line; + while (getline(ss_actual, actual_line) && getline(ss_expected, expected_line)) { REQUIRE(actual_line == expected_line); } - std::getline(ss_actual, actual_line); + getline(ss_actual, actual_line); 
REQUIRE(actual_line.empty()); - std::getline(ss_expected, expected_line); + getline(ss_expected, expected_line); REQUIRE(expected_line.empty()); } From abe55e265a66e52601107ed460df5f46a0945993 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 20 Oct 2024 07:59:05 -0400 Subject: [PATCH 060/323] Add tagged transitions during RegexNFAState construction; Remove unused functions. --- src/log_surgeon/finite_automata/RegexAST.hpp | 7 +- src/log_surgeon/finite_automata/RegexNFA.hpp | 194 +++++-------------- 2 files changed, 54 insertions(+), 147 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index edfa7b0e..84d682ed 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -107,9 +107,9 @@ class RegexAST { // Handle negative tags as: // root --(regex transitions)--> intermediate_state --(negative tags)--> end_state if (false == m_negative_tags.empty()) { - auto* intermediate_state = nfa->new_state(); + auto* intermediate_state + = nfa->new_state_with_negative_tagged_transitions(m_negative_tags, end_state); add_to_nfa(nfa, intermediate_state); - intermediate_state->add_negative_tagged_transition(m_negative_tags, end_state); } else { add_to_nfa(nfa, end_state); } @@ -893,9 +893,8 @@ template template void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const { - auto* intermediate_state = nfa->new_state(); + auto* intermediate_state = nfa->new_state_with_a_positive_tagged_transition(m_tag, end_state); m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, intermediate_state); - intermediate_state->add_positive_tagged_transition(m_tag, end_state); } template diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index ba537fa9..8f792cfd 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -65,6 +65,16 @@ class 
NegativeTaggedTransition { template class RegexNFAState { public: + RegexNFAState() = default; + + explicit RegexNFAState(uint32_t const tag, RegexNFAState const* dest_state) { + m_positive_tagged_transitions.emplace_back(tag, dest_state); + } + + explicit RegexNFAState(std::set tags, RegexNFAState const* dest_state) { + m_negative_tagged_transitions.emplace_back(std::move(tags), dest_state); + } + using Tree = UnicodeIntervalTree; auto set_accepting(bool accepting) -> void { m_accepting = accepting; } @@ -79,61 +89,33 @@ class RegexNFAState { return m_matching_variable_id; } - auto add_positive_tagged_transition( - uint32_t const tag, - RegexNFAState const* dest_state - ) -> void { - m_positive_tagged_transitions.emplace_back(tag, dest_state); - } - [[nodiscard]] auto get_positive_tagged_transitions( ) const -> std::vector> const& { return m_positive_tagged_transitions; } - auto add_negative_tagged_transition( - std::set tags, - RegexNFAState const* dest_state - ) -> void { - m_negative_tagged_transitions.emplace_back(std::move(tags), dest_state); - } - [[nodiscard]] auto get_negative_tagged_transitions( ) const -> std::vector> const& { return m_negative_tagged_transitions; } - auto set_epsilon_transitions(std::vector& epsilon_transitions) -> void { - m_epsilon_transitions = epsilon_transitions; - } - auto add_epsilon_transition(RegexNFAState* epsilon_transition) -> void { m_epsilon_transitions.push_back(epsilon_transition); } - auto clear_epsilon_transitions() -> void { m_epsilon_transitions.clear(); } - [[nodiscard]] auto get_epsilon_transitions() const -> std::vector const& { return m_epsilon_transitions; } - auto set_byte_transitions(uint8_t byte, std::vector& byte_transitions) -> void { - m_bytes_transitions[byte] = byte_transitions; - } - auto add_byte_transition(uint8_t byte, RegexNFAState* dest_state) -> void { m_bytes_transitions[byte].push_back(dest_state); } - auto clear_byte_transitions(uint8_t byte) -> void { m_bytes_transitions[byte].clear(); } - 
[[nodiscard]] auto get_byte_transitions(uint8_t byte ) const -> std::vector const& { return m_bytes_transitions[byte]; } - auto reset_tree_transitions() -> void { m_tree_transitions.reset(); } - auto get_tree_transitions() -> Tree const& { return m_tree_transitions; } /** @@ -170,11 +152,30 @@ class RegexNFA { explicit RegexNFA(std::vector> const& m_rules); /** - * Create a unique_ptr for an NFA state and add it to m_states + * Create a unique_ptr for an NFA state with no tagged transitions and add it to m_states. * @return NFAStateType* */ auto new_state() -> NFAStateType*; + /** + * Create a unique_ptr for an NFA state with a positive tagged transition and add it to + * m_states. + * @return NFAStateType* + */ + auto new_state_with_a_positive_tagged_transition( + uint32_t tag, + NFAStateType const* dest_state + ) -> NFAStateType*; + + /** + * Create a unique_ptr for an NFA state with negative tagged transitions and add it to m_states. + * @return NFAStateType* + */ + auto new_state_with_negative_tagged_transitions( + std::set tags, + NFAStateType const* dest_state + ) -> NFAStateType*; + /** * Reverse the NFA such that it matches on its reverse language */ @@ -249,128 +250,35 @@ RegexNFA::RegexNFA(std::vector> const& m } template -void RegexNFA::reverse() { - // add new end with all accepting pointing to it - NFAStateType* new_end = new_state(); - for (std::unique_ptr& state_ptr : m_states) { - if (state_ptr->is_accepting()) { - state_ptr->add_epsilon_transition(new_end); - state_ptr->set_accepting(false); - } - } - // move edges from NFA to maps - std::map, std::vector> byte_edges; - std::map, bool> epsilon_edges; - for (std::unique_ptr& src_state_ptr : m_states) { - // TODO: handle utf8 case with if constexpr (RegexNFAUTF8State == - // NFAStateType) ~ don't really need this though - for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { - for (NFAStateType* dest_state_ptr : src_state_ptr->get_byte_transitions(byte)) { - std::pair edge{src_state_ptr.get(), 
dest_state_ptr}; - byte_edges[edge].push_back(byte); - } - src_state_ptr->clear_byte_transitions(byte); - } - for (NFAStateType* dest_state_ptr : src_state_ptr->get_epsilon_transitions()) { - epsilon_edges - [std::pair(src_state_ptr.get(), dest_state_ptr)] - = true; - } - src_state_ptr->clear_epsilon_transitions(); - } - - // insert edges from maps back into NFA, but in the reverse direction - for (std::unique_ptr& src_state_ptr : m_states) { - for (std::unique_ptr& dest_state_ptr : m_states) { - std::pair key(src_state_ptr.get(), dest_state_ptr.get()); - auto byte_it = byte_edges.find(key); - if (byte_it != byte_edges.end()) { - for (uint8_t byte : byte_it->second) { - dest_state_ptr->add_byte_transition(byte, src_state_ptr.get()); - } - } - auto epsilon_it = epsilon_edges.find(key); - if (epsilon_it != epsilon_edges.end()) { - dest_state_ptr->add_epsilon_transition(src_state_ptr.get()); - } - } - } - - // propagate matching_variable_id from old accepting m_states - for (NFAStateType* old_accepting_state : new_end->get_epsilon_transitions()) { - auto const matching_variable_id = old_accepting_state->get_matching_variable_id(); - std::stack unvisited_states; - std::set visited_states; - unvisited_states.push(old_accepting_state); - while (!unvisited_states.empty()) { - NFAStateType* current_state = unvisited_states.top(); - current_state->set_matching_variable_id(matching_variable_id); - unvisited_states.pop(); - visited_states.insert(current_state); - for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { - std::vector byte_transitions - = current_state->get_byte_transitions(byte); - for (NFAStateType* next_state : byte_transitions) { - if (false == visited_states.contains(next_state)) { - unvisited_states.push(next_state); - } - } - } - for (NFAStateType* next_state : current_state->get_epsilon_transitions()) { - if (false == visited_states.contains(next_state)) { - unvisited_states.push(next_state); - } - } - } - } - for (int32_t i = m_states.size() - 1; i >= 0; 
--i) { - std::unique_ptr& src_state_unique_ptr = m_states[i]; - NFAStateType* src_state = src_state_unique_ptr.get(); - auto const matching_variable_id = src_state->get_matching_variable_id(); - for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { - std::vector byte_transitions = src_state->get_byte_transitions(byte); - for (int32_t j = byte_transitions.size() - 1; j >= 0; --j) { - NFAStateType*& dest_state = byte_transitions[j]; - if (dest_state == m_root) { - dest_state = new_state(); - assert(dest_state != nullptr); - dest_state->set_matching_variable_id(matching_variable_id); - dest_state->set_accepting(true); - } - } - src_state->clear_byte_transitions(byte); - src_state->set_byte_transitions(byte, byte_transitions); - } - std::vector epsilon_transitions = src_state->get_epsilon_transitions(); - for (int32_t j = epsilon_transitions.size() - 1; j >= 0; --j) { - NFAStateType*& dest_state = epsilon_transitions[j]; - if (dest_state == m_root) { - dest_state = new_state(); - dest_state->set_matching_variable_id(src_state->get_matching_variable_id()); - dest_state->set_accepting(true); - } - } - src_state->clear_epsilon_transitions(); - src_state->set_epsilon_transitions(epsilon_transitions); - } +auto RegexNFA::new_state() -> NFAStateType* { + std::unique_ptr ptr = std::make_unique(); + NFAStateType* state = ptr.get(); + m_states.push_back(std::move(ptr)); + return state; +} - for (uint32_t i = 0; i < m_states.size(); i++) { - if (m_states[i].get() == m_root) { - m_states.erase(m_states.begin() + i); - break; - } - } - // start from the end - m_root = new_end; +template +auto RegexNFA::new_state_with_a_positive_tagged_transition( + uint32_t const tag, + NFAStateType const* dest_state +) -> NFAStateType* { + std::unique_ptr ptr = std::make_unique(tag, dest_state); + NFAStateType* state = ptr.get(); + m_states.push_back(std::move(ptr)); + return state; } template -auto RegexNFA::new_state() -> NFAStateType* { - std::unique_ptr ptr = std::make_unique(); +auto 
RegexNFA::new_state_with_negative_tagged_transitions( + std::set tags, + NFAStateType const* dest_state +) -> NFAStateType* { + std::unique_ptr ptr = std::make_unique(tags, dest_state); NFAStateType* state = ptr.get(); m_states.push_back(std::move(ptr)); return state; } + } // namespace log_surgeon::finite_automata #endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_HPP From 6d1db102e33ab8b9ce8095b79501a6503ca2cc34 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 20 Oct 2024 08:04:41 -0400 Subject: [PATCH 061/323] Fix compiler errors in intersect-test. --- examples/intersect-test.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index 9b22dbd1..3a053cc3 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -14,6 +14,9 @@ using log_surgeon::ParserAST; using log_surgeon::SchemaVarAST; using std::string; using std::unique_ptr; +using std::vector; + +using ByteLexicalRule = log_surgeon::LexicalRule; auto get_intersect_for_query( std::map& m_id_symbol, @@ -30,13 +33,13 @@ auto get_intersect_for_query( } log_surgeon::Schema schema; schema.add_variable(string("search:") + processed_search_string, -1); - RegexNFA nfa; auto schema_ast = schema.release_schema_ast_ptr(); + vector rules; for (unique_ptr const& parser_ast : schema_ast->m_schema_vars) { auto* schema_var_ast = dynamic_cast(parser_ast.get()); - LexicalRule rule(0, std::move(schema_var_ast->m_regex_ptr)); - rule.add_to_nfa(&nfa); + rules.emplace_back(0, std::move(schema_var_ast->m_regex_ptr)); } + RegexNFA nfa(rules); auto dfa2 = ByteLexer::nfa_to_dfa(nfa); auto schema_types = dfa1->get_intersect(dfa2); std::cout << search_string << ":"; @@ -67,14 +70,14 @@ auto main() -> int { schema.add_variable("v6:123", -1); } std::map m_id_symbol; - RegexNFA nfa; auto schema_ast = schema.release_schema_ast_ptr(); + vector rules; for (unique_ptr const& parser_ast : schema_ast->m_schema_vars) { auto* var_ast = 
dynamic_cast(parser_ast.get()); - LexicalRule rule(m_id_symbol.size(), std::move(var_ast->m_regex_ptr)); + rules.emplace_back(m_id_symbol.size(), std::move(var_ast->m_regex_ptr)); m_id_symbol[m_id_symbol.size()] = var_ast->m_name; - rule.add_to_nfa(&nfa); } + RegexNFA nfa(rules); auto dfa = ByteLexer::nfa_to_dfa(nfa); get_intersect_for_query(m_id_symbol, dfa, "*1*"); get_intersect_for_query(m_id_symbol, dfa, "*a*"); From a5413d08c0c4284691afdf1a4ba5fdcb26575bf2 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 20 Oct 2024 08:21:14 -0400 Subject: [PATCH 062/323] Run linter. --- tests/test-NFA.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 73b5495e..48b148b8 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -146,8 +146,7 @@ TEST_CASE("Test NFA", "[NFA]") { for (auto const& positive_tagged_transition : current_state->get_positive_tagged_transitions()) { - serialized_nfa - += to_string(state_ids.at(positive_tagged_transition.get_dest_state())); + serialized_nfa += to_string(state_ids.at(positive_tagged_transition.get_dest_state())); serialized_nfa += "[" + to_string(positive_tagged_transition.get_tag()) + "],"; add_to_queue_and_visited( positive_tagged_transition.get_dest_state(), @@ -159,8 +158,7 @@ TEST_CASE("Test NFA", "[NFA]") { for (auto const& negative_tagged_transition : current_state->get_negative_tagged_transitions()) { - serialized_nfa - += to_string(state_ids.at(negative_tagged_transition.get_dest_state())); + serialized_nfa += to_string(state_ids.at(negative_tagged_transition.get_dest_state())); serialized_nfa += "["; for (auto const& tag : negative_tagged_transition.get_tags()) { serialized_nfa += to_string(tag) + ","; @@ -178,9 +176,9 @@ TEST_CASE("Test NFA", "[NFA]") { // Compare against expected output string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2,}," - "epsilon_transitions={}," - "positive_tagged_transitions={}," - 
"negative_tagged_transitions={}\n"; + "epsilon_transitions={}," + "positive_tagged_transitions={}," + "negative_tagged_transitions={}\n"; expected_serialized_nfa += "1:byte_transitions={a-->3,b-->3,c-->4,d-->4,}," "epsilon_transitions={}," "positive_tagged_transitions={}," From a4a4ab7fd984acb55d71062789c68229f34a2ba0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 20 Oct 2024 08:22:52 -0400 Subject: [PATCH 063/323] Fix headgaurd comment in LexicalRule.hpp. --- src/log_surgeon/LexicalRule.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/LexicalRule.hpp b/src/log_surgeon/LexicalRule.hpp index 9e038e7c..fbd544f6 100644 --- a/src/log_surgeon/LexicalRule.hpp +++ b/src/log_surgeon/LexicalRule.hpp @@ -43,4 +43,4 @@ void LexicalRule::add_to_nfa(finite_automata::RegexNFA Date: Sun, 20 Oct 2024 08:24:28 -0400 Subject: [PATCH 064/323] Run linter. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 8f792cfd..c2febd7d 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -162,10 +162,8 @@ class RegexNFA { * m_states. * @return NFAStateType* */ - auto new_state_with_a_positive_tagged_transition( - uint32_t tag, - NFAStateType const* dest_state - ) -> NFAStateType*; + auto new_state_with_a_positive_tagged_transition(uint32_t tag, NFAStateType const* dest_state) + -> NFAStateType*; /** * Create a unique_ptr for an NFA state with negative tagged transitions and add it to m_states. From abb2656ef78433f3202c0fb3ac3c5da63aab06c7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 20 Oct 2024 08:30:13 -0400 Subject: [PATCH 065/323] Improve naming of intermediate state for postive and negative tagged states. 
--- src/log_surgeon/finite_automata/RegexAST.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 84d682ed..676141c5 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -105,11 +105,11 @@ class RegexAST { auto add_to_nfa_with_negative_tags(RegexNFA* nfa, NFAStateType* end_state) const -> void { // Handle negative tags as: - // root --(regex transitions)--> intermediate_state --(negative tags)--> end_state + // root --(regex)--> state_with_negative_tagged_transitions --(negative tags)--> end_state if (false == m_negative_tags.empty()) { - auto* intermediate_state + auto* state_with_negative_tagged_transitions = nfa->new_state_with_negative_tagged_transitions(m_negative_tags, end_state); - add_to_nfa(nfa, intermediate_state); + add_to_nfa(nfa, state_with_negative_tagged_transitions); } else { add_to_nfa(nfa, end_state); } @@ -893,8 +893,9 @@ template template void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const { - auto* intermediate_state = nfa->new_state_with_a_positive_tagged_transition(m_tag, end_state); - m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, intermediate_state); + auto* state_with_a_positive_tagged_transition + = nfa->new_state_with_a_positive_tagged_transition(m_tag, end_state); + m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, state_with_a_positive_tagged_transition); } template From 2f1c588e007cb2b0601259af96bc34c112e7c118 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 20 Oct 2024 09:50:32 -0400 Subject: [PATCH 066/323] Move serialize method from test into classes; Clean up serialize code using fmt::format. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 172 ++++++++++++++++++- tests/test-NFA.cpp | 155 ++--------------- 2 files changed, 181 insertions(+), 146 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index c2febd7d..09c5bcb9 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -5,14 +5,17 @@ #include #include #include -#include #include -#include +#include #include #include +#include +#include #include #include +#include + #include #include #include @@ -26,6 +29,9 @@ enum class RegexNFAStateType : uint8_t { template class RegexNFAState; +using RegexNFAByteState = RegexNFAState; +using RegexNFAUTF8State = RegexNFAState; + template class PositiveTaggedTransition { public: @@ -39,6 +45,13 @@ class PositiveTaggedTransition { return m_dest_state; } + /** + * Serialize the positive tagged transition into a string. + */ + [[nodiscard]] auto serialize( + std::unordered_map const& state_ids + ) const -> std::string; + private: uint32_t m_tag{}; RegexNFAState const* m_dest_state{}; @@ -57,6 +70,13 @@ class NegativeTaggedTransition { return m_dest_state; } + /** + * Serialize the negative tagged transitions into a string. 
+ */ + [[nodiscard]] auto serialize( + std::unordered_map const& state_ids + ) const -> std::string; + private: std::set m_tags; RegexNFAState const* m_dest_state{}; @@ -65,6 +85,8 @@ class NegativeTaggedTransition { template class RegexNFAState { public: + using Tree = UnicodeIntervalTree; + RegexNFAState() = default; explicit RegexNFAState(uint32_t const tag, RegexNFAState const* dest_state) { @@ -75,8 +97,6 @@ class RegexNFAState { m_negative_tagged_transitions.emplace_back(std::move(tags), dest_state); } - using Tree = UnicodeIntervalTree; - auto set_accepting(bool accepting) -> void { m_accepting = accepting; } [[nodiscard]] auto is_accepting() const -> bool const& { return m_accepting; } @@ -126,6 +146,13 @@ class RegexNFAState { */ auto add_interval(Interval interval, RegexNFAState* dest_state) -> void; + /** + * Serialize the NFA state into a string. + */ + [[nodiscard]] auto serialize( + std::unordered_map const& state_ids + ) const -> std::string; + private: bool m_accepting{false}; uint32_t m_matching_variable_id{0}; @@ -140,9 +167,6 @@ class RegexNFAState { m_tree_transitions; }; -using RegexNFAByteState = RegexNFAState; -using RegexNFAUTF8State = RegexNFAState; - // TODO: rename `RegexNFA` to `NFA` template class RegexNFA { @@ -175,9 +199,9 @@ class RegexNFA { ) -> NFAStateType*; /** - * Reverse the NFA such that it matches on its reverse language + * Serialize the NFA into a string. */ - auto reverse() -> void; + [[nodiscard]] auto serialize() const -> std::string; auto add_root_interval(Interval interval, NFAStateType* dest_state) -> void { m_root->add_interval(interval, dest_state); @@ -188,10 +212,37 @@ class RegexNFA { auto get_root() -> NFAStateType* { return m_root; } private: + /** + * Add a destination state to the queue and set of visited states if it has not yet been + * visited. 
+ * @param dest_state + * @param visited_states + * @param state_queue + */ + static auto add_to_queue_and_visited( + RegexNFAByteState const* dest_state, + std::queue& state_queue, + std::unordered_set& visited_states + ) -> void; + std::vector> m_states; NFAStateType* m_root; }; +template +auto PositiveTaggedTransition::serialize( + std::unordered_map const& state_ids +) const -> std::string { + return fmt::format("{}[{}]", state_ids.at(get_dest_state()), get_tag()); +} + +template +auto NegativeTaggedTransition::serialize( + std::unordered_map const& state_ids +) const -> std::string { + return fmt::format("{}[{}]", state_ids.at(get_dest_state()), fmt::join(get_tags(), ",")); +} + template void RegexNFAState::add_interval(Interval interval, RegexNFAState* dest_state) { if (interval.first < cSizeOfByte) { @@ -239,6 +290,46 @@ void RegexNFAState::add_interval(Interval interval, RegexNFAState* d } } +template +auto RegexNFAState::serialize( + std::unordered_map const& state_ids +) const -> std::string { + std::vector byte_transitions; + for (uint32_t idx = 0; idx < cSizeOfByte; idx++) { + for (auto const* dest_state : m_bytes_transitions[idx]) { + byte_transitions.push_back( + fmt::format("{}-->{}", static_cast(idx), state_ids.at(dest_state)) + ); + } + } + std::vector epsilon_transitions; + for (auto const* dest_state : m_epsilon_transitions) { + epsilon_transitions.push_back(std::to_string(state_ids.at(dest_state))); + } + std::vector positive_tagged_transitions; + for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { + positive_tagged_transitions.push_back(positive_tagged_transition.serialize(state_ids)); + } + std::vector negative_tagged_transitions; + for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { + negative_tagged_transitions.push_back(negative_tagged_transition.serialize(state_ids)); + } + + auto accepting_tag_string + = m_accepting ? 
fmt::format("accepting_tag={},", m_matching_variable_id) : ""; + + return fmt::format( + "{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_transitions={{" + "{}}},negative_tagged_transitions={{{}}}", + state_ids.at(this), + accepting_tag_string, + fmt::join(byte_transitions, ","), + fmt::join(epsilon_transitions, ","), + fmt::join(positive_tagged_transitions, ","), + fmt::join(negative_tagged_transitions, ",") + ); +} + template RegexNFA::RegexNFA(std::vector> const& m_rules) : m_root{new_state()} { @@ -277,6 +368,69 @@ auto RegexNFA::new_state_with_negative_tagged_transitions( return state; } +template +auto RegexNFA::add_to_queue_and_visited( + RegexNFAByteState const* dest_state, + std::queue& state_queue, + std::unordered_set& visited_states +) -> void { + if (visited_states.insert(dest_state).second) { + state_queue.push(dest_state); + } +} + +template +auto RegexNFA::serialize() const -> std::string { + std::queue state_queue; + std::queue state_queue_copy; + std::unordered_set visited_states; + + // Assign state IDs + std::unordered_map state_ids; + add_to_queue_and_visited(m_root, state_queue, visited_states); + while (false == state_queue.empty()) { + auto const* current_state = state_queue.front(); + state_queue_copy.push(current_state); + state_queue.pop(); + state_ids.insert({current_state, state_ids.size()}); + for (uint32_t idx = 0; idx < cSizeOfByte; idx++) { + for (auto const* dest_state : current_state->get_byte_transitions(idx)) { + add_to_queue_and_visited(dest_state, state_queue, visited_states); + } + } + for (auto const* dest_state : current_state->get_epsilon_transitions()) { + add_to_queue_and_visited(dest_state, state_queue, visited_states); + } + for (auto const& positive_tagged_transition : + current_state->get_positive_tagged_transitions()) + { + add_to_queue_and_visited( + positive_tagged_transition.get_dest_state(), + state_queue, + visited_states + ); + } + for (auto const& negative_tagged_transition : + 
current_state->get_negative_tagged_transitions()) + { + add_to_queue_and_visited( + negative_tagged_transition.get_dest_state(), + state_queue, + visited_states + ); + } + } + + // Serialize NFA + std::vector serialized_states; + while (false == state_queue_copy.empty()) { + auto const* current_state = state_queue_copy.front(); + state_queue_copy.pop(); + serialized_states.emplace_back(current_state->serialize(state_ids)); + } + + return format("{}\n", fmt::join(serialized_states, "\n")); +} } // namespace log_surgeon::finite_automata #endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_HPP diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 48b148b8..44afdaaf 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -1,6 +1,4 @@ #include -#include -#include #include #include #include @@ -18,12 +16,9 @@ using log_surgeon::cSizeOfByte; using log_surgeon::finite_automata::RegexNFAByteState; using log_surgeon::Schema; using log_surgeon::SchemaVarAST; -using std::queue; using std::string; using std::stringstream; using std::to_string; -using std::unordered_map; -using std::unordered_set; using std::vector; using ByteLexicalRule = log_surgeon::LexicalRule; @@ -37,27 +32,7 @@ using RegexASTMultiplicationByte using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr; namespace { -/** - * Add a destination state to the queue and set of visited states if it has not yet been visited. 
- * @param dest_state - * @param visited_states - * @param state_queue - */ -auto add_to_queue_and_visited( - RegexNFAByteState const* dest_state, - queue& state_queue, - unordered_set& visited_states -) -> void; -auto add_to_queue_and_visited( - RegexNFAByteState const* dest_state, - queue& state_queue, - unordered_set& visited_states -) -> void { - if (visited_states.insert(dest_state).second) { - state_queue.push(dest_state); - } -} } // namespace TEST_CASE("Test NFA", "[NFA]") { @@ -76,124 +51,26 @@ TEST_CASE("Test NFA", "[NFA]") { rules.emplace_back(0, move(capture_rule_ast.m_regex_ptr)); ByteNFA nfa(rules); - queue state_queue; - unordered_set visited_states; - - // Assign state IDs - unordered_map state_ids; - auto const* root = nfa.get_root(); - state_queue.push(root); - visited_states.insert(root); - while (false == state_queue.empty()) { - auto const* current_state = state_queue.front(); - state_queue.pop(); - state_ids.insert({current_state, state_ids.size()}); - for (uint32_t idx = 0; idx < cSizeOfByte; idx++) { - for (auto const* dest_state : current_state->get_byte_transitions(idx)) { - add_to_queue_and_visited(dest_state, state_queue, visited_states); - } - } - for (auto const* dest_state : current_state->get_epsilon_transitions()) { - add_to_queue_and_visited(dest_state, state_queue, visited_states); - } - for (auto const& positive_tagged_transition : - current_state->get_positive_tagged_transitions()) - { - add_to_queue_and_visited( - positive_tagged_transition.get_dest_state(), - state_queue, - visited_states - ); - } - for (auto const& negative_tagged_transition : - current_state->get_negative_tagged_transitions()) - { - add_to_queue_and_visited( - negative_tagged_transition.get_dest_state(), - state_queue, - visited_states - ); - } - } - - // Serialize NFA - string serialized_nfa; - state_queue.push(root); - visited_states.clear(); - visited_states.insert(root); - while (false == state_queue.empty()) { - auto const* current_state = 
state_queue.front(); - state_queue.pop(); - serialized_nfa += to_string(state_ids.at(current_state)) + ":"; - if (current_state->is_accepting()) { - serialized_nfa += "accepting_tag=" - + to_string(current_state->get_matching_variable_id()) + ","; - } - serialized_nfa += "byte_transitions={"; - for (uint32_t idx = 0; idx < cSizeOfByte; idx++) { - for (auto const* dest_state : current_state->get_byte_transitions(idx)) { - serialized_nfa += string(1, static_cast(idx)) + "-->" - + to_string(state_ids.find(dest_state)->second) + ","; - add_to_queue_and_visited(dest_state, state_queue, visited_states); - } - } - serialized_nfa += "},epsilon_transitions={"; - for (auto const* dest_state : current_state->get_epsilon_transitions()) { - serialized_nfa += to_string(state_ids.at(dest_state)) + ","; - add_to_queue_and_visited(dest_state, state_queue, visited_states); - } - serialized_nfa += "},positive_tagged_transitions={"; - for (auto const& positive_tagged_transition : - current_state->get_positive_tagged_transitions()) - { - serialized_nfa += to_string(state_ids.at(positive_tagged_transition.get_dest_state())); - serialized_nfa += "[" + to_string(positive_tagged_transition.get_tag()) + "],"; - add_to_queue_and_visited( - positive_tagged_transition.get_dest_state(), - state_queue, - visited_states - ); - } - serialized_nfa += "},negative_tagged_transitions={"; - for (auto const& negative_tagged_transition : - current_state->get_negative_tagged_transitions()) - { - serialized_nfa += to_string(state_ids.at(negative_tagged_transition.get_dest_state())); - serialized_nfa += "["; - for (auto const& tag : negative_tagged_transition.get_tags()) { - serialized_nfa += to_string(tag) + ","; - } - serialized_nfa += "],"; - add_to_queue_and_visited( - negative_tagged_transition.get_dest_state(), - state_queue, - visited_states - ); - } - serialized_nfa += "}"; - serialized_nfa += "\n"; - } - // Compare against expected output - string expected_serialized_nfa = 
"0:byte_transitions={A-->1,Z-->2,}," + string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2}," "epsilon_transitions={}," "positive_tagged_transitions={}," "negative_tagged_transitions={}\n"; - expected_serialized_nfa += "1:byte_transitions={a-->3,b-->3,c-->4,d-->4,}," + expected_serialized_nfa += "1:byte_transitions={a-->3,b-->3,c-->4,d-->4}," "epsilon_transitions={}," "positive_tagged_transitions={}," "negative_tagged_transitions={}\n"; expected_serialized_nfa += "2:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={5[0,1,2,3,],}\n"; + "negative_tagged_transitions={5[0,1,2,3]}\n"; expected_serialized_nfa += "3:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={6[0],}," + "positive_tagged_transitions={6[0]}," "negative_tagged_transitions={}\n"; expected_serialized_nfa += "4:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={7[1],}," + "positive_tagged_transitions={7[1]}," "negative_tagged_transitions={}\n"; expected_serialized_nfa += "5:accepting_tag=0,byte_transitions={}," "epsilon_transitions={}," @@ -202,39 +79,43 @@ TEST_CASE("Test NFA", "[NFA]") { expected_serialized_nfa += "6:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={8[1,],}\n"; + "negative_tagged_transitions={8[1]}\n"; expected_serialized_nfa += "7:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={8[0,],}\n"; + "negative_tagged_transitions={8[0]}\n"; expected_serialized_nfa += "8:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={9[2],}," + "positive_tagged_transitions={9[2]}," "negative_tagged_transitions={}\n"; - expected_serialized_nfa += "9:byte_transitions={B-->10,}," + expected_serialized_nfa += "9:byte_transitions={B-->10}," "epsilon_transitions={}," "positive_tagged_transitions={}," 
"negative_tagged_transitions={}\n"; expected_serialized_nfa += "10:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" - "11,7-->11,8-->11,9-->11,}," + "11,7-->11,8-->11,9-->11}," "epsilon_transitions={}," "positive_tagged_transitions={}," "negative_tagged_transitions={}\n"; expected_serialized_nfa += "11:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" - "11,7-->11,8-->11,9-->11,}," + "11,7-->11,8-->11,9-->11}," "epsilon_transitions={}," - "positive_tagged_transitions={12[3],}," + "positive_tagged_transitions={12[3]}," "negative_tagged_transitions={}\n"; - expected_serialized_nfa += "12:byte_transitions={C-->5,}," + expected_serialized_nfa += "12:byte_transitions={C-->5}," "epsilon_transitions={}," "positive_tagged_transitions={}," "negative_tagged_transitions={}\n"; // Compare expected and actual line-by-line - stringstream ss_actual(serialized_nfa); + auto actual_serialized_nfa = nfa.serialize(); + stringstream ss_actual(actual_serialized_nfa); stringstream ss_expected(expected_serialized_nfa); string actual_line; string expected_line; + + CAPTURE(actual_serialized_nfa); + CAPTURE(expected_serialized_nfa); while (getline(ss_actual, actual_line) && getline(ss_expected, expected_line)) { REQUIRE(actual_line == expected_line); } From 9835eb06107a830c4319e982af4c50c62e2b82cb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 20 Oct 2024 09:56:36 -0400 Subject: [PATCH 067/323] Fix compiler error. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 09c5bcb9..833b3ea6 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -429,7 +429,7 @@ auto RegexNFA::serialize() const -> std::string { serialized_states.emplace_back(current_state->serialize(state_ids)); } - return format("{}\n", fmt::join(serialized_states, "\n")); + return fmt::format("{}\n", fmt::join(serialized_states, "\n")); } } // namespace log_surgeon::finite_automata From dcd79a6fca7bb520989a76841fad5ca1204af481 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 20 Oct 2024 10:01:57 -0400 Subject: [PATCH 068/323] Improve var naming; Improve docstring. --- src/log_surgeon/finite_automata/RegexAST.hpp | 6 +++--- src/log_surgeon/finite_automata/RegexNFA.hpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 676141c5..eecf7462 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -893,9 +893,9 @@ template template void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const { - auto* state_with_a_positive_tagged_transition - = nfa->new_state_with_a_positive_tagged_transition(m_tag, end_state); - m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, state_with_a_positive_tagged_transition); + auto* state_with_positive_tagged_transition + = nfa->new_state_with_positive_tagged_transition(m_tag, end_state); + m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, state_with_positive_tagged_transition); } template diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 833b3ea6..6e1d6e59 100644 --- 
a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -186,7 +186,7 @@ class RegexNFA { * m_states. * @return NFAStateType* */ - auto new_state_with_a_positive_tagged_transition(uint32_t tag, NFAStateType const* dest_state) + auto new_state_with_positive_tagged_transition(uint32_t tag, NFAStateType const* dest_state) -> NFAStateType*; /** @@ -213,8 +213,8 @@ class RegexNFA { private: /** - * Add a destination state to the queue and set of visited states if it has not yet been - * visited. + * Helper method for breadth-first traversal of the NFA. + * Adds a state to the queue and visited set if it hasn't been visited before. * @param dest_state * @param visited_states * @param state_queue @@ -347,7 +347,7 @@ auto RegexNFA::new_state() -> NFAStateType* { } template -auto RegexNFA::new_state_with_a_positive_tagged_transition( +auto RegexNFA::new_state_with_positive_tagged_transition( uint32_t const tag, NFAStateType const* dest_state ) -> NFAStateType* { From 73300e755ca79b38859385630e8b391577baf213 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 20 Oct 2024 10:04:41 -0400 Subject: [PATCH 069/323] Improve docstrings for serialize() methods. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 6e1d6e59..0d5ebe18 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -47,6 +47,8 @@ class PositiveTaggedTransition { /** * Serialize the positive tagged transition into a string. + * @param state_ids A map of states to their unique identifiers. + * @return A string representation of the positive tagged transitions. */ [[nodiscard]] auto serialize( std::unordered_map const& state_ids @@ -72,6 +74,8 @@ class NegativeTaggedTransition { /** * Serialize the negative tagged transitions into a string. 
+ * @param state_ids A map of states to their unique identifiers. + * @return A string representation of the negative tagged transitions. */ [[nodiscard]] auto serialize( std::unordered_map const& state_ids @@ -148,6 +152,8 @@ class RegexNFAState { /** * Serialize the NFA state into a string. + * @param state_ids A map of states to their unique identifiers. + * @return A string representation of the NFA state. */ [[nodiscard]] auto serialize( std::unordered_map const& state_ids @@ -200,6 +206,7 @@ class RegexNFA { /** * Serialize the NFA into a string. + * @return A string representation of the NFA. */ [[nodiscard]] auto serialize() const -> std::string; From 8548bd98f27e16303d3b8f0ef80620a5a74b4c6d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 20 Oct 2024 10:22:23 -0400 Subject: [PATCH 070/323] Add get_traversal_order() to NFA; Fix docstrings. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 38 +++++++++++++------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 0d5ebe18..e1ac40f2 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -190,6 +190,8 @@ class RegexNFA { /** * Create a unique_ptr for an NFA state with a positive tagged transition and add it to * m_states. + * @param tag + * @param dest_state * @return NFAStateType* */ auto new_state_with_positive_tagged_transition(uint32_t tag, NFAStateType const* dest_state) @@ -197,6 +199,8 @@ class RegexNFA { /** * Create a unique_ptr for an NFA state with negative tagged transitions and add it to m_states. + * @param tags + * @param dest_state * @return NFAStateType* */ auto new_state_with_negative_tagged_transitions( @@ -204,6 +208,12 @@ class RegexNFA { NFAStateType const* dest_state ) -> NFAStateType*; + /** + * Traverse the NFA using a BFS and keep track of the order states are visited in. 
+ * @return A vector that performs a BFS of the NFA. + */ + [[nodiscard]] auto get_traversal_order() const -> std::vector; + /** * Serialize the NFA into a string. * @return A string representation of the NFA. @@ -387,19 +397,16 @@ auto RegexNFA::add_to_queue_and_visited( } template -auto RegexNFA::serialize() const -> std::string { +auto RegexNFA::get_traversal_order() const -> std::vector { std::queue state_queue; - std::queue state_queue_copy; std::unordered_set visited_states; + std::vector visited_order; - // Assign state IDs - std::unordered_map state_ids; add_to_queue_and_visited(m_root, state_queue, visited_states); while (false == state_queue.empty()) { auto const* current_state = state_queue.front(); - state_queue_copy.push(current_state); + visited_order.push_back(current_state); state_queue.pop(); - state_ids.insert({current_state, state_ids.size()}); for (uint32_t idx = 0; idx < cSizeOfByte; idx++) { for (auto const* dest_state : current_state->get_byte_transitions(idx)) { add_to_queue_and_visited(dest_state, state_queue, visited_states); @@ -427,15 +434,22 @@ auto RegexNFA::serialize() const -> std::string { ); } } + return visited_order; +} - // Serialize NFA - std::vector serialized_states; - while (false == state_queue_copy.empty()) { - auto const* current_state = state_queue_copy.front(); - state_queue_copy.pop(); - serialized_states.emplace_back(current_state->serialize(state_ids)); +template +auto RegexNFA::serialize() const -> std::string { + auto traversal_order = get_traversal_order(); + + std::unordered_map state_ids; + for (auto const* state : traversal_order) { + state_ids.insert({state, state_ids.size()}); } + std::vector serialized_states; + for (auto const* state : traversal_order) { + serialized_states.emplace_back(state->serialize(state_ids)); + } return fmt::format("{}\n", fmt::join(serialized_states, "\n")); } } // namespace log_surgeon::finite_automata From 38720f7d40d0f1733bfc0fe92e6510080fe2d1ac Mon Sep 17 00:00:00 2001 From: 
SharafMohamed Date: Sun, 20 Oct 2024 10:25:52 -0400 Subject: [PATCH 071/323] Add missing include to test-intersect. --- examples/intersect-test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index 3a053cc3..51a9865b 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -1,4 +1,5 @@ #include +#include #include #include From b700d995c9830e678ffa2e44ca4ce48f01e5b846 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:08:04 -0400 Subject: [PATCH 072/323] Update src/log_surgeon/LexicalRule.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/LexicalRule.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/LexicalRule.hpp b/src/log_surgeon/LexicalRule.hpp index fbd544f6..9aa93286 100644 --- a/src/log_surgeon/LexicalRule.hpp +++ b/src/log_surgeon/LexicalRule.hpp @@ -26,6 +26,7 @@ class LexicalRule { [[nodiscard]] auto get_variable_id() const -> uint32_t { return m_variable_id; } [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { + // TODO: make the returned pointer constant return m_regex.get(); } From 12e930c8701af33415b05362eed91058b5c956d8 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:08:29 -0400 Subject: [PATCH 073/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index e1ac40f2..ca052972 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -36,8 +36,8 @@ template class PositiveTaggedTransition { public: PositiveTaggedTransition(uint32_t const tag, RegexNFAState const* dest_state) - : m_tag(tag), - 
m_dest_state(dest_state) {} + : m_tag{tag}, + m_dest_state{dest_state} {} [[nodiscard]] auto get_tag() const -> uint32_t { return m_tag; } From 0a104ffd607b45454e0dc6322d3871946025f387 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:09:28 -0400 Subject: [PATCH 074/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index ca052972..0cb004fb 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -63,8 +63,8 @@ template class NegativeTaggedTransition { public: NegativeTaggedTransition(std::set tags, RegexNFAState const* dest_state) - : m_tags(std::move(tags)), - m_dest_state(dest_state) {} + : m_tags{std::move(tags)}, + m_dest_state{dest_state} {} [[nodiscard]] auto get_tags() const -> std::set const& { return m_tags; } From 7c126eb979755361751c3d373d1d990985e8f8b1 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:11:08 -0400 Subject: [PATCH 075/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 0cb004fb..278c1ba5 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -93,11 +93,11 @@ class RegexNFAState { RegexNFAState() = default; - explicit RegexNFAState(uint32_t const tag, RegexNFAState const* dest_state) { + RegexNFAState(uint32_t const tag, RegexNFAState const* dest_state) { m_positive_tagged_transitions.emplace_back(tag, 
dest_state); } - explicit RegexNFAState(std::set tags, RegexNFAState const* dest_state) { + RegexNFAState(std::set tags, RegexNFAState const* dest_state) { m_negative_tagged_transitions.emplace_back(std::move(tags), dest_state); } From 7e43f998ee526d5d41d1443a886cd1c714ba7bae Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:11:28 -0400 Subject: [PATCH 076/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 278c1ba5..2d515ad5 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -185,7 +185,7 @@ class RegexNFA { * Create a unique_ptr for an NFA state with no tagged transitions and add it to m_states. * @return NFAStateType* */ - auto new_state() -> NFAStateType*; + [[nodiscard]] auto new_state() -> NFAStateType*; /** * Create a unique_ptr for an NFA state with a positive tagged transition and add it to From 5957bfbe77185ca354f71ad038e202adbec35906 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:12:11 -0400 Subject: [PATCH 077/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 2d515ad5..23f069ea 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -182,7 +182,7 @@ class RegexNFA { explicit RegexNFA(std::vector> const& m_rules); /** - * Create a unique_ptr for an NFA state with no tagged transitions and add it to m_states. 
+ * Creates a unique_ptr for an NFA state with no tagged transitions and adds it to `m_states`. * @return NFAStateType* */ [[nodiscard]] auto new_state() -> NFAStateType*; From 98b524244e3bea03de221f3a149a617f557e8df8 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:12:40 -0400 Subject: [PATCH 078/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 23f069ea..3ca28ff8 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -188,8 +188,8 @@ class RegexNFA { [[nodiscard]] auto new_state() -> NFAStateType*; /** - * Create a unique_ptr for an NFA state with a positive tagged transition and add it to - * m_states. + * Creates a unique_ptr for an NFA state with a positive tagged transition and adds it to + * `m_states`. 
* @param tag * @param dest_state * @return NFAStateType* From 06742ba27ea6346afc382b1a6ba4c178b2e33a79 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:13:02 -0400 Subject: [PATCH 079/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 3ca28ff8..d1df00a7 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -194,8 +194,10 @@ class RegexNFA { * @param dest_state * @return NFAStateType* */ - auto new_state_with_positive_tagged_transition(uint32_t tag, NFAStateType const* dest_state) - -> NFAStateType*; + [[nodiscard]] auto new_state_with_positive_tagged_transition( + uint32_t tag, + NFAStateType const* dest_state + ) -> NFAStateType*; /** * Create a unique_ptr for an NFA state with negative tagged transitions and add it to m_states. 
From 021ac009d85d2a2a3323fa013e7f8c1552ef9c51 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:17:08 -0400 Subject: [PATCH 080/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index d1df00a7..ba531370 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -205,7 +205,7 @@ class RegexNFA { * @param dest_state * @return NFAStateType* */ - auto new_state_with_negative_tagged_transitions( + [[nodiscard]] auto new_state_with_negative_tagged_transitions( std::set tags, NFAStateType const* dest_state ) -> NFAStateType*; From 29e9c43a0839430970b4f127d2f322de4a228431 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:18:23 -0400 Subject: [PATCH 081/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index ba531370..e37599b6 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -200,8 +200,8 @@ class RegexNFA { ) -> NFAStateType*; /** - * Create a unique_ptr for an NFA state with negative tagged transitions and add it to m_states. - * @param tags + * Creates a unique_ptr for an NFA state with negative tagged transitions and adds it to + * `m_states`. 
* @param dest_state * @return NFAStateType* */ From bd6081b03c1aab8d4448c969bb014ca33e5fe14b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 23 Oct 2024 15:20:36 -0400 Subject: [PATCH 082/323] Update docstring for get_travel_order(). --- src/log_surgeon/finite_automata/RegexNFA.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index e37599b6..1f6afc22 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -212,7 +212,8 @@ class RegexNFA { /** * Traverse the NFA using a BFS and keep track of the order states are visited in. - * @return A vector that performs a BFS of the NFA. + * @return A vector representing the traversal order of the NFA states using breadth-first + * search. */ [[nodiscard]] auto get_traversal_order() const -> std::vector; From 16edf6f61130dfd4723a68f78085acc674d43d8f Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:21:45 -0400 Subject: [PATCH 083/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 1f6afc22..c23a0f1f 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -218,7 +218,6 @@ class RegexNFA { [[nodiscard]] auto get_traversal_order() const -> std::vector; /** - * Serialize the NFA into a string. * @return A string representation of the NFA. 
*/ [[nodiscard]] auto serialize() const -> std::string; From b4b0b630ce71625d0df9a898eb4e7e41a65de116 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:22:05 -0400 Subject: [PATCH 084/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index c23a0f1f..18c89acb 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -73,7 +73,6 @@ class NegativeTaggedTransition { } /** - * Serialize the negative tagged transitions into a string. * @param state_ids A map of states to their unique identifiers. * @return A string representation of the negative tagged transitions. */ From d1086975af760738940b26680cd833d313f85406 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:22:11 -0400 Subject: [PATCH 085/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 18c89acb..80de9b16 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -46,7 +46,6 @@ class PositiveTaggedTransition { } /** - * Serialize the positive tagged transition into a string. * @param state_ids A map of states to their unique identifiers. * @return A string representation of the positive tagged transitions. 
*/ From eef79d2f29fb6aaf508df43b7cdd7fd6031bfab5 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:23:46 -0400 Subject: [PATCH 086/323] Update tests/test-NFA.cpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- tests/test-NFA.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 44afdaaf..de09b0ce 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -108,9 +108,9 @@ TEST_CASE("Test NFA", "[NFA]") { "negative_tagged_transitions={}\n"; // Compare expected and actual line-by-line - auto actual_serialized_nfa = nfa.serialize(); - stringstream ss_actual(actual_serialized_nfa); - stringstream ss_expected(expected_serialized_nfa); + auto const actual_serialized_nfa = nfa.serialize(); + stringstream ss_actual{actual_serialized_nfa}; + stringstream ss_expected{expected_serialized_nfa}; string actual_line; string expected_line; From 0d599cbf287c74d7602cdd8fd7bbd2bbccde76bb Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:23:55 -0400 Subject: [PATCH 087/323] Update tests/test-NFA.cpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- tests/test-NFA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index de09b0ce..1df292c4 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -49,7 +49,7 @@ TEST_CASE("Test NFA", "[NFA]") { auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); vector rules; rules.emplace_back(0, move(capture_rule_ast.m_regex_ptr)); - ByteNFA nfa(rules); + ByteNFA const nfa(rules); // Compare against expected output string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2}," From 28071410025133bb679ae3d7057883f97476f373 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:25:01 -0400 Subject: [PATCH 088/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: 
Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 80de9b16..2a0927a1 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -250,7 +250,7 @@ template auto PositiveTaggedTransition::serialize( std::unordered_map const& state_ids ) const -> std::string { - return fmt::format("{}[{}]", state_ids.at(get_dest_state()), get_tag()); + return fmt::format("{}[{}]", state_ids.at(m_dest_state), m_tag); } template From 8e225cd5594864dbb6c1670987d7b4918189dd50 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:25:19 -0400 Subject: [PATCH 089/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 2a0927a1..7d8efc1b 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -257,7 +257,7 @@ template auto NegativeTaggedTransition::serialize( std::unordered_map const& state_ids ) const -> std::string { - return fmt::format("{}[{}]", state_ids.at(get_dest_state()), fmt::join(get_tags(), ",")); + return fmt::format("{}[{}]", state_ids.at(m_dest_state), fmt::join(m_tags, ",")); } template From ecb84fba79aea4fb5c3ae561237e432b739e4d62 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:25:38 -0400 Subject: [PATCH 090/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 7d8efc1b..ff74b9e3 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -312,7 +312,7 @@ auto RegexNFAState::serialize( std::unordered_map const& state_ids ) const -> std::string { std::vector byte_transitions; - for (uint32_t idx = 0; idx < cSizeOfByte; idx++) { + for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) { for (auto const* dest_state : m_bytes_transitions[idx]) { byte_transitions.push_back( fmt::format("{}-->{}", static_cast(idx), state_ids.at(dest_state)) From 6fc6030ede177c65115c70dd0aa7cdf3948c874a Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:26:10 -0400 Subject: [PATCH 091/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index ff74b9e3..3ccd8bde 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -314,7 +314,7 @@ auto RegexNFAState::serialize( std::vector byte_transitions; for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) { for (auto const* dest_state : m_bytes_transitions[idx]) { - byte_transitions.push_back( + byte_transitions.emplace_back( fmt::format("{}-->{}", static_cast(idx), state_ids.at(dest_state)) ); } From f83ac5f00fdef02c442e9fb3c3a07cde2d8ab0a2 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 23 Oct 2024 15:32:07 -0400 Subject: [PATCH 092/323] Rename get_traversal_order() to get_bfs_tranversal_order() and upate docstring. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 3ccd8bde..e4b52474 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -209,11 +209,11 @@ class RegexNFA { ) -> NFAStateType*; /** - * Traverse the NFA using a BFS and keep track of the order states are visited in. - * @return A vector representing the traversal order of the NFA states using breadth-first - * search. + * Traverse the NFA using a breadth-first search (BFS) and keep track of the order states are + * visited in. + * @return A vector representing the traversal order of the NFA states using BFS. */ - [[nodiscard]] auto get_traversal_order() const -> std::vector; + [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; /** * @return A string representation of the NFA. @@ -397,7 +397,8 @@ auto RegexNFA::add_to_queue_and_visited( } template -auto RegexNFA::get_traversal_order() const -> std::vector { +auto RegexNFA::get_bfs_traversal_order( +) const -> std::vector { std::queue state_queue; std::unordered_set visited_states; std::vector visited_order; @@ -439,7 +440,7 @@ auto RegexNFA::get_traversal_order() const -> std::vector auto RegexNFA::serialize() const -> std::string { - auto traversal_order = get_traversal_order(); + auto traversal_order = get_bfs_traversal_order(); std::unordered_map state_ids; for (auto const* state : traversal_order) { From e3214f1cae78c411820b49ffd6fa91325bd20195 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:33:38 -0400 Subject: [PATCH 093/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 1 + 1 file changed, 1 insertion(+) diff --git 
a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index e4b52474..d291ee55 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -319,6 +319,7 @@ auto RegexNFAState::serialize( ); } } + std::vector epsilon_transitions; for (auto const* dest_state : m_epsilon_transitions) { epsilon_transitions.push_back(std::to_string(state_ids.at(dest_state))); From d7d6dbea5fb5ea28340fe64fb081efa8cb3f6490 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:34:00 -0400 Subject: [PATCH 094/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index d291ee55..de6b62c9 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -322,7 +322,7 @@ auto RegexNFAState::serialize( std::vector epsilon_transitions; for (auto const* dest_state : m_epsilon_transitions) { - epsilon_transitions.push_back(std::to_string(state_ids.at(dest_state))); + epsilon_transitions.emplace_back(std::to_string(state_ids.at(dest_state))); } std::vector positive_tagged_transitions; for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { From fc553545aefe199925a11e0930ee1d98ddc42870 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:34:29 -0400 Subject: [PATCH 095/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index de6b62c9..1dbf2dda 100644 --- 
a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -324,6 +324,7 @@ auto RegexNFAState::serialize( for (auto const* dest_state : m_epsilon_transitions) { epsilon_transitions.emplace_back(std::to_string(state_ids.at(dest_state))); } + std::vector positive_tagged_transitions; for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { positive_tagged_transitions.push_back(positive_tagged_transition.serialize(state_ids)); From fbc25c85fd044b4bd324fd673e5ed4c0c987864b Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:34:41 -0400 Subject: [PATCH 096/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 1dbf2dda..52d777c4 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -327,7 +327,7 @@ auto RegexNFAState::serialize( std::vector positive_tagged_transitions; for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { - positive_tagged_transitions.push_back(positive_tagged_transition.serialize(state_ids)); + positive_tagged_transitions.emplace_back(positive_tagged_transition.serialize(state_ids)); } std::vector negative_tagged_transitions; for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { From df070c386fd5f23dffbd0c95527c13f5e6bc447e Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:34:52 -0400 Subject: [PATCH 097/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 1 + 1 file changed, 1 insertion(+) diff --git 
a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 52d777c4..962c874a 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -329,6 +329,7 @@ auto RegexNFAState::serialize( for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { positive_tagged_transitions.emplace_back(positive_tagged_transition.serialize(state_ids)); } + std::vector negative_tagged_transitions; for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { negative_tagged_transitions.push_back(negative_tagged_transition.serialize(state_ids)); From 84cd573038bbc7990a0fe46dfa47f33b6eb672cc Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:35:03 -0400 Subject: [PATCH 098/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 962c874a..bc431b33 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -332,7 +332,7 @@ auto RegexNFAState::serialize( std::vector negative_tagged_transitions; for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { - negative_tagged_transitions.push_back(negative_tagged_transition.serialize(state_ids)); + negative_tagged_transitions.emplace_back(negative_tagged_transition.serialize(state_ids)); } auto accepting_tag_string From a35f61f370ba718a3c05e9f826d9cd1a4de48ac2 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:35:23 -0400 Subject: [PATCH 099/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- 
src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index bc431b33..1eefdef8 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -335,7 +335,7 @@ auto RegexNFAState::serialize( negative_tagged_transitions.emplace_back(negative_tagged_transition.serialize(state_ids)); } - auto accepting_tag_string + auto const accepting_tag_string = m_accepting ? fmt::format("accepting_tag={},", m_matching_variable_id) : ""; return fmt::format( From 53ba56a953087e5bfb2e7a0973223098863f2d22 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 23 Oct 2024 15:38:22 -0400 Subject: [PATCH 100/323] Remove unused using. --- tests/test-NFA.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 1df292c4..943d31b1 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -18,7 +18,6 @@ using log_surgeon::Schema; using log_surgeon::SchemaVarAST; using std::string; using std::stringstream; -using std::to_string; using std::vector; using ByteLexicalRule = log_surgeon::LexicalRule; From 8a677e31f1ebc9184004910af5f334e4ea911484 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 23 Oct 2024 15:39:21 -0400 Subject: [PATCH 101/323] Remove empty namespace. 
--- tests/test-NFA.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 943d31b1..1e3e9052 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -30,10 +30,6 @@ using RegexASTMultiplicationByte = log_surgeon::finite_automata::RegexASTMultiplication; using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr; -namespace { - -} // namespace - TEST_CASE("Test NFA", "[NFA]") { Schema schema; string const var_name{"capture"}; From f17f7527e1b500fe8bed7aa00a3fb28572515718 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:40:42 -0400 Subject: [PATCH 102/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 1eefdef8..8c29dd46 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -371,10 +371,8 @@ auto RegexNFA::new_state_with_positive_tagged_transition( uint32_t const tag, NFAStateType const* dest_state ) -> NFAStateType* { - std::unique_ptr ptr = std::make_unique(tag, dest_state); - NFAStateType* state = ptr.get(); - m_states.push_back(std::move(ptr)); - return state; + m_states.emplace_back(std::make_unique(tag, dest_state)); + return m_states.back().get(); } template From cbe1d39995d820f6381b49a9ccbfdeeea013c202 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 23 Oct 2024 15:41:45 -0400 Subject: [PATCH 103/323] Make traversal_order const. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 1eefdef8..823d0052 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -443,7 +443,7 @@ auto RegexNFA::get_bfs_traversal_order( template auto RegexNFA::serialize() const -> std::string { - auto traversal_order = get_bfs_traversal_order(); + auto const traversal_order = get_bfs_traversal_order(); std::unordered_map state_ids; for (auto const* state : traversal_order) { From a9d0ef37bd744daca5d54fa9e25d655f6f6535da Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:44:37 -0400 Subject: [PATCH 104/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 1499ade6..b83bbb36 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -380,10 +380,8 @@ auto RegexNFA::new_state_with_negative_tagged_transitions( std::set tags, NFAStateType const* dest_state ) -> NFAStateType* { - std::unique_ptr ptr = std::make_unique(tags, dest_state); - NFAStateType* state = ptr.get(); - m_states.push_back(std::move(ptr)); - return state; + m_states.emplace_back(std::make_unique(tags, dest_state)); + return m_states.back().get(); } template From 43ec3f0e39d4aea799c9b35832b81a23c9f9e5fc Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:45:34 -0400 Subject: [PATCH 105/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- 
src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index b83bbb36..a4ade85a 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -407,7 +407,7 @@ auto RegexNFA::get_bfs_traversal_order( auto const* current_state = state_queue.front(); visited_order.push_back(current_state); state_queue.pop(); - for (uint32_t idx = 0; idx < cSizeOfByte; idx++) { + for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) { for (auto const* dest_state : current_state->get_byte_transitions(idx)) { add_to_queue_and_visited(dest_state, state_queue, visited_states); } From 45372dfa85cfaa9631bc2ac7e467023c6a54a395 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 23 Oct 2024 15:46:34 -0400 Subject: [PATCH 106/323] Add missing using for std::move. --- tests/test-NFA.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 1e3e9052..1c8b06bc 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -16,6 +16,7 @@ using log_surgeon::cSizeOfByte; using log_surgeon::finite_automata::RegexNFAByteState; using log_surgeon::Schema; using log_surgeon::SchemaVarAST; +using std::move; using std::string; using std::stringstream; using std::vector; From f69aa86a7cf643c34ae104500d34130a06ea20fb Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 23 Oct 2024 15:48:18 -0400 Subject: [PATCH 107/323] Update src/log_surgeon/finite_automata/RegexNFA.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index a4ade85a..0257c42c 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ 
b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -443,7 +443,7 @@ auto RegexNFA::serialize() const -> std::string { std::unordered_map state_ids; for (auto const* state : traversal_order) { - state_ids.insert({state, state_ids.size()}); + state_ids.emplace(state, state_ids.size()); } std::vector serialized_states; From 723eabb319cd103ce727ae20c89864b09d8491e0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 23 Oct 2024 15:55:46 -0400 Subject: [PATCH 108/323] Use move semantic for NFA constructor. --- examples/intersect-test.cpp | 4 ++-- src/log_surgeon/Lexer.tpp | 2 +- src/log_surgeon/finite_automata/RegexNFA.hpp | 6 +++--- tests/test-NFA.cpp | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index 51a9865b..3d870787 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -40,7 +40,7 @@ auto get_intersect_for_query( auto* schema_var_ast = dynamic_cast(parser_ast.get()); rules.emplace_back(0, std::move(schema_var_ast->m_regex_ptr)); } - RegexNFA nfa(rules); + RegexNFA nfa(move(rules)); auto dfa2 = ByteLexer::nfa_to_dfa(nfa); auto schema_types = dfa1->get_intersect(dfa2); std::cout << search_string << ":"; @@ -78,7 +78,7 @@ auto main() -> int { rules.emplace_back(m_id_symbol.size(), std::move(var_ast->m_regex_ptr)); m_id_symbol[m_id_symbol.size()] = var_ast->m_name; } - RegexNFA nfa(rules); + RegexNFA nfa(move(rules)); auto dfa = ByteLexer::nfa_to_dfa(nfa); get_intersect_for_query(m_id_symbol, dfa, "*1*"); get_intersect_for_query(m_id_symbol, dfa, "*a*"); diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 9e51dfdf..ee700662 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -374,7 +374,7 @@ auto Lexer::get_rule(uint32_t const variable_id template void Lexer::generate() { - finite_automata::RegexNFA nfa(m_rules); + finite_automata::RegexNFA nfa(std::move(m_rules)); // TODO: DFA ignores tags. 
E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" m_dfa = nfa_to_dfa(nfa); DFAStateType const* state = m_dfa->get_root(); diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index a4ade85a..7422c6c8 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -177,7 +177,7 @@ class RegexNFA { public: using StateVec = std::vector; - explicit RegexNFA(std::vector> const& m_rules); + explicit RegexNFA(std::vector> rules); /** * Creates a unique_ptr for an NFA state with no tagged transitions and adds it to `m_states`. @@ -351,9 +351,9 @@ auto RegexNFAState::serialize( } template -RegexNFA::RegexNFA(std::vector> const& m_rules) +RegexNFA::RegexNFA(std::vector> rules) : m_root{new_state()} { - for (auto const& rule : m_rules) { + for (auto const& rule : rules) { rule.add_to_nfa(this); } } diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 1c8b06bc..e5078524 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -45,7 +45,7 @@ TEST_CASE("Test NFA", "[NFA]") { auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); vector rules; rules.emplace_back(0, move(capture_rule_ast.m_regex_ptr)); - ByteNFA const nfa(rules); + ByteNFA const nfa(move(rules)); // Compare against expected output string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2}," From d20e391b275bfc0bb418c937e18b0232e489c019 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 23 Oct 2024 16:03:01 -0400 Subject: [PATCH 109/323] Move add_to_queue_and_visited() to lambda. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 49 +++++--------------- 1 file changed, 12 insertions(+), 37 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 53f7cc57..7c5ab05d 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -229,19 +229,6 @@ class RegexNFA { auto get_root() -> NFAStateType* { return m_root; } private: - /** - * Helper method for breadth-first traversal of the NFA. - * Adds a state to the queue and visited set if it hasn't been visited before. - * @param dest_state - * @param visited_states - * @param state_queue - */ - static auto add_to_queue_and_visited( - RegexNFAByteState const* dest_state, - std::queue& state_queue, - std::unordered_set& visited_states - ) -> void; - std::vector> m_states; NFAStateType* m_root; }; @@ -384,17 +371,6 @@ auto RegexNFA::new_state_with_negative_tagged_transitions( return m_states.back().get(); } -template -auto RegexNFA::add_to_queue_and_visited( - RegexNFAByteState const* dest_state, - std::queue& state_queue, - std::unordered_set& visited_states -) -> void { - if (visited_states.insert(dest_state).second) { - state_queue.push(dest_state); - } -} - template auto RegexNFA::get_bfs_traversal_order( ) const -> std::vector { @@ -402,36 +378,35 @@ auto RegexNFA::get_bfs_traversal_order( std::unordered_set visited_states; std::vector visited_order; - add_to_queue_and_visited(m_root, state_queue, visited_states); + auto add_to_queue_and_visited + = [&state_queue, &visited_states](RegexNFAByteState const* dest_state) { + if (visited_states.insert(dest_state).second) { + state_queue.push(dest_state); + } + }; + + add_to_queue_and_visited(m_root); while (false == state_queue.empty()) { auto const* current_state = state_queue.front(); visited_order.push_back(current_state); state_queue.pop(); for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) { for (auto const* dest_state : 
current_state->get_byte_transitions(idx)) { - add_to_queue_and_visited(dest_state, state_queue, visited_states); + add_to_queue_and_visited(dest_state); } } for (auto const* dest_state : current_state->get_epsilon_transitions()) { - add_to_queue_and_visited(dest_state, state_queue, visited_states); + add_to_queue_and_visited(dest_state); } for (auto const& positive_tagged_transition : current_state->get_positive_tagged_transitions()) { - add_to_queue_and_visited( - positive_tagged_transition.get_dest_state(), - state_queue, - visited_states - ); + add_to_queue_and_visited(positive_tagged_transition.get_dest_state()); } for (auto const& negative_tagged_transition : current_state->get_negative_tagged_transitions()) { - add_to_queue_and_visited( - negative_tagged_transition.get_dest_state(), - state_queue, - visited_states - ); + add_to_queue_and_visited(negative_tagged_transition.get_dest_state()); } } return visited_order; From 6a312e971d9e9948232290b27ac0ec18effd402b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 23 Oct 2024 17:11:53 -0400 Subject: [PATCH 110/323] Fix compiler error in intersect-test. --- examples/intersect-test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index 3d870787..d8632588 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -13,6 +13,7 @@ using log_surgeon::lexers::ByteLexer; using log_surgeon::LexicalRule; using log_surgeon::ParserAST; using log_surgeon::SchemaVarAST; +using std::move; using std::string; using std::unique_ptr; using std::vector; From f8e5f8ff9e66b102abc3a4d6fc773df402fb8389 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 23 Oct 2024 21:33:41 -0400 Subject: [PATCH 111/323] Simplify new_state(). 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 7c5ab05d..c3114286 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -347,10 +347,8 @@ RegexNFA::RegexNFA(std::vector> rules) template auto RegexNFA::new_state() -> NFAStateType* { - std::unique_ptr ptr = std::make_unique(); - NFAStateType* state = ptr.get(); - m_states.push_back(std::move(ptr)); - return state; + m_states.emplace_back(std::make_unique()); + return m_states.back().get(); } template From fc25f00cfa56fc6e0cc9e131f235d0c777e5b9f8 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 23 Oct 2024 21:34:40 -0400 Subject: [PATCH 112/323] Remove using for std::move, and explicitly add namespace. --- examples/intersect-test.cpp | 5 ++--- tests/test-NFA.cpp | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index d8632588..a5d0e433 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -13,7 +13,6 @@ using log_surgeon::lexers::ByteLexer; using log_surgeon::LexicalRule; using log_surgeon::ParserAST; using log_surgeon::SchemaVarAST; -using std::move; using std::string; using std::unique_ptr; using std::vector; @@ -41,7 +40,7 @@ auto get_intersect_for_query( auto* schema_var_ast = dynamic_cast(parser_ast.get()); rules.emplace_back(0, std::move(schema_var_ast->m_regex_ptr)); } - RegexNFA nfa(move(rules)); + RegexNFA nfa(std::move(rules)); auto dfa2 = ByteLexer::nfa_to_dfa(nfa); auto schema_types = dfa1->get_intersect(dfa2); std::cout << search_string << ":"; @@ -79,7 +78,7 @@ auto main() -> int { rules.emplace_back(m_id_symbol.size(), std::move(var_ast->m_regex_ptr)); m_id_symbol[m_id_symbol.size()] = var_ast->m_name; } - RegexNFA nfa(move(rules)); + RegexNFA nfa(std::move(rules)); 
auto dfa = ByteLexer::nfa_to_dfa(nfa); get_intersect_for_query(m_id_symbol, dfa, "*1*"); get_intersect_for_query(m_id_symbol, dfa, "*a*"); diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index e5078524..f243509b 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -16,7 +16,6 @@ using log_surgeon::cSizeOfByte; using log_surgeon::finite_automata::RegexNFAByteState; using log_surgeon::Schema; using log_surgeon::SchemaVarAST; -using std::move; using std::string; using std::stringstream; using std::vector; @@ -44,8 +43,8 @@ TEST_CASE("Test NFA", "[NFA]") { auto const schema_ast = schema.release_schema_ast_ptr(); auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); vector rules; - rules.emplace_back(0, move(capture_rule_ast.m_regex_ptr)); - ByteNFA const nfa(move(rules)); + rules.emplace_back(0, std::move(capture_rule_ast.m_regex_ptr)); + ByteNFA const nfa(std::move(rules)); // Compare against expected output string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2}," From cdab6505a19730d3a220ab17ceafca34f59482d0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 24 Oct 2024 04:38:18 -0400 Subject: [PATCH 113/323] Update serialize docstring. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index c3114286..bb1ca05a 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -149,7 +149,6 @@ class RegexNFAState { auto add_interval(Interval interval, RegexNFAState* dest_state) -> void; /** - * Serialize the NFA state into a string. * @param state_ids A map of states to their unique identifiers. * @return A string representation of the NFA state. 
*/ From e8db2777ae00a1a4b2fcae104194b1dfc70a8c0c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 24 Oct 2024 11:48:17 -0400 Subject: [PATCH 114/323] Have internal serialize() functions for RegexNFA (states and tagged transitions) return nullopt if state_ids is malformed. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 57 ++++++++++++++------ 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index bb1ca05a..d44d7fff 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -47,11 +48,13 @@ class PositiveTaggedTransition { /** * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the positive tagged transitions. + * @return A string representation of the positive tagged transitions if `m_dest_state` is in + * `state_ids`; + * nullopt otherwise. */ [[nodiscard]] auto serialize( std::unordered_map const& state_ids - ) const -> std::string; + ) const -> std::optional; private: uint32_t m_tag{}; @@ -73,11 +76,13 @@ class NegativeTaggedTransition { /** * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the negative tagged transitions. + * @return A string representation of the negative tagged transitions if `m_dest_state` is in + * `state_ids`; + * nullopt otherwise. */ [[nodiscard]] auto serialize( std::unordered_map const& state_ids - ) const -> std::string; + ) const -> std::optional; private: std::set m_tags; @@ -150,11 +155,13 @@ class RegexNFAState { /** * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the NFA state. 
+ * @return A string representation of the NFA state if `m_positive_tagged_transitions` and + * `m_negative_tagged_transitions` can be serialized with `state_ids`; + * nullopt otherwise. */ [[nodiscard]] auto serialize( std::unordered_map const& state_ids - ) const -> std::string; + ) const -> std::optional; private: bool m_accepting{false}; @@ -215,7 +222,7 @@ class RegexNFA { [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; /** - * @return A string representation of the NFA. + * @return A string representation of the NFA. This function should always succeeed. */ [[nodiscard]] auto serialize() const -> std::string; @@ -235,15 +242,23 @@ class RegexNFA { template auto PositiveTaggedTransition::serialize( std::unordered_map const& state_ids -) const -> std::string { - return fmt::format("{}[{}]", state_ids.at(m_dest_state), m_tag); +) const -> std::optional { + auto state_id_it = state_ids.find(m_dest_state); + if (state_id_it == state_ids.end()) { + return std::nullopt; + } + return fmt::format("{}[{}]", state_id_it->second, m_tag); } template auto NegativeTaggedTransition::serialize( std::unordered_map const& state_ids -) const -> std::string { - return fmt::format("{}[{}]", state_ids.at(m_dest_state), fmt::join(m_tags, ",")); +) const -> std::optional { + auto state_id_it = state_ids.find(m_dest_state); + if (state_id_it == state_ids.end()) { + return std::nullopt; + } + return fmt::format("{}[{}]", state_id_it->second, fmt::join(m_tags, ",")); } template @@ -296,7 +311,7 @@ void RegexNFAState::add_interval(Interval interval, RegexNFAState* d template auto RegexNFAState::serialize( std::unordered_map const& state_ids -) const -> std::string { +) const -> std::optional { std::vector byte_transitions; for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) { for (auto const* dest_state : m_bytes_transitions[idx]) { @@ -313,12 +328,22 @@ auto RegexNFAState::serialize( std::vector positive_tagged_transitions; for (auto const& positive_tagged_transition : 
m_positive_tagged_transitions) { - positive_tagged_transitions.emplace_back(positive_tagged_transition.serialize(state_ids)); + auto const serialized_positive_transition_it = positive_tagged_transition.serialize(state_ids); + if (serialized_positive_transition_it.has_value()) { + positive_tagged_transitions.emplace_back(serialized_positive_transition_it.value()); + } else { + return std::nullopt; + } } std::vector negative_tagged_transitions; for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { - negative_tagged_transitions.emplace_back(negative_tagged_transition.serialize(state_ids)); + auto const serialized_negative_transition_it = negative_tagged_transition.serialize(state_ids); + if (serialized_negative_transition_it.has_value()) { + negative_tagged_transitions.emplace_back(serialized_negative_transition_it.value()); + } else { + return std::nullopt; + } } auto const accepting_tag_string @@ -420,7 +445,9 @@ auto RegexNFA::serialize() const -> std::string { std::vector serialized_states; for (auto const* state : traversal_order) { - serialized_states.emplace_back(state->serialize(state_ids)); + // `state_ids` is well-formed as its generated from `get_bfs_traversal_order` so we can + // safely assume `state->serialize(state_ids)` will return a valid value. + serialized_states.emplace_back(state->serialize(state_ids).value()); } return fmt::format("{}\n", fmt::join(serialized_states, "\n")); } From 337ceadda489a3ca8083ad44a1a86d2b7d77e114 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 24 Oct 2024 11:51:39 -0400 Subject: [PATCH 115/323] Reserve space during BFS; Run linter. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index d44d7fff..2bbbb58a 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -328,7 +328,8 @@ auto RegexNFAState::serialize( std::vector positive_tagged_transitions; for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { - auto const serialized_positive_transition_it = positive_tagged_transition.serialize(state_ids); + auto const serialized_positive_transition_it + = positive_tagged_transition.serialize(state_ids); if (serialized_positive_transition_it.has_value()) { positive_tagged_transitions.emplace_back(serialized_positive_transition_it.value()); } else { @@ -338,7 +339,8 @@ auto RegexNFAState::serialize( std::vector negative_tagged_transitions; for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { - auto const serialized_negative_transition_it = negative_tagged_transition.serialize(state_ids); + auto const serialized_negative_transition_it + = negative_tagged_transition.serialize(state_ids); if (serialized_negative_transition_it.has_value()) { negative_tagged_transitions.emplace_back(serialized_negative_transition_it.value()); } else { @@ -399,6 +401,8 @@ auto RegexNFA::get_bfs_traversal_order( std::queue state_queue; std::unordered_set visited_states; std::vector visited_order; + visited_states.reserve(m_states.size()); + visited_order.reserve(m_states.size()); auto add_to_queue_and_visited = [&state_queue, &visited_states](RegexNFAByteState const* dest_state) { From 4a30fdcc16db664451041c1f2cda83b3543ca70f Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Sun, 27 Oct 2024 19:23:52 -0400 Subject: [PATCH 116/323] Add braced initialization to nfa. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/Lexer.tpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index ee700662..5cdade49 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -374,7 +374,7 @@ auto Lexer::get_rule(uint32_t const variable_id template void Lexer::generate() { - finite_automata::RegexNFA nfa(std::move(m_rules)); + finite_automata::RegexNFA nfa{std::move(m_rules)}; // TODO: DFA ignores tags. E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" m_dfa = nfa_to_dfa(nfa); DFAStateType const* state = m_dfa->get_root(); From 0203038125b4e8fa20c1f57156d169f4854c3aa4 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Sun, 27 Oct 2024 19:26:46 -0400 Subject: [PATCH 117/323] Update docstring for positive tag serialization. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 2bbbb58a..d5b44cce 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -48,9 +48,8 @@ class PositiveTaggedTransition { /** * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the positive tagged transitions if `m_dest_state` is in - * `state_ids`; - * nullopt otherwise. + * @return A string representation of the positive tagged transitions on success. + * @return std::nullopt if `m_dest_state` is not in `state_ids`. */ [[nodiscard]] auto serialize( std::unordered_map const& state_ids From 633acc43d7351aa020938701f99157e5803b7c9d Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Sun, 27 Oct 2024 19:27:06 -0400 Subject: [PATCH 118/323] Update docstring for negative tag serialization. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index d5b44cce..8456ffc5 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -75,9 +75,8 @@ class NegativeTaggedTransition { /** * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the negative tagged transitions if `m_dest_state` is in - * `state_ids`; - * nullopt otherwise. + * @return A string representation of the negative tagged transitions on success. + * @return std::nullopt if `m_dest_state` is not in `state_ids`. */ [[nodiscard]] auto serialize( std::unordered_map const& state_ids From 4db7b82198ba5ce7399841312e66fdc7cbbbd428 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Sun, 27 Oct 2024 19:38:16 -0400 Subject: [PATCH 119/323] Use return statement for full docstring of get_bfs_traversal_order. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 8456ffc5..17271c9f 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -213,8 +213,6 @@ class RegexNFA { ) -> NFAStateType*; /** - * Traverse the NFA using a breadth-first search (BFS) and keep track of the order states are - * visited in. * @return A vector representing the traversal order of the NFA states using BFS. 
*/ [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; From 01f8b14c48f01ae89a3ffd98f62e36fbd1a5ec51 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Sun, 27 Oct 2024 19:40:39 -0400 Subject: [PATCH 120/323] Update NFA serialize() docstring. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 17271c9f..cf32e119 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -218,7 +218,7 @@ class RegexNFA { [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; /** - * @return A string representation of the NFA. This function should always succeeed. + * @return A string representation of the NFA. */ [[nodiscard]] auto serialize() const -> std::string; From d0476246228f9d9fc38533d0a2119d3126125916 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 27 Oct 2024 19:41:38 -0400 Subject: [PATCH 121/323] Add long form of BFS for first use. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index cf32e119..e746e83d 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -213,7 +213,8 @@ class RegexNFA { ) -> NFAStateType*; /** - * @return A vector representing the traversal order of the NFA states using BFS. + * @return A vector representing the traversal order of the NFA states using breadth-first + * search (BFS). 
*/ [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; From f9c4f46263a03571523c02e4b7e8c0390a2bbfae Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Sun, 27 Oct 2024 19:42:33 -0400 Subject: [PATCH 122/323] Use const for state_id_it. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index e746e83d..6f0c8a96 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -240,7 +240,7 @@ template auto PositiveTaggedTransition::serialize( std::unordered_map const& state_ids ) const -> std::optional { - auto state_id_it = state_ids.find(m_dest_state); + auto const state_id_it = state_ids.find(m_dest_state); if (state_id_it == state_ids.end()) { return std::nullopt; } From bd77c787662e9f9b905a05a3208e30ff208667af Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Sun, 27 Oct 2024 19:43:56 -0400 Subject: [PATCH 123/323] Update docstring for NFA state serialize. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 6f0c8a96..e397aa38 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -153,9 +153,9 @@ class RegexNFAState { /** * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the NFA state if `m_positive_tagged_transitions` and - * `m_negative_tagged_transitions` can be serialized with `state_ids`; - * nullopt otherwise. + * @return A string representation of the NFA state on success. 
+ * @return Forwards `PositiveTaggedTransition::serialize`'s return values on failure. + * @return Forwards `NegativeTaggedTransition::serialize`'s return values on failure. */ [[nodiscard]] auto serialize( std::unordered_map const& state_ids From f2d8049c466962a87d129920292b197ec3b794f7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 27 Oct 2024 19:53:13 -0400 Subject: [PATCH 124/323] Combine the two failure cases in NFA state serialize's docstring to make it clear to the reader that both failures are handled the same way and return nullopt. For more complicated return cases it would warrant the reader looking at the doc for the individual functions, but here I think we can make their life easier. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index e397aa38..23fcc6ed 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -154,8 +154,8 @@ class RegexNFAState { /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the NFA state on success. - * @return Forwards `PositiveTaggedTransition::serialize`'s return values on failure. - * @return Forwards `NegativeTaggedTransition::serialize`'s return values on failure. + * @return Forwards `PositiveTaggedTransition::serialize`'s or + * `NegativeTaggedTransition::serialize`'s return value (std::nullopt) on failure. */ [[nodiscard]] auto serialize( std::unordered_map const& state_ids From 4cb560f1d5e438e8d49b198f5183c48074a291a3 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Sun, 27 Oct 2024 19:54:12 -0400 Subject: [PATCH 125/323] Use const for state_id_it. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 23fcc6ed..7d045e4c 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -251,7 +251,7 @@ template auto NegativeTaggedTransition::serialize( std::unordered_map const& state_ids ) const -> std::optional { - auto state_id_it = state_ids.find(m_dest_state); + auto const state_id_it = state_ids.find(m_dest_state); if (state_id_it == state_ids.end()) { return std::nullopt; } From 95b74975600235f97b538bbc1bc700a6c1acb949 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 27 Oct 2024 19:57:08 -0400 Subject: [PATCH 126/323] For NFA state serialize flip order of failure checks to reduce indentation. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 23fcc6ed..2153a8c7 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -327,22 +327,20 @@ auto RegexNFAState::serialize( for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { auto const serialized_positive_transition_it = positive_tagged_transition.serialize(state_ids); - if (serialized_positive_transition_it.has_value()) { - positive_tagged_transitions.emplace_back(serialized_positive_transition_it.value()); - } else { + if (false == serialized_positive_transition_it.has_value()) { return std::nullopt; } + positive_tagged_transitions.emplace_back(serialized_positive_transition_it.value()); } std::vector negative_tagged_transitions; for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { auto const 
serialized_negative_transition_it = negative_tagged_transition.serialize(state_ids); - if (serialized_negative_transition_it.has_value()) { - negative_tagged_transitions.emplace_back(serialized_negative_transition_it.value()); - } else { + if (false == serialized_negative_transition_it.has_value()) { return std::nullopt; } + negative_tagged_transitions.emplace_back(serialized_negative_transition_it.value()); } auto const accepting_tag_string From 8b85511e7216d0159d1ac00ec4cad23027ef55ed Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 27 Oct 2024 20:02:44 -0400 Subject: [PATCH 127/323] Use const& for passing rules into the NFA as rules are never stored, nor are parts of the rules stored, instead the rules are only read and used to build the NFA. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 37fcfd05..b8e27824 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -181,7 +181,7 @@ class RegexNFA { public: using StateVec = std::vector; - explicit RegexNFA(std::vector> rules); + explicit RegexNFA(std::vector> const& rules); /** * Creates a unique_ptr for an NFA state with no tagged transitions and adds it to `m_states`. @@ -359,7 +359,7 @@ auto RegexNFAState::serialize( } template -RegexNFA::RegexNFA(std::vector> rules) +RegexNFA::RegexNFA(std::vector> const& rules) : m_root{new_state()} { for (auto const& rule : rules) { rule.add_to_nfa(this); From 075679401ad3dc47acd1bf8b0b3e0dc171dc4746 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Sun, 27 Oct 2024 20:05:26 -0400 Subject: [PATCH 128/323] Use braced initialization for NFA. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- tests/test-NFA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index f243509b..beb35231 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -44,7 +44,7 @@ TEST_CASE("Test NFA", "[NFA]") { auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); vector rules; rules.emplace_back(0, std::move(capture_rule_ast.m_regex_ptr)); - ByteNFA const nfa(std::move(rules)); + ByteNFA const nfa{std::move(rules)}; // Compare against expected output string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2}," From 6ab439afd24a659a60f7cf2b5fc73ddfe678403b Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Sun, 27 Oct 2024 20:06:34 -0400 Subject: [PATCH 129/323] Remove warning for not checking std::optional when we know the function call succeeds in NFA's serialize. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index b8e27824..b347052d 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -446,6 +446,7 @@ auto RegexNFA::serialize() const -> std::string { for (auto const* state : traversal_order) { // `state_ids` is well-formed as its generated from `get_bfs_traversal_order` so we can // safely assume `state->serialize(state_ids)` will return a valid value. 
+ // NOLINTNEXTLINE(bugprone-unchecked-optional-access) serialized_states.emplace_back(state->serialize(state_ids).value()); } return fmt::format("{}\n", fmt::join(serialized_states, "\n")); From 924481243d333a4158896717d9ac67f65970f061 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 27 Oct 2024 20:08:37 -0400 Subject: [PATCH 130/323] Remove redundant initialization of member variables in tagged transition classes when they are initialized in their constructor. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index b347052d..47ab2ac5 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -56,8 +56,8 @@ class PositiveTaggedTransition { ) const -> std::optional; private: - uint32_t m_tag{}; - RegexNFAState const* m_dest_state{}; + uint32_t m_tag; + RegexNFAState const* m_dest_state; }; template @@ -84,7 +84,7 @@ class NegativeTaggedTransition { private: std::set m_tags; - RegexNFAState const* m_dest_state{}; + RegexNFAState const* m_dest_state; }; template From 0d151a43c8d1416d58d8adf963fb412b34615885 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 27 Oct 2024 20:12:00 -0400 Subject: [PATCH 131/323] Use member initialization lists for constructing NFA state from tagged transitions instead of emplace back. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 47ab2ac5..f8905f2d 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -94,13 +94,11 @@ class RegexNFAState { RegexNFAState() = default; - RegexNFAState(uint32_t const tag, RegexNFAState const* dest_state) { - m_positive_tagged_transitions.emplace_back(tag, dest_state); - } + RegexNFAState(uint32_t const tag, RegexNFAState const* dest_state) + : m_positive_tagged_transitions{{tag, dest_state}} {} - RegexNFAState(std::set tags, RegexNFAState const* dest_state) { - m_negative_tagged_transitions.emplace_back(std::move(tags), dest_state); - } + RegexNFAState(std::set tags, RegexNFAState const* dest_state) + : m_negative_tagged_transitions{{std::move(tags), dest_state}} {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } From ac63713891da8d707e86708048c7c95eccf6dc2d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 28 Oct 2024 05:34:25 -0400 Subject: [PATCH 132/323] Switch to using optional prefix for optional return types. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index f8905f2d..d064ecb3 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -323,22 +323,22 @@ auto RegexNFAState::serialize( std::vector positive_tagged_transitions; for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { - auto const serialized_positive_transition_it + auto const optional_serialized_positive_transition = positive_tagged_transition.serialize(state_ids); - if (false == serialized_positive_transition_it.has_value()) { + if (false == optional_serialized_positive_transition.has_value()) { return std::nullopt; } - positive_tagged_transitions.emplace_back(serialized_positive_transition_it.value()); + positive_tagged_transitions.emplace_back(optional_serialized_positive_transition.value()); } std::vector negative_tagged_transitions; for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { - auto const serialized_negative_transition_it + auto const optional_serialized_negative_transition = negative_tagged_transition.serialize(state_ids); - if (false == serialized_negative_transition_it.has_value()) { + if (false == optional_serialized_negative_transition.has_value()) { return std::nullopt; } - negative_tagged_transitions.emplace_back(serialized_negative_transition_it.value()); + negative_tagged_transitions.emplace_back(optional_serialized_negative_transition.value()); } auto const accepting_tag_string From b57b93fa3cadd309fe60cdfc3f7b7b67d77e8709 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 28 Oct 2024 09:38:15 -0400 Subject: [PATCH 133/323] Make negative tagged transition singular as you can never have more than one leaving an NFA state. 
--- src/log_surgeon/Lexer.tpp | 5 ++- src/log_surgeon/finite_automata/RegexAST.hpp | 8 ++-- src/log_surgeon/finite_automata/RegexNFA.hpp | 39 +++++++++++--------- tests/test-NFA.cpp | 26 ++++++------- 4 files changed, 41 insertions(+), 37 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 5cdade49..b500f461 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -405,8 +405,9 @@ auto Lexer::epsilon_closure(NFAStateType const* stat for (auto const& positive_tagged_transition : t->get_positive_tagged_transitions()) { stack.push(positive_tagged_transition.get_dest_state()); } - for (auto const& negative_tagged_transition : t->get_negative_tagged_transitions()) { - stack.push(negative_tagged_transition.get_dest_state()); + auto const* negative_dest_state = t->get_negative_tagged_transition().get_dest_state(); + if (nullptr != negative_dest_state) { + stack.push(t->get_negative_tagged_transition().get_dest_state()); } } } diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index eecf7462..fe891a1c 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -105,11 +105,11 @@ class RegexAST { auto add_to_nfa_with_negative_tags(RegexNFA* nfa, NFAStateType* end_state) const -> void { // Handle negative tags as: - // root --(regex)--> state_with_negative_tagged_transitions --(negative tags)--> end_state + // root --(regex)--> state_with_negative_tagged_transition --(negative tags)--> end_state if (false == m_negative_tags.empty()) { - auto* state_with_negative_tagged_transitions - = nfa->new_state_with_negative_tagged_transitions(m_negative_tags, end_state); - add_to_nfa(nfa, state_with_negative_tagged_transitions); + auto* state_with_negative_tagged_transition + = nfa->new_state_with_negative_tagged_transition(m_negative_tags, end_state); + add_to_nfa(nfa, state_with_negative_tagged_transition); } else { 
add_to_nfa(nfa, end_state); } diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index d064ecb3..163aec23 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -63,6 +63,8 @@ class PositiveTaggedTransition { template class NegativeTaggedTransition { public: + NegativeTaggedTransition() = default; + NegativeTaggedTransition(std::set tags, RegexNFAState const* dest_state) : m_tags{std::move(tags)}, m_dest_state{dest_state} {} @@ -84,7 +86,7 @@ class NegativeTaggedTransition { private: std::set m_tags; - RegexNFAState const* m_dest_state; + RegexNFAState const* m_dest_state{nullptr}; }; template @@ -98,7 +100,7 @@ class RegexNFAState { : m_positive_tagged_transitions{{tag, dest_state}} {} RegexNFAState(std::set tags, RegexNFAState const* dest_state) - : m_negative_tagged_transitions{{std::move(tags), dest_state}} {} + : m_negative_tagged_transition{std::move(tags), dest_state} {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } @@ -117,9 +119,9 @@ class RegexNFAState { return m_positive_tagged_transitions; } - [[nodiscard]] auto get_negative_tagged_transitions( - ) const -> std::vector> const& { - return m_negative_tagged_transitions; + [[nodiscard]] auto get_negative_tagged_transition( + ) const -> NegativeTaggedTransition const& { + return m_negative_tagged_transition; } auto add_epsilon_transition(RegexNFAState* epsilon_transition) -> void { @@ -163,7 +165,7 @@ class RegexNFAState { bool m_accepting{false}; uint32_t m_matching_variable_id{0}; std::vector> m_positive_tagged_transitions; - std::vector> m_negative_tagged_transitions; + NegativeTaggedTransition m_negative_tagged_transition; std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; // NOTE: We don't need m_tree_transitions for the `stateType == @@ -202,10 +204,11 @@ class RegexNFA { /** * Creates a unique_ptr for an NFA state with negative tagged 
transitions and adds it to * `m_states`. + * @param tags * @param dest_state * @return NFAStateType* */ - [[nodiscard]] auto new_state_with_negative_tagged_transitions( + [[nodiscard]] auto new_state_with_negative_tagged_transition( std::set tags, NFAStateType const* dest_state ) -> NFAStateType*; @@ -331,14 +334,14 @@ auto RegexNFAState::serialize( positive_tagged_transitions.emplace_back(optional_serialized_positive_transition.value()); } - std::vector negative_tagged_transitions; - for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { + std::string negative_tagged_transition; + if(nullptr != m_negative_tagged_transition.get_dest_state()) { auto const optional_serialized_negative_transition - = negative_tagged_transition.serialize(state_ids); + = m_negative_tagged_transition.serialize(state_ids); if (false == optional_serialized_negative_transition.has_value()) { return std::nullopt; } - negative_tagged_transitions.emplace_back(optional_serialized_negative_transition.value()); + negative_tagged_transition = optional_serialized_negative_transition.value(); } auto const accepting_tag_string @@ -346,13 +349,13 @@ auto RegexNFAState::serialize( return fmt::format( "{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_transitions={{" - "{}}},negative_tagged_transitions={{{}}}", + "{}}},negative_tagged_transition={{{}}}", state_ids.at(this), accepting_tag_string, fmt::join(byte_transitions, ","), fmt::join(epsilon_transitions, ","), fmt::join(positive_tagged_transitions, ","), - fmt::join(negative_tagged_transitions, ",") + negative_tagged_transition ); } @@ -380,7 +383,7 @@ auto RegexNFA::new_state_with_positive_tagged_transition( } template -auto RegexNFA::new_state_with_negative_tagged_transitions( +auto RegexNFA::new_state_with_negative_tagged_transition( std::set tags, NFAStateType const* dest_state ) -> NFAStateType* { @@ -422,10 +425,10 @@ auto RegexNFA::get_bfs_traversal_order( { 
add_to_queue_and_visited(positive_tagged_transition.get_dest_state()); } - for (auto const& negative_tagged_transition : - current_state->get_negative_tagged_transitions()) - { - add_to_queue_and_visited(negative_tagged_transition.get_dest_state()); + auto const* negative_dest_state + = current_state->get_negative_tagged_transition().get_dest_state(); + if (nullptr != negative_dest_state) { + add_to_queue_and_visited(negative_dest_state); } } return visited_order; diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index beb35231..0223c9bb 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -50,57 +50,57 @@ TEST_CASE("Test NFA", "[NFA]") { string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={}\n"; + "negative_tagged_transition={}\n"; expected_serialized_nfa += "1:byte_transitions={a-->3,b-->3,c-->4,d-->4}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={}\n"; + "negative_tagged_transition={}\n"; expected_serialized_nfa += "2:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={5[0,1,2,3]}\n"; + "negative_tagged_transition={5[0,1,2,3]}\n"; expected_serialized_nfa += "3:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={6[0]}," - "negative_tagged_transitions={}\n"; + "negative_tagged_transition={}\n"; expected_serialized_nfa += "4:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={7[1]}," - "negative_tagged_transitions={}\n"; + "negative_tagged_transition={}\n"; expected_serialized_nfa += "5:accepting_tag=0,byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={}\n"; + "negative_tagged_transition={}\n"; expected_serialized_nfa += "6:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={}," - 
"negative_tagged_transitions={8[1]}\n"; + "negative_tagged_transition={8[1]}\n"; expected_serialized_nfa += "7:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={8[0]}\n"; + "negative_tagged_transition={8[0]}\n"; expected_serialized_nfa += "8:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={9[2]}," - "negative_tagged_transitions={}\n"; + "negative_tagged_transition={}\n"; expected_serialized_nfa += "9:byte_transitions={B-->10}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={}\n"; + "negative_tagged_transition={}\n"; expected_serialized_nfa += "10:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" "11,7-->11,8-->11,9-->11}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={}\n"; + "negative_tagged_transition={}\n"; expected_serialized_nfa += "11:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" "11,7-->11,8-->11,9-->11}," "epsilon_transitions={}," "positive_tagged_transitions={12[3]}," - "negative_tagged_transitions={}\n"; + "negative_tagged_transition={}\n"; expected_serialized_nfa += "12:byte_transitions={C-->5}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={}\n"; + "negative_tagged_transition={}\n"; // Compare expected and actual line-by-line auto const actual_serialized_nfa = nfa.serialize(); From c3fb16d393ece2d9c911677872cd13b4948643d4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 28 Oct 2024 09:39:35 -0400 Subject: [PATCH 134/323] Add missing param for new_state_with_negative_tagged_transitions. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index d064ecb3..3d52ba91 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -202,6 +202,7 @@ class RegexNFA { /** * Creates a unique_ptr for an NFA state with negative tagged transitions and adds it to * `m_states`. + * @param tags * @param dest_state * @return NFAStateType* */ From 8a41367dfcbeed485863199c297820a64e4052a4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 28 Oct 2024 10:36:31 -0400 Subject: [PATCH 135/323] Move RegexNFAStateType, RegexNFAState, and PositiveTaggedTransition/NegativeTaggedTransition classes into their own files. --- CMakeLists.txt | 3 + src/log_surgeon/finite_automata/RegexNFA.hpp | 280 +----------------- .../finite_automata/RegexNFAState.hpp | 207 +++++++++++++ .../finite_automata/RegexNFAStateType.hpp | 19 ++ .../finite_automata/TaggedTransition.hpp | 92 ++++++ tests/CMakeLists.txt | 3 + 6 files changed, 325 insertions(+), 279 deletions(-) create mode 100644 src/log_surgeon/finite_automata/RegexNFAState.hpp create mode 100644 src/log_surgeon/finite_automata/RegexNFAStateType.hpp create mode 100644 src/log_surgeon/finite_automata/TaggedTransition.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a9916a3..50e45392 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,6 +97,9 @@ set(SOURCE_FILES src/log_surgeon/finite_automata/RegexDFA.hpp src/log_surgeon/finite_automata/RegexDFA.tpp src/log_surgeon/finite_automata/RegexNFA.hpp + src/log_surgeon/finite_automata/RegexNFAState.hpp + src/log_surgeon/finite_automata/RegexNFAStateType.hpp + src/log_surgeon/finite_automata/TaggedTransition.hpp src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp ) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp 
b/src/log_surgeon/finite_automata/RegexNFA.hpp index 3d52ba91..61090c09 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -1,15 +1,10 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_HPP #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_HPP -#include -#include -#include #include #include #include #include -#include -#include #include #include #include @@ -18,161 +13,10 @@ #include #include -#include +#include #include namespace log_surgeon::finite_automata { -enum class RegexNFAStateType : uint8_t { - Byte, - UTF8 -}; - -template -class RegexNFAState; - -using RegexNFAByteState = RegexNFAState; -using RegexNFAUTF8State = RegexNFAState; - -template -class PositiveTaggedTransition { -public: - PositiveTaggedTransition(uint32_t const tag, RegexNFAState const* dest_state) - : m_tag{tag}, - m_dest_state{dest_state} {} - - [[nodiscard]] auto get_tag() const -> uint32_t { return m_tag; } - - [[nodiscard]] auto get_dest_state() const -> RegexNFAState const* { - return m_dest_state; - } - - /** - * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the positive tagged transitions on success. - * @return std::nullopt if `m_dest_state` is not in `state_ids`. - */ - [[nodiscard]] auto serialize( - std::unordered_map const& state_ids - ) const -> std::optional; - -private: - uint32_t m_tag; - RegexNFAState const* m_dest_state; -}; - -template -class NegativeTaggedTransition { -public: - NegativeTaggedTransition(std::set tags, RegexNFAState const* dest_state) - : m_tags{std::move(tags)}, - m_dest_state{dest_state} {} - - [[nodiscard]] auto get_tags() const -> std::set const& { return m_tags; } - - [[nodiscard]] auto get_dest_state() const -> RegexNFAState const* { - return m_dest_state; - } - - /** - * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the negative tagged transitions on success. 
- * @return std::nullopt if `m_dest_state` is not in `state_ids`. - */ - [[nodiscard]] auto serialize( - std::unordered_map const& state_ids - ) const -> std::optional; - -private: - std::set m_tags; - RegexNFAState const* m_dest_state; -}; - -template -class RegexNFAState { -public: - using Tree = UnicodeIntervalTree; - - RegexNFAState() = default; - - RegexNFAState(uint32_t const tag, RegexNFAState const* dest_state) - : m_positive_tagged_transitions{{tag, dest_state}} {} - - RegexNFAState(std::set tags, RegexNFAState const* dest_state) - : m_negative_tagged_transitions{{std::move(tags), dest_state}} {} - - auto set_accepting(bool accepting) -> void { m_accepting = accepting; } - - [[nodiscard]] auto is_accepting() const -> bool const& { return m_accepting; } - - auto set_matching_variable_id(uint32_t const variable_id) -> void { - m_matching_variable_id = variable_id; - } - - [[nodiscard]] auto get_matching_variable_id() const -> uint32_t { - return m_matching_variable_id; - } - - [[nodiscard]] auto get_positive_tagged_transitions( - ) const -> std::vector> const& { - return m_positive_tagged_transitions; - } - - [[nodiscard]] auto get_negative_tagged_transitions( - ) const -> std::vector> const& { - return m_negative_tagged_transitions; - } - - auto add_epsilon_transition(RegexNFAState* epsilon_transition) -> void { - m_epsilon_transitions.push_back(epsilon_transition); - } - - [[nodiscard]] auto get_epsilon_transitions() const -> std::vector const& { - return m_epsilon_transitions; - } - - auto add_byte_transition(uint8_t byte, RegexNFAState* dest_state) -> void { - m_bytes_transitions[byte].push_back(dest_state); - } - - [[nodiscard]] auto get_byte_transitions(uint8_t byte - ) const -> std::vector const& { - return m_bytes_transitions[byte]; - } - - auto get_tree_transitions() -> Tree const& { return m_tree_transitions; } - - /** - Add dest_state to m_bytes_transitions if all values in interval are a byte, otherwise add - dest_state to m_tree_transitions - * 
@param interval - * @param dest_state - */ - auto add_interval(Interval interval, RegexNFAState* dest_state) -> void; - - /** - * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the NFA state on success. - * @return Forwards `PositiveTaggedTransition::serialize`'s or - * `NegativeTaggedTransition::serialize`'s return value (std::nullopt) on failure. - */ - [[nodiscard]] auto serialize( - std::unordered_map const& state_ids - ) const -> std::optional; - -private: - bool m_accepting{false}; - uint32_t m_matching_variable_id{0}; - std::vector> m_positive_tagged_transitions; - std::vector> m_negative_tagged_transitions; - std::vector m_epsilon_transitions; - std::array, cSizeOfByte> m_bytes_transitions; - // NOTE: We don't need m_tree_transitions for the `stateType == - // RegexDFAStateType::Byte` case, so we use an empty class (`std::tuple<>`) - // in that case. - std::conditional_t> - m_tree_transitions; -}; - // TODO: rename `RegexNFA` to `NFA` template class RegexNFA { @@ -235,128 +79,6 @@ class RegexNFA { NFAStateType* m_root; }; -template -auto PositiveTaggedTransition::serialize( - std::unordered_map const& state_ids -) const -> std::optional { - auto const state_id_it = state_ids.find(m_dest_state); - if (state_id_it == state_ids.end()) { - return std::nullopt; - } - return fmt::format("{}[{}]", state_id_it->second, m_tag); -} - -template -auto NegativeTaggedTransition::serialize( - std::unordered_map const& state_ids -) const -> std::optional { - auto const state_id_it = state_ids.find(m_dest_state); - if (state_id_it == state_ids.end()) { - return std::nullopt; - } - return fmt::format("{}[{}]", state_id_it->second, fmt::join(m_tags, ",")); -} - -template -void RegexNFAState::add_interval(Interval interval, RegexNFAState* dest_state) { - if (interval.first < cSizeOfByte) { - uint32_t const bound = std::min(interval.second, cSizeOfByte - 1); - for (uint32_t i = interval.first; i <= bound; i++) { - 
add_byte_transition(i, dest_state); - } - interval.first = bound + 1; - } - if constexpr (RegexNFAStateType::UTF8 == state_type) { - if (interval.second < cSizeOfByte) { - return; - } - std::unique_ptr> overlaps - = m_tree_transitions.pop(interval); - for (typename Tree::Data const& data : *overlaps) { - uint32_t overlap_low = std::max(data.m_interval.first, interval.first); - uint32_t overlap_high = std::min(data.m_interval.second, interval.second); - - std::vector tree_states = data.m_value; - tree_states.push_back(dest_state); - m_tree_transitions.insert(Interval(overlap_low, overlap_high), tree_states); - if (data.m_interval.first < interval.first) { - m_tree_transitions.insert( - Interval(data.m_interval.first, interval.first - 1), - data.m_value - ); - } else if (data.m_interval.first > interval.first) { - m_tree_transitions.insert( - Interval(interval.first, data.m_interval.first - 1), - {dest_state} - ); - } - if (data.m_interval.second > interval.second) { - m_tree_transitions.insert( - Interval(interval.second + 1, data.m_interval.second), - data.m_value - ); - } - interval.first = data.m_interval.second + 1; - } - if (interval.first != 0 && interval.first <= interval.second) { - m_tree_transitions.insert(interval, {dest_state}); - } - } -} - -template -auto RegexNFAState::serialize( - std::unordered_map const& state_ids -) const -> std::optional { - std::vector byte_transitions; - for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) { - for (auto const* dest_state : m_bytes_transitions[idx]) { - byte_transitions.emplace_back( - fmt::format("{}-->{}", static_cast(idx), state_ids.at(dest_state)) - ); - } - } - - std::vector epsilon_transitions; - for (auto const* dest_state : m_epsilon_transitions) { - epsilon_transitions.emplace_back(std::to_string(state_ids.at(dest_state))); - } - - std::vector positive_tagged_transitions; - for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { - auto const optional_serialized_positive_transition - = 
positive_tagged_transition.serialize(state_ids); - if (false == optional_serialized_positive_transition.has_value()) { - return std::nullopt; - } - positive_tagged_transitions.emplace_back(optional_serialized_positive_transition.value()); - } - - std::vector negative_tagged_transitions; - for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { - auto const optional_serialized_negative_transition - = negative_tagged_transition.serialize(state_ids); - if (false == optional_serialized_negative_transition.has_value()) { - return std::nullopt; - } - negative_tagged_transitions.emplace_back(optional_serialized_negative_transition.value()); - } - - auto const accepting_tag_string - = m_accepting ? fmt::format("accepting_tag={},", m_matching_variable_id) : ""; - - return fmt::format( - "{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_transitions={{" - "{}}},negative_tagged_transitions={{{}}}", - state_ids.at(this), - accepting_tag_string, - fmt::join(byte_transitions, ","), - fmt::join(epsilon_transitions, ","), - fmt::join(positive_tagged_transitions, ","), - fmt::join(negative_tagged_transitions, ",") - ); -} - template RegexNFA::RegexNFA(std::vector> const& rules) : m_root{new_state()} { diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp new file mode 100644 index 00000000..aca1d7eb --- /dev/null +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -0,0 +1,207 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE +#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace log_surgeon::finite_automata { +template +class RegexNFAState { +public: + using Tree = UnicodeIntervalTree; + + RegexNFAState() = default; + + RegexNFAState(uint32_t const tag, RegexNFAState const* dest_state) + : 
m_positive_tagged_transitions{{tag, dest_state}} {} + + RegexNFAState(std::set tags, RegexNFAState const* dest_state) + : m_negative_tagged_transitions{{std::move(tags), dest_state}} {} + + auto set_accepting(bool accepting) -> void { m_accepting = accepting; } + + [[nodiscard]] auto is_accepting() const -> bool const& { return m_accepting; } + + auto set_matching_variable_id(uint32_t const variable_id) -> void { + m_matching_variable_id = variable_id; + } + + [[nodiscard]] auto get_matching_variable_id() const -> uint32_t { + return m_matching_variable_id; + } + + [[nodiscard]] auto get_positive_tagged_transitions( + ) const -> std::vector> const& { + return m_positive_tagged_transitions; + } + + [[nodiscard]] auto get_negative_tagged_transitions( + ) const -> std::vector> const& { + return m_negative_tagged_transitions; + } + + auto add_epsilon_transition(RegexNFAState* epsilon_transition) -> void { + m_epsilon_transitions.push_back(epsilon_transition); + } + + [[nodiscard]] auto get_epsilon_transitions() const -> std::vector const& { + return m_epsilon_transitions; + } + + auto add_byte_transition(uint8_t byte, RegexNFAState* dest_state) -> void { + m_bytes_transitions[byte].push_back(dest_state); + } + + [[nodiscard]] auto get_byte_transitions(uint8_t byte + ) const -> std::vector const& { + return m_bytes_transitions[byte]; + } + + auto get_tree_transitions() -> Tree const& { return m_tree_transitions; } + + /** + Add dest_state to m_bytes_transitions if all values in interval are a byte, otherwise add + dest_state to m_tree_transitions + * @param interval + * @param dest_state + */ + auto add_interval(Interval interval, RegexNFAState* dest_state) -> void; + + /** + * @param state_ids A map of states to their unique identifiers. + * @return A string representation of the NFA state on success. + * @return Forwards `PositiveTaggedTransition::serialize`'s or + * `NegativeTaggedTransition::serialize`'s return value (std::nullopt) on failure. 
+ */ + [[nodiscard]] auto serialize( + std::unordered_map const& state_ids + ) const -> std::optional; + +private: + bool m_accepting{false}; + uint32_t m_matching_variable_id{0}; + std::vector> m_positive_tagged_transitions; + std::vector> m_negative_tagged_transitions; + std::vector m_epsilon_transitions; + std::array, cSizeOfByte> m_bytes_transitions; + // NOTE: We don't need m_tree_transitions for the `stateType == + // RegexDFAStateType::Byte` case, so we use an empty class (`std::tuple<>`) + // in that case. + std::conditional_t> + m_tree_transitions; +}; + +template +void RegexNFAState::add_interval(Interval interval, RegexNFAState* dest_state) { + if (interval.first < cSizeOfByte) { + uint32_t const bound = std::min(interval.second, cSizeOfByte - 1); + for (uint32_t i = interval.first; i <= bound; i++) { + add_byte_transition(i, dest_state); + } + interval.first = bound + 1; + } + if constexpr (RegexNFAStateType::UTF8 == state_type) { + if (interval.second < cSizeOfByte) { + return; + } + std::unique_ptr> overlaps + = m_tree_transitions.pop(interval); + for (typename Tree::Data const& data : *overlaps) { + uint32_t overlap_low = std::max(data.m_interval.first, interval.first); + uint32_t overlap_high = std::min(data.m_interval.second, interval.second); + + std::vector tree_states = data.m_value; + tree_states.push_back(dest_state); + m_tree_transitions.insert(Interval(overlap_low, overlap_high), tree_states); + if (data.m_interval.first < interval.first) { + m_tree_transitions.insert( + Interval(data.m_interval.first, interval.first - 1), + data.m_value + ); + } else if (data.m_interval.first > interval.first) { + m_tree_transitions.insert( + Interval(interval.first, data.m_interval.first - 1), + {dest_state} + ); + } + if (data.m_interval.second > interval.second) { + m_tree_transitions.insert( + Interval(interval.second + 1, data.m_interval.second), + data.m_value + ); + } + interval.first = data.m_interval.second + 1; + } + if (interval.first != 0 && 
interval.first <= interval.second) { + m_tree_transitions.insert(interval, {dest_state}); + } + } +} + +template +auto RegexNFAState::serialize( + std::unordered_map const& state_ids +) const -> std::optional { + std::vector byte_transitions; + for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) { + for (auto const* dest_state : m_bytes_transitions[idx]) { + byte_transitions.emplace_back( + fmt::format("{}-->{}", static_cast(idx), state_ids.at(dest_state)) + ); + } + } + + std::vector epsilon_transitions; + for (auto const* dest_state : m_epsilon_transitions) { + epsilon_transitions.emplace_back(std::to_string(state_ids.at(dest_state))); + } + + std::vector positive_tagged_transitions; + for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { + auto const optional_serialized_positive_transition + = positive_tagged_transition.serialize(state_ids); + if (false == optional_serialized_positive_transition.has_value()) { + return std::nullopt; + } + positive_tagged_transitions.emplace_back(optional_serialized_positive_transition.value()); + } + + std::vector negative_tagged_transitions; + for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { + auto const optional_serialized_negative_transition + = negative_tagged_transition.serialize(state_ids); + if (false == optional_serialized_negative_transition.has_value()) { + return std::nullopt; + } + negative_tagged_transitions.emplace_back(optional_serialized_negative_transition.value()); + } + + auto const accepting_tag_string + = m_accepting ? 
fmt::format("accepting_tag={},", m_matching_variable_id) : ""; + + return fmt::format( + "{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_transitions={{" + "{}}},negative_tagged_transitions={{{}}}", + state_ids.at(this), + accepting_tag_string, + fmt::join(byte_transitions, ","), + fmt::join(epsilon_transitions, ","), + fmt::join(positive_tagged_transitions, ","), + fmt::join(negative_tagged_transitions, ",") + ); +} +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE diff --git a/src/log_surgeon/finite_automata/RegexNFAStateType.hpp b/src/log_surgeon/finite_automata/RegexNFAStateType.hpp new file mode 100644 index 00000000..e190e387 --- /dev/null +++ b/src/log_surgeon/finite_automata/RegexNFAStateType.hpp @@ -0,0 +1,19 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE_TYPE +#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE_TYPE + +#include + +namespace log_surgeon::finite_automata { +enum class RegexNFAStateType : uint8_t { + Byte, + UTF8 +}; + +template +class RegexNFAState; + +using RegexNFAByteState = RegexNFAState; +using RegexNFAUTF8State = RegexNFAState; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE_TYPE diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp new file mode 100644 index 00000000..21282986 --- /dev/null +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -0,0 +1,92 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION +#define LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION + +#include +#include +#include +#include +#include + +#include + +#include + +namespace log_surgeon::finite_automata { +template +class PositiveTaggedTransition { +public: + PositiveTaggedTransition(uint32_t const tag, RegexNFAState const* dest_state) + : m_tag{tag}, + m_dest_state{dest_state} {} + + [[nodiscard]] auto get_tag() const -> uint32_t { return m_tag; 
} + + [[nodiscard]] auto get_dest_state() const -> RegexNFAState const* { + return m_dest_state; + } + + /** + * @param state_ids A map of states to their unique identifiers. + * @return A string representation of the positive tagged transitions on success. + * @return std::nullopt if `m_dest_state` is not in `state_ids`. + */ + [[nodiscard]] auto serialize( + std::unordered_map const& state_ids + ) const -> std::optional; + +private: + uint32_t m_tag; + RegexNFAState const* m_dest_state; +}; + +template +class NegativeTaggedTransition { +public: + NegativeTaggedTransition(std::set tags, RegexNFAState const* dest_state) + : m_tags{std::move(tags)}, + m_dest_state{dest_state} {} + + [[nodiscard]] auto get_tags() const -> std::set const& { return m_tags; } + + [[nodiscard]] auto get_dest_state() const -> RegexNFAState const* { + return m_dest_state; + } + + /** + * @param state_ids A map of states to their unique identifiers. + * @return A string representation of the negative tagged transitions on success. + * @return std::nullopt if `m_dest_state` is not in `state_ids`. 
+ */ + [[nodiscard]] auto serialize( + std::unordered_map const& state_ids + ) const -> std::optional; + +private: + std::set m_tags; + RegexNFAState const* m_dest_state; +}; + +template +auto PositiveTaggedTransition::serialize( + std::unordered_map const& state_ids +) const -> std::optional { + auto const state_id_it = state_ids.find(m_dest_state); + if (state_id_it == state_ids.end()) { + return std::nullopt; + } + return fmt::format("{}[{}]", state_id_it->second, m_tag); +} + +template +auto NegativeTaggedTransition::serialize( + std::unordered_map const& state_ids +) const -> std::optional { + auto const state_id_it = state_ids.find(m_dest_state); + if (state_id_it == state_ids.end()) { + return std::nullopt; + } + return fmt::format("{}[{}]", state_id_it->second, fmt::join(m_tags, ",")); +} +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8c45b07b..b7afd1f1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,6 +4,9 @@ set( ../src/log_surgeon/FileReader.hpp ../src/log_surgeon/finite_automata/RegexAST.hpp ../src/log_surgeon/finite_automata/RegexNFA.hpp + ../src/log_surgeon/finite_automata/RegexNFAState.hpp + ../src/log_surgeon/finite_automata/RegexNFAStateType.hpp + ../src/log_surgeon/finite_automata/TaggedTransition.hpp ../src/log_surgeon/LALR1Parser.cpp ../src/log_surgeon/LALR1Parser.hpp ../src/log_surgeon/LALR1Parser.tpp From d1a57e4fa6d9f0826bfb74f91f1d52b7d2769ff2 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 28 Oct 2024 11:57:58 -0400 Subject: [PATCH 136/323] Add tag class. 
--- CMakeLists.txt | 1 + src/log_surgeon/finite_automata/RegexAST.hpp | 1 + src/log_surgeon/finite_automata/Tag.hpp | 16 ++++++++++++++++ .../finite_automata/TaggedTransition.hpp | 1 + tests/CMakeLists.txt | 1 + 5 files changed, 20 insertions(+) create mode 100644 src/log_surgeon/finite_automata/Tag.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 50e45392..e76ecb8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,7 @@ set(SOURCE_FILES src/log_surgeon/finite_automata/RegexNFA.hpp src/log_surgeon/finite_automata/RegexNFAState.hpp src/log_surgeon/finite_automata/RegexNFAStateType.hpp + src/log_surgeon/finite_automata/Tag.hpp src/log_surgeon/finite_automata/TaggedTransition.hpp src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index eecf7462..94b92182 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -19,6 +19,7 @@ #include #include +#include #include namespace log_surgeon::finite_automata { diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp new file mode 100644 index 00000000..3f4eaf34 --- /dev/null +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -0,0 +1,16 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_TAG +#define LOG_SURGEON_FINITE_AUTOMATA_TAG + +#include +#include +#include + +namespace log_surgeon::finite_automata { +struct Tag { + std::string name; + std::vector starts; + std::vector ends; +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_TAG diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 21282986..2b8941eb 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -10,6 +10,7 @@ #include 
#include +#include namespace log_surgeon::finite_automata { template diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b7afd1f1..e15ec233 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -6,6 +6,7 @@ set( ../src/log_surgeon/finite_automata/RegexNFA.hpp ../src/log_surgeon/finite_automata/RegexNFAState.hpp ../src/log_surgeon/finite_automata/RegexNFAStateType.hpp + ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp ../src/log_surgeon/LALR1Parser.cpp ../src/log_surgeon/LALR1Parser.hpp From bc78f5921b02701f05d32090aaa565b486363a7c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 29 Oct 2024 10:34:50 -0400 Subject: [PATCH 137/323] Make tag an object with name, start, and end information, instead of just an id. This object is created and owned by the capture AST, and other AST and NFA states point to these tags. --- src/log_surgeon/SchemaParser.cpp | 8 +-- src/log_surgeon/SchemaParser.hpp | 20 -------- src/log_surgeon/finite_automata/RegexAST.hpp | 50 +++++++++---------- src/log_surgeon/finite_automata/RegexNFA.hpp | 22 ++++---- .../finite_automata/RegexNFAState.hpp | 17 ++++--- src/log_surgeon/finite_automata/Tag.hpp | 15 ++++-- .../finite_automata/TaggedTransition.hpp | 22 ++++---- tests/test-NFA.cpp | 21 ++++---- tests/test-lexer.cpp | 36 ++++++------- 9 files changed, 101 insertions(+), 110 deletions(-) diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index d74167fa..4558a064 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -161,13 +162,12 @@ auto SchemaParser::existing_schema_rule(NonTerminal* m) -> unique_ptr return schema_ast; } -auto SchemaParser::regex_capture_rule(NonTerminal* m) -> std::unique_ptr { - auto* r4 = dynamic_cast(m->non_terminal_cast(3)->get_parser_ast().get()); +static auto regex_capture_rule(NonTerminal const* m) -> 
std::unique_ptr { + auto const* r4 = dynamic_cast(m->non_terminal_cast(3)->get_parser_ast().get()); auto& r6 = m->non_terminal_cast(5)->get_parser_ast()->get>(); return std::make_unique(make_unique( - r4->m_name, std::move(r6), - m_capture_group_id_generator.assign_next_id() + std::make_unique(r4->m_name) )); } diff --git a/src/log_surgeon/SchemaParser.hpp b/src/log_surgeon/SchemaParser.hpp index 004ec495..c5081287 100644 --- a/src/log_surgeon/SchemaParser.hpp +++ b/src/log_surgeon/SchemaParser.hpp @@ -8,17 +8,6 @@ #include namespace log_surgeon { -/** - * Class for generating monotonically increasing integer IDs. - */ -class UniqueIdGenerator { -public: - [[nodiscard]] auto assign_next_id() -> uint32_t { return m_next_id++; } - -private: - uint32_t m_next_id{0}; -}; - // ASTs used in SchemaParser AST class SchemaAST : public ParserAST { public: @@ -113,13 +102,6 @@ class SchemaParser : public LALR1Parser< */ auto existing_schema_rule(NonTerminal* m) -> std::unique_ptr; - /** - * A semantic rule for regex capture groups that needs access to `m_capture_group_id_generator`. - * @param m - * @return A unique pointer to the parsed regex capture group. 
- */ - auto regex_capture_rule(NonTerminal* m) -> std::unique_ptr; - /** * After lexing half of the buffer, reads into that half of the buffer and * changes variables accordingly @@ -146,8 +128,6 @@ class SchemaParser : public LALR1Parser< auto generate_schema_ast(Reader& reader) -> std::unique_ptr; static inline std::unordered_map m_special_regex_characters; - - UniqueIdGenerator m_capture_group_id_generator; }; } // namespace log_surgeon diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 94b92182..ed44c248 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -82,19 +82,19 @@ class RegexAST { */ [[nodiscard]] virtual auto serialize() const -> std::u32string = 0; - [[nodiscard]] auto get_subtree_positive_tags() const -> std::set const& { + [[nodiscard]] auto get_subtree_positive_tags() const -> std::set const& { return m_subtree_positive_tags; } - auto set_subtree_positive_tags(std::set subtree_positive_tags) -> void { + auto set_subtree_positive_tags(std::set subtree_positive_tags) -> void { m_subtree_positive_tags = std::move(subtree_positive_tags); } - auto add_subtree_positive_tags(std::set subtree_positive_tags) -> void { + auto add_subtree_positive_tags(std::set subtree_positive_tags) -> void { m_subtree_positive_tags.merge(subtree_positive_tags); } - auto set_negative_tags(std::set negative_tags) -> void { + auto set_negative_tags(std::set negative_tags) -> void { m_negative_tags = std::move(negative_tags); } @@ -128,9 +128,10 @@ class RegexAST { } auto const transformed_negative_tags - = m_negative_tags | std::ranges::views::transform([](uint32_t tag) { - return fmt::format("<~{}>", tag); - }); + = m_negative_tags + | std::ranges::views::transform([](Tag const* tag) { + return fmt::format("<~{}>", tag->get_name()); + }); auto const negative_tags_string = fmt::format("{}", fmt::join(transformed_negative_tags, "")); @@ -141,8 +142,8 @@ class 
RegexAST { } private: - std::set m_subtree_positive_tags; - std::set m_negative_tags; + std::set m_subtree_positive_tags; + std::set m_negative_tags; }; /** @@ -634,26 +635,23 @@ class RegexASTCapture : public RegexAST { ~RegexASTCapture() override = default; RegexASTCapture( - std::string group_name, std::unique_ptr> group_regex_ast, - uint32_t const tag + std::unique_ptr tag ) - : m_group_name(std::move(group_name)), - m_group_regex_ast(std::move(group_regex_ast)), - m_tag(tag) { + : m_group_regex_ast{std::move(group_regex_ast)}, + m_tag{std::move(tag)} { RegexAST::set_subtree_positive_tags( m_group_regex_ast->get_subtree_positive_tags() ); - RegexAST::add_subtree_positive_tags({m_tag}); + RegexAST::add_subtree_positive_tags({m_tag.get()}); } RegexASTCapture(RegexASTCapture const& rhs) - : RegexAST(rhs), - m_group_name(rhs.m_group_name), - m_group_regex_ast( + : RegexAST{rhs}, + m_group_regex_ast{ std::unique_ptr>(rhs.m_group_regex_ast->clone()) - ), - m_tag(rhs.m_tag) { + }, + m_tag{rhs.m_tag ? 
std::make_unique(*rhs.m_tag) : nullptr} { RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); } @@ -698,19 +696,16 @@ class RegexASTCapture : public RegexAST { [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto get_group_name() const -> std::string const& { return m_group_name; } + [[nodiscard]] auto get_group_name() const -> std::string const& { return m_tag->get_name(); } [[nodiscard]] auto get_group_regex_ast( ) const -> std::unique_ptr> const& { return m_group_regex_ast; } - [[nodiscard]] auto get_tag() const -> uint32_t { return m_tag; } - private: - std::string m_group_name; std::unique_ptr> m_group_regex_ast; - uint32_t m_tag; + std::unique_ptr m_tag; }; template @@ -895,16 +890,17 @@ template void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const { auto* state_with_positive_tagged_transition - = nfa->new_state_with_positive_tagged_transition(m_tag, end_state); + = nfa->new_state_with_positive_tagged_transition(m_tag.get(), end_state); m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, state_with_positive_tagged_transition); } template [[nodiscard]] auto RegexASTCapture::serialize() const -> std::u32string { + auto const tag_name_u32 = std::u32string(m_tag->get_name().begin(), m_tag->get_name().end()); return fmt::format( U"({})<{}>{}", nullptr != m_group_regex_ast ? m_group_regex_ast->serialize() : U"null", - m_tag, + tag_name_u32, RegexAST::serialize_negative_tags() ); } diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 61090c09..7639769f 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -23,7 +23,7 @@ class RegexNFA { public: using StateVec = std::vector; - explicit RegexNFA(std::vector> const& rules); + explicit RegexNFA(std::vector> rules); /** * Creates a unique_ptr for an NFA state with no tagged transitions and adds it to `m_states`. 
@@ -39,7 +39,7 @@ class RegexNFA { * @return NFAStateType* */ [[nodiscard]] auto new_state_with_positive_tagged_transition( - uint32_t tag, + Tag const* tag, NFAStateType const* dest_state ) -> NFAStateType*; @@ -51,7 +51,7 @@ class RegexNFA { * @return NFAStateType* */ [[nodiscard]] auto new_state_with_negative_tagged_transitions( - std::set tags, + std::set tags, NFAStateType const* dest_state ) -> NFAStateType*; @@ -77,12 +77,16 @@ class RegexNFA { private: std::vector> m_states; NFAStateType* m_root; + // Store the rules locally as they contain information needed by the NFA. E.g., transitions in + // the NFA point to tags in the rule ASTs. + std::vector> m_rules; }; template -RegexNFA::RegexNFA(std::vector> const& rules) - : m_root{new_state()} { - for (auto const& rule : rules) { +RegexNFA::RegexNFA(std::vector> rules) + : m_root{new_state()}, + m_rules{std::move(rules)} { + for (auto const& rule : m_rules) { rule.add_to_nfa(this); } } @@ -95,7 +99,7 @@ auto RegexNFA::new_state() -> NFAStateType* { template auto RegexNFA::new_state_with_positive_tagged_transition( - uint32_t const tag, + Tag const* tag, NFAStateType const* dest_state ) -> NFAStateType* { m_states.emplace_back(std::make_unique(tag, dest_state)); @@ -104,10 +108,10 @@ auto RegexNFA::new_state_with_positive_tagged_transition( template auto RegexNFA::new_state_with_negative_tagged_transitions( - std::set tags, + std::set tags, NFAStateType const* dest_state ) -> NFAStateType* { - m_states.emplace_back(std::make_unique(tags, dest_state)); + m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); return m_states.back().get(); } diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index aca1d7eb..4e32aa4f 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -1,15 +1,16 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE #define 
LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE -#include -#include -#include -#include -#include #include -#include +#include #include +#include +#include #include +#include +#include +#include + #include #include @@ -24,10 +25,10 @@ class RegexNFAState { RegexNFAState() = default; - RegexNFAState(uint32_t const tag, RegexNFAState const* dest_state) + RegexNFAState(Tag const* tag, RegexNFAState const* dest_state) : m_positive_tagged_transitions{{tag, dest_state}} {} - RegexNFAState(std::set tags, RegexNFAState const* dest_state) + RegexNFAState(std::set tags, RegexNFAState const* dest_state) : m_negative_tagged_transitions{{std::move(tags), dest_state}} {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp index 3f4eaf34..927b5062 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -3,13 +3,20 @@ #include #include +#include #include namespace log_surgeon::finite_automata { -struct Tag { - std::string name; - std::vector starts; - std::vector ends; +class Tag { +public: + explicit Tag(std::string name) : m_name{std::move(name)} {} + + [[nodiscard]] auto get_name() const -> std::string const& { return m_name; } + +private: + std::string const m_name; + std::vector m_starts; + std::vector m_ends; }; } // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 2b8941eb..7795423a 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -16,12 +16,10 @@ namespace log_surgeon::finite_automata { template class PositiveTaggedTransition { public: - PositiveTaggedTransition(uint32_t const tag, RegexNFAState const* dest_state) + PositiveTaggedTransition(Tag const* tag, RegexNFAState const* dest_state) : m_tag{tag}, m_dest_state{dest_state} 
{} - [[nodiscard]] auto get_tag() const -> uint32_t { return m_tag; } - [[nodiscard]] auto get_dest_state() const -> RegexNFAState const* { return m_dest_state; } @@ -36,19 +34,17 @@ class PositiveTaggedTransition { ) const -> std::optional; private: - uint32_t m_tag; + Tag const* m_tag; RegexNFAState const* m_dest_state; }; template class NegativeTaggedTransition { public: - NegativeTaggedTransition(std::set tags, RegexNFAState const* dest_state) + NegativeTaggedTransition(std::set tags, RegexNFAState const* dest_state) : m_tags{std::move(tags)}, m_dest_state{dest_state} {} - [[nodiscard]] auto get_tags() const -> std::set const& { return m_tags; } - [[nodiscard]] auto get_dest_state() const -> RegexNFAState const* { return m_dest_state; } @@ -63,7 +59,7 @@ class NegativeTaggedTransition { ) const -> std::optional; private: - std::set m_tags; + std::set const m_tags; RegexNFAState const* m_dest_state; }; @@ -75,7 +71,7 @@ auto PositiveTaggedTransition::serialize( if (state_id_it == state_ids.end()) { return std::nullopt; } - return fmt::format("{}[{}]", state_id_it->second, m_tag); + return fmt::format("{}[{}]", state_id_it->second, m_tag->get_name()); } template @@ -86,7 +82,13 @@ auto NegativeTaggedTransition::serialize( if (state_id_it == state_ids.end()) { return std::nullopt; } - return fmt::format("{}[{}]", state_id_it->second, fmt::join(m_tags, ",")); + + auto const tag_names + = m_tags + | std::ranges::views::transform([](Tag const* tag) { + return tag->get_name(); + }); + return fmt::format("{}[{}]", state_id_it->second, fmt::join(tag_names, ",")); } } // namespace log_surgeon::finite_automata diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index beb35231..d37badde 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -55,17 +55,18 @@ TEST_CASE("Test NFA", "[NFA]") { "epsilon_transitions={}," "positive_tagged_transitions={}," "negative_tagged_transitions={}\n"; - expected_serialized_nfa += "2:byte_transitions={}," - "epsilon_transitions={}," - 
"positive_tagged_transitions={}," - "negative_tagged_transitions={5[0,1,2,3]}\n"; + expected_serialized_nfa + += "2:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_transitions={}," + "negative_tagged_transitions={5[containerID,letter2,letter,letter1]}\n"; expected_serialized_nfa += "3:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={6[0]}," + "positive_tagged_transitions={6[letter1]}," "negative_tagged_transitions={}\n"; expected_serialized_nfa += "4:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={7[1]}," + "positive_tagged_transitions={7[letter2]}," "negative_tagged_transitions={}\n"; expected_serialized_nfa += "5:accepting_tag=0,byte_transitions={}," "epsilon_transitions={}," @@ -74,14 +75,14 @@ TEST_CASE("Test NFA", "[NFA]") { expected_serialized_nfa += "6:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={8[1]}\n"; + "negative_tagged_transitions={8[letter2]}\n"; expected_serialized_nfa += "7:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transitions={8[0]}\n"; + "negative_tagged_transitions={8[letter1]}\n"; expected_serialized_nfa += "8:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={9[2]}," + "positive_tagged_transitions={9[letter]}," "negative_tagged_transitions={}\n"; expected_serialized_nfa += "9:byte_transitions={B-->10}," "epsilon_transitions={}," @@ -95,7 +96,7 @@ TEST_CASE("Test NFA", "[NFA]") { expected_serialized_nfa += "11:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" "11,7-->11,8-->11,9-->11}," "epsilon_transitions={}," - "positive_tagged_transitions={12[3]}," + "positive_tagged_transitions={12[containerID]}," "negative_tagged_transitions={}\n"; expected_serialized_nfa += "12:byte_transitions={C-->5}," "epsilon_transitions={}," diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 
abfa460e..c6129ec9 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -152,13 +152,13 @@ TEST_CASE("Test the Schema class", "[Schema]") { "?\\d+" ")C" ")", - U"(Z<~0><~1><~2><~3>)|(" + U"(Z<~letter1><~letter><~containerID><~letter2>)|(" "A(" - "(((a)|(b))<0><~1>)|" - "(((c)|(d))<1><~0>)" - ")<2>B(" + "(((a)|(b))<~letter2>)|" + "(((c)|(d))<~letter1>)" + ")B(" "([0-9]){1,inf}" - ")<3>C" + ")C" ")" // clang-format on ); @@ -172,13 +172,13 @@ TEST_CASE("Test the Schema class", "[Schema]") { test_regex_ast("capture:a+", U"(a){1,inf}"); // Repetition with capture groups untagged and tagged AST are different - test_regex_ast("capture:(?a){0,10}", U"(<~0>)|(((a)<0>){1,10})"); - test_regex_ast("capture:(?a){5,10}", U"((a)<0>){5,10}"); - test_regex_ast("capture:(?a)*", U"(<~0>)|(((a)<0>){1,inf})"); - test_regex_ast("capture:(?a)+", U"((a)<0>){1,inf}"); + test_regex_ast("capture:(?a){0,10}", U"(<~letter>)|(((a)){1,10})"); + test_regex_ast("capture:(?a){5,10}", U"((a)){5,10}"); + test_regex_ast("capture:(?a)*", U"(<~letter>)|(((a)){1,inf})"); + test_regex_ast("capture:(?a)+", U"((a)){1,inf}"); // Capture group with repetition - test_regex_ast("capture:(?a{0,10})", U"(()|((a){1,10}))<0>"); + test_regex_ast("capture:(?a{0,10})", U"(()|((a){1,10}))"); // Complex repetition test_regex_ast( @@ -196,16 +196,16 @@ TEST_CASE("Test the Schema class", "[Schema]") { "){0,10}" ")", U"(" - U"(<~0><~1>)|((" - U"((a)<0><~1>)|" - U"((b)<1><~0>)" + U"(<~letterA><~letterB>)|((" + U"((a)<~letterB>)|" + U"((b)<~letterA>)" U"){1,inf})" - U"<~2><~3>)|(" - U"(<~2><~3>)|((" - U"((c)<2><~3>)|" - U"((d)<3><~2>)" + U"<~letterD><~letterC>)|(" + U"(<~letterD><~letterC>)|((" + U"((c)<~letterD>)|" + U"((d)<~letterC>)" U"){1,10})" - U"<~0><~1>)" + U"<~letterA><~letterB>)" // clang-format on ); } From ac7260f509f2dda712162f8f3a3b6e0a3267959a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 29 Oct 2024 10:35:56 -0400 Subject: [PATCH 138/323] Run linter. 
--- src/log_surgeon/finite_automata/RegexNFAState.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index aca1d7eb..0ea09fd7 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -1,15 +1,16 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE -#include -#include -#include -#include -#include #include -#include +#include #include +#include +#include #include +#include +#include +#include + #include #include From c2eea21d1427315166b3f30afac21fe10cdd9aac Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 06:53:24 -0400 Subject: [PATCH 139/323] Change t to curr_state and u to dest_state. --- src/log_surgeon/Lexer.tpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index b500f461..92af6b26 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -394,20 +394,23 @@ auto Lexer::epsilon_closure(NFAStateType const* stat std::stack stack; stack.push(state_ptr); while (!stack.empty()) { - NFAStateType const* t = stack.top(); + NFAStateType const* curr_state = stack.top(); stack.pop(); - if (closure_set.insert(t).second) { - for (NFAStateType* const u : t->get_epsilon_transitions()) { - stack.push(u); + if (closure_set.insert(curr_state).second) { + for (NFAStateType* const dest_state : curr_state->get_epsilon_transitions()) { + stack.push(dest_state); } // TODO: currently treat tagged transitions as epsilon transitions - for (auto const& positive_tagged_transition : t->get_positive_tagged_transitions()) { + for (auto const& positive_tagged_transition : + curr_state->get_positive_tagged_transitions()) + { stack.push(positive_tagged_transition.get_dest_state()); } - auto const* negative_dest_state = 
t->get_negative_tagged_transition().get_dest_state(); + auto const* negative_dest_state + = curr_state->get_negative_tagged_transition().get_dest_state(); if (nullptr != negative_dest_state) { - stack.push(t->get_negative_tagged_transition().get_dest_state()); + stack.push(curr_state->get_negative_tagged_transition().get_dest_state()); } } } From 629fce974c30cab39f7628f50d590782225a6c97 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 07:25:14 -0400 Subject: [PATCH 140/323] Change curr_state to current_state; Remove extraneous *; Add newline that was accidentally removed. --- src/log_surgeon/Lexer.tpp | 12 ++++++------ src/log_surgeon/finite_automata/RegexNFA.hpp | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 92af6b26..d03afd3f 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -394,23 +394,23 @@ auto Lexer::epsilon_closure(NFAStateType const* stat std::stack stack; stack.push(state_ptr); while (!stack.empty()) { - NFAStateType const* curr_state = stack.top(); + NFAStateType const* current_state = stack.top(); stack.pop(); - if (closure_set.insert(curr_state).second) { - for (NFAStateType* const dest_state : curr_state->get_epsilon_transitions()) { + if (closure_set.insert(current_state).second) { + for (NFAStateType* const dest_state : current_state->get_epsilon_transitions()) { stack.push(dest_state); } // TODO: currently treat tagged transitions as epsilon transitions for (auto const& positive_tagged_transition : - curr_state->get_positive_tagged_transitions()) + current_state->get_positive_tagged_transitions()) { stack.push(positive_tagged_transition.get_dest_state()); } auto const* negative_dest_state - = curr_state->get_negative_tagged_transition().get_dest_state(); + = current_state->get_negative_tagged_transition().get_dest_state(); if (nullptr != negative_dest_state) { - 
stack.push(curr_state->get_negative_tagged_transition().get_dest_state()); + stack.push(negative_dest_state); } } } diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 9d7679fa..7629fd2e 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -148,7 +148,7 @@ class RegexNFAState { /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the NFA state on success. - * @return Forwards `PositiveTaggedTransition::serialize`'s return value (std::nullopt) on * + * @return Forwards `PositiveTaggedTransition::serialize`'s return value (std::nullopt) on * failure. * @return Forwards `NegativeTaggedTransition::serialize`'s return value (std::nullopt) on * failure. @@ -341,6 +341,7 @@ auto RegexNFAState::serialize( auto const accepting_tag_string = m_accepting ? fmt::format("accepting_tag={},", m_matching_variable_id) : ""; + return fmt::format( "{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_transitions={{" "{}}},negative_tagged_transition={{{}}}", From aed62b2a8580d9da7d11ac4b1fc52616b2575fa9 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 07:33:03 -0400 Subject: [PATCH 141/323] Add TODO for utf8 case in BFS. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 7629fd2e..672512d2 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -406,6 +406,7 @@ auto RegexNFA::get_bfs_traversal_order() const -> std::vectorget_byte_transitions(idx)) { add_to_queue_and_visited(dest_state); From 34522a7810ee06cab2672f35ea903708be5d7f48 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 08:41:51 -0400 Subject: [PATCH 142/323] Use auto and fix order of const wrt to *. 
--- src/log_surgeon/Lexer.tpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index d03afd3f..8ae8e1f0 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -394,10 +394,10 @@ auto Lexer::epsilon_closure(NFAStateType const* stat std::stack stack; stack.push(state_ptr); while (!stack.empty()) { - NFAStateType const* current_state = stack.top(); + auto const* current_state = stack.top(); stack.pop(); if (closure_set.insert(current_state).second) { - for (NFAStateType* const dest_state : current_state->get_epsilon_transitions()) { + for (auto const* dest_state : current_state->get_epsilon_transitions()) { stack.push(dest_state); } From 332af359413a4084e5b9d38eac7f5fcbe32df6b3 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 08:45:40 -0400 Subject: [PATCH 143/323] Initialize m_dest_state to nullptr. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 672512d2..79903ecb 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -80,7 +80,7 @@ class NegativeTaggedTransition { private: std::set m_tags; - NFAStateType const* m_dest_state; + NFAStateType const* m_dest_state{nullptr}; }; template From 748e79486f034dce85d678304879a28e3ed16453 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 09:04:28 -0400 Subject: [PATCH 144/323] Change negative_tagged_transition to negative_tagged_transition_string. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 79903ecb..8df857bf 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -329,14 +329,14 @@ auto RegexNFAState::serialize( positive_tagged_transitions.emplace_back(optional_serialized_positive_transition.value()); } - std::string negative_tagged_transition; + std::string negative_tagged_transition_string; if (nullptr != m_negative_tagged_transition.get_dest_state()) { auto const optional_serialized_negative_transition = m_negative_tagged_transition.serialize(state_ids); if (false == optional_serialized_negative_transition.has_value()) { return std::nullopt; } - negative_tagged_transition = optional_serialized_negative_transition.value(); + negative_tagged_transition_string = optional_serialized_negative_transition.value(); } auto const accepting_tag_string @@ -350,7 +350,7 @@ auto RegexNFAState::serialize( fmt::join(byte_transitions, ","), fmt::join(epsilon_transitions, ","), fmt::join(positive_tagged_transitions, ","), - negative_tagged_transition + negative_tagged_transition_string ); } From 38dc22b722d1e2487fa6e3967e3483d7424a1d0c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 09:08:18 -0400 Subject: [PATCH 145/323] Change negative tag transitions to singular. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 8df857bf..fcf3a962 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -197,7 +197,7 @@ class RegexNFA { ) -> NFAStateType*; /** - * Creates a unique_ptr for an NFA state with negative tagged transitions and adds it to + * Creates a unique_ptr for an NFA state with negative tagged transition and adds it to * `m_states`. * @param tags * @param dest_state From 5a30ed875b8c0b87c5a5294e50cff1e408942b7d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 09:12:31 -0400 Subject: [PATCH 146/323] Switch transitions to singular where applicable. --- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index fe891a1c..ee0cc7e3 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -98,7 +98,7 @@ class RegexAST { } /** - * Handles the addition of an intermediate state with negative transitions if needed. + * Handles the addition of an intermediate state with a negative transition if needed. * @param nfa * @param end_state */ diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index fcf3a962..e5d6792a 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -46,7 +46,7 @@ class PositiveTaggedTransition { /** * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the positive tagged transitions on success. 
+ * @return A string representation of the positive tagged transition on success. * @return std::nullopt if `m_dest_state` is not in `state_ids`. */ [[nodiscard]] auto serialize(std::unordered_map const& state_ids @@ -72,7 +72,7 @@ class NegativeTaggedTransition { /** * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the negative tagged transitions on success. + * @return A string representation of the negative tagged transition on success. * @return std::nullopt if `m_dest_state` is not in `state_ids`. */ [[nodiscard]] auto serialize(std::unordered_map const& state_ids From c8bf9e6b8cd0226c31d434cbd9320ebfccd553cd Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 09:39:45 -0400 Subject: [PATCH 147/323] Merge changes with previous PR manually. Still missing changes to previously untouched files. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 38 ++++++++-------- .../finite_automata/RegexNFAState.hpp | 37 ++++++++-------- .../finite_automata/TaggedTransition.hpp | 44 +++++++++---------- 3 files changed, 58 insertions(+), 61 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 61090c09..3f286e2a 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -44,13 +44,13 @@ class RegexNFA { ) -> NFAStateType*; /** - * Creates a unique_ptr for an NFA state with negative tagged transitions and adds it to + * Creates a unique_ptr for an NFA state with negative tagged transition and adds it to * `m_states`. 
* @param tags * @param dest_state * @return NFAStateType* */ - [[nodiscard]] auto new_state_with_negative_tagged_transitions( + [[nodiscard]] auto new_state_with_negative_tagged_transition( std::set tags, NFAStateType const* dest_state ) -> NFAStateType*; @@ -59,7 +59,7 @@ class RegexNFA { * @return A vector representing the traversal order of the NFA states using breadth-first * search (BFS). */ - [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; + [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; /** * @return A string representation of the NFA. @@ -103,7 +103,7 @@ auto RegexNFA::new_state_with_positive_tagged_transition( } template -auto RegexNFA::new_state_with_negative_tagged_transitions( +auto RegexNFA::new_state_with_negative_tagged_transition( std::set tags, NFAStateType const* dest_state ) -> NFAStateType* { @@ -112,26 +112,26 @@ auto RegexNFA::new_state_with_negative_tagged_transitions( } template -auto RegexNFA::get_bfs_traversal_order( -) const -> std::vector { - std::queue state_queue; - std::unordered_set visited_states; - std::vector visited_order; +auto RegexNFA::get_bfs_traversal_order() const -> std::vector { + std::queue state_queue; + std::unordered_set visited_states; + std::vector visited_order; visited_states.reserve(m_states.size()); visited_order.reserve(m_states.size()); auto add_to_queue_and_visited - = [&state_queue, &visited_states](RegexNFAByteState const* dest_state) { - if (visited_states.insert(dest_state).second) { - state_queue.push(dest_state); - } - }; + = [&state_queue, &visited_states](NFAStateType const* dest_state) { + if (visited_states.insert(dest_state).second) { + state_queue.push(dest_state); + } + }; add_to_queue_and_visited(m_root); while (false == state_queue.empty()) { auto const* current_state = state_queue.front(); visited_order.push_back(current_state); state_queue.pop(); + // TODO: handle the utf8 case for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) { for (auto const* dest_state 
: current_state->get_byte_transitions(idx)) { add_to_queue_and_visited(dest_state); @@ -145,10 +145,10 @@ auto RegexNFA::get_bfs_traversal_order( { add_to_queue_and_visited(positive_tagged_transition.get_dest_state()); } - for (auto const& negative_tagged_transition : - current_state->get_negative_tagged_transitions()) - { - add_to_queue_and_visited(negative_tagged_transition.get_dest_state()); + auto const* negative_dest_state + = current_state->get_negative_tagged_transition().get_dest_state(); + if (nullptr != negative_dest_state) { + add_to_queue_and_visited(negative_dest_state); } } return visited_order; @@ -158,7 +158,7 @@ template auto RegexNFA::serialize() const -> std::string { auto const traversal_order = get_bfs_traversal_order(); - std::unordered_map state_ids; + std::unordered_map state_ids; for (auto const* state : traversal_order) { state_ids.emplace(state, state_ids.size()); } diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 0ea09fd7..efc97eaa 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -29,7 +29,7 @@ class RegexNFAState { : m_positive_tagged_transitions{{tag, dest_state}} {} RegexNFAState(std::set tags, RegexNFAState const* dest_state) - : m_negative_tagged_transitions{{std::move(tags), dest_state}} {} + : m_negative_tagged_transition{std::move(tags), dest_state} {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } @@ -44,13 +44,13 @@ class RegexNFAState { } [[nodiscard]] auto get_positive_tagged_transitions( - ) const -> std::vector> const& { + ) const -> std::vector> const& { return m_positive_tagged_transitions; } - [[nodiscard]] auto get_negative_tagged_transitions( - ) const -> std::vector> const& { - return m_negative_tagged_transitions; + [[nodiscard]] auto get_negative_tagged_transition( + ) const -> NegativeTaggedTransition const& { + return m_negative_tagged_transition; 
} auto add_epsilon_transition(RegexNFAState* epsilon_transition) -> void { @@ -83,18 +83,19 @@ class RegexNFAState { /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the NFA state on success. - * @return Forwards `PositiveTaggedTransition::serialize`'s or - * `NegativeTaggedTransition::serialize`'s return value (std::nullopt) on failure. + * @return Forwards `PositiveTaggedTransition::serialize`'s return value (std::nullopt) on + * failure. + * @return Forwards `NegativeTaggedTransition::serialize`'s return value (std::nullopt) on + * failure. */ - [[nodiscard]] auto serialize( - std::unordered_map const& state_ids + [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional; private: bool m_accepting{false}; uint32_t m_matching_variable_id{0}; - std::vector> m_positive_tagged_transitions; - std::vector> m_negative_tagged_transitions; + std::vector> m_positive_tagged_transitions; + NegativeTaggedTransition m_negative_tagged_transition; std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; // NOTE: We don't need m_tree_transitions for the `stateType == @@ -153,7 +154,7 @@ void RegexNFAState::add_interval(Interval interval, RegexNFAState* d template auto RegexNFAState::serialize( - std::unordered_map const& state_ids + std::unordered_map const& state_ids ) const -> std::optional { std::vector byte_transitions; for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) { @@ -179,14 +180,14 @@ auto RegexNFAState::serialize( positive_tagged_transitions.emplace_back(optional_serialized_positive_transition.value()); } - std::vector negative_tagged_transitions; - for (auto const& negative_tagged_transition : m_negative_tagged_transitions) { + std::string negative_tagged_transition_string; + if (nullptr != m_negative_tagged_transition.get_dest_state()) { auto const optional_serialized_negative_transition - = negative_tagged_transition.serialize(state_ids); + = 
m_negative_tagged_transition.serialize(state_ids); if (false == optional_serialized_negative_transition.has_value()) { return std::nullopt; } - negative_tagged_transitions.emplace_back(optional_serialized_negative_transition.value()); + negative_tagged_transition_string = optional_serialized_negative_transition.value(); } auto const accepting_tag_string @@ -194,13 +195,13 @@ auto RegexNFAState::serialize( return fmt::format( "{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_transitions={{" - "{}}},negative_tagged_transitions={{{}}}", + "{}}},negative_tagged_transition={{{}}}", state_ids.at(this), accepting_tag_string, fmt::join(byte_transitions, ","), fmt::join(epsilon_transitions, ","), fmt::join(positive_tagged_transitions, ","), - fmt::join(negative_tagged_transitions, ",") + negative_tagged_transition_string ); } } // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 21282986..21cff924 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -12,63 +12,59 @@ #include namespace log_surgeon::finite_automata { -template +template class PositiveTaggedTransition { public: - PositiveTaggedTransition(uint32_t const tag, RegexNFAState const* dest_state) + PositiveTaggedTransition(uint32_t const tag, NFAStateType const* dest_state) : m_tag{tag}, m_dest_state{dest_state} {} [[nodiscard]] auto get_tag() const -> uint32_t { return m_tag; } - [[nodiscard]] auto get_dest_state() const -> RegexNFAState const* { - return m_dest_state; - } + [[nodiscard]] auto get_dest_state() const -> NFAStateType const* { return m_dest_state; } /** * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the positive tagged transitions on success. + * @return A string representation of the positive tagged transition on success. 
* @return std::nullopt if `m_dest_state` is not in `state_ids`. */ - [[nodiscard]] auto serialize( - std::unordered_map const& state_ids + [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional; private: uint32_t m_tag; - RegexNFAState const* m_dest_state; + NFAStateType const* m_dest_state; }; -template +template class NegativeTaggedTransition { public: - NegativeTaggedTransition(std::set tags, RegexNFAState const* dest_state) + NegativeTaggedTransition() = default; + + NegativeTaggedTransition(std::set tags, NFAStateType const* dest_state) : m_tags{std::move(tags)}, m_dest_state{dest_state} {} [[nodiscard]] auto get_tags() const -> std::set const& { return m_tags; } - [[nodiscard]] auto get_dest_state() const -> RegexNFAState const* { - return m_dest_state; - } + [[nodiscard]] auto get_dest_state() const -> NFAStateType const* { return m_dest_state; } /** * @param state_ids A map of states to their unique identifiers. - * @return A string representation of the negative tagged transitions on success. + * @return A string representation of the negative tagged transition on success. * @return std::nullopt if `m_dest_state` is not in `state_ids`. 
*/ - [[nodiscard]] auto serialize( - std::unordered_map const& state_ids + [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional; private: std::set m_tags; - RegexNFAState const* m_dest_state; + NFAStateType const* m_dest_state{nullptr}; }; -template -auto PositiveTaggedTransition::serialize( - std::unordered_map const& state_ids +template +auto PositiveTaggedTransition::serialize( + std::unordered_map const& state_ids ) const -> std::optional { auto const state_id_it = state_ids.find(m_dest_state); if (state_id_it == state_ids.end()) { @@ -77,9 +73,9 @@ auto PositiveTaggedTransition::serialize( return fmt::format("{}[{}]", state_id_it->second, m_tag); } -template -auto NegativeTaggedTransition::serialize( - std::unordered_map const& state_ids +template +auto NegativeTaggedTransition::serialize( + std::unordered_map const& state_ids ) const -> std::optional { auto const state_id_it = state_ids.find(m_dest_state); if (state_id_it == state_ids.end()) { From 90edf77c24852c2368160b783f789761c8d6239e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 09:40:51 -0400 Subject: [PATCH 148/323] Auto linter. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 3f286e2a..d47c494c 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -121,10 +121,10 @@ auto RegexNFA::get_bfs_traversal_order() const -> std::vector Date: Thu, 31 Oct 2024 10:28:02 -0400 Subject: [PATCH 149/323] Modify expected output where ordering of negative tags is ambiguous. Should fix this so the ordering is deterministic.
--- tests/test-NFA.cpp | 2 +- tests/test-lexer.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 434c2787..0c379028 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -59,7 +59,7 @@ TEST_CASE("Test NFA", "[NFA]") { += "2:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transition={5[containerID,letter2,letter,letter1]}\n"; + "negative_tagged_transition={5[containerID,letter1,letter2,letter]}\n"; expected_serialized_nfa += "3:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={6[letter1]}," diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index c6129ec9..e369020f 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -152,7 +152,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { "?\\d+" ")C" ")", - U"(Z<~letter1><~letter><~containerID><~letter2>)|(" + U"(Z<~letter2><~containerID><~letter1><~letter>)|(" "A(" "(((a)|(b))<~letter2>)|" "(((c)|(d))<~letter1>)" From d90b731ba162b99dbaa07016c34fd7599e153dd9 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 11:11:24 -0400 Subject: [PATCH 150/323] Add a description for how to use the tag. --- src/log_surgeon/finite_automata/Tag.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp index 927b5062..36e7c3d1 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -7,6 +7,14 @@ #include namespace log_surgeon::finite_automata { +/** + * This class represents a tag that is associated with matches of a capture group. If `m_starts` is + * empty, it indicates that the capture group was unmatched. + * + * Since capture group regex can be contained within repetition regex, + * (e.g., "((user_id=(?\d+),)+"), `m_starts` and `m_ends` are vectors that track the locations + * of each occurrence of the capture group. 
+ */ class Tag { public: explicit Tag(std::string name) : m_name{std::move(name)} {} From 3f1f8ff0449bf9f7d85f9ee6cecc82451e0a5066 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 12:32:08 -0400 Subject: [PATCH 151/323] Add start and end positive transitions. --- src/log_surgeon/Lexer.tpp | 11 +++-- src/log_surgeon/finite_automata/RegexNFA.hpp | 11 +++-- .../finite_automata/RegexNFAState.hpp | 44 ++++++++++++++----- src/log_surgeon/finite_automata/Tag.hpp | 12 +++++ .../finite_automata/TaggedTransition.hpp | 1 - tests/test-NFA.cpp | 41 +++++++++++------ tests/test-lexer.cpp | 10 ++--- 7 files changed, 93 insertions(+), 37 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 8ae8e1f0..d9f77906 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -402,10 +402,15 @@ auto Lexer::epsilon_closure(NFAStateType const* stat } // TODO: currently treat tagged transitions as epsilon transitions - for (auto const& positive_tagged_transition : - current_state->get_positive_tagged_transitions()) + for (auto const& positive_tagged_start_transition : + current_state->get_positive_tagged_start_transitions()) { - stack.push(positive_tagged_transition.get_dest_state()); + stack.push(positive_tagged_start_transition.get_dest_state()); + } + for (auto const& positive_tagged_end_transition : + current_state->get_positive_tagged_start_transitions()) + { + stack.push(positive_tagged_end_transition.get_dest_state()); } auto const* negative_dest_state = current_state->get_negative_tagged_transition().get_dest_state(); diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 008478a5..da672911 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -144,10 +144,15 @@ auto RegexNFA::get_bfs_traversal_order() const -> std::vectorget_epsilon_transitions()) { add_to_queue_and_visited(dest_state); } - for (auto const& 
positive_tagged_transition : - current_state->get_positive_tagged_transitions()) + for (auto const& positive_tagged_start_transition : + current_state->get_positive_tagged_start_transitions()) { - add_to_queue_and_visited(positive_tagged_transition.get_dest_state()); + add_to_queue_and_visited(positive_tagged_start_transition.get_dest_state()); + } + for (auto const& positive_tagged_end_transition : + current_state->get_positive_tagged_end_transitions()) + { + add_to_queue_and_visited(positive_tagged_end_transition.get_dest_state()); } auto const* negative_dest_state = current_state->get_negative_tagged_transition().get_dest_state(); diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index aaf36ef3..baa992c1 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -26,7 +26,7 @@ class RegexNFAState { RegexNFAState() = default; RegexNFAState(Tag const* tag, RegexNFAState const* dest_state) - : m_positive_tagged_transitions{{tag, dest_state}} {} + : m_positive_tagged_end_transitions{{tag, dest_state}} {} RegexNFAState(std::set tags, RegexNFAState const* dest_state) : m_negative_tagged_transition{std::move(tags), dest_state} {} @@ -43,9 +43,14 @@ class RegexNFAState { return m_matching_variable_id; } - [[nodiscard]] auto get_positive_tagged_transitions( + [[nodiscard]] auto get_positive_tagged_start_transitions( ) const -> std::vector> const& { - return m_positive_tagged_transitions; + return m_positive_tagged_start_transitions; + } + + [[nodiscard]] auto get_positive_tagged_end_transitions( + ) const -> std::vector> const& { + return m_positive_tagged_end_transitions; } [[nodiscard]] auto get_negative_tagged_transition( @@ -94,7 +99,8 @@ class RegexNFAState { private: bool m_accepting{false}; uint32_t m_matching_variable_id{0}; - std::vector> m_positive_tagged_transitions; + std::vector> m_positive_tagged_start_transitions; + std::vector> 
m_positive_tagged_end_transitions; NegativeTaggedTransition m_negative_tagged_transition; std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; @@ -170,14 +176,28 @@ auto RegexNFAState::serialize( epsilon_transitions.emplace_back(std::to_string(state_ids.at(dest_state))); } - std::vector positive_tagged_transitions; - for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { + std::vector positive_tagged_start_transition_strings; + for (auto const& positive_tagged_start_transition : m_positive_tagged_start_transitions) { + auto const optional_serialized_positive_transition + = positive_tagged_start_transition.serialize(state_ids); + if (false == optional_serialized_positive_transition.has_value()) { + return std::nullopt; + } + positive_tagged_start_transition_strings.emplace_back( + optional_serialized_positive_transition.value() + ); + } + + std::vector positive_tagged_end_transition_strings; + for (auto const& positive_tagged_end_transition : m_positive_tagged_end_transitions) { auto const optional_serialized_positive_transition - = positive_tagged_transition.serialize(state_ids); + = positive_tagged_end_transition.serialize(state_ids); if (false == optional_serialized_positive_transition.has_value()) { return std::nullopt; } - positive_tagged_transitions.emplace_back(optional_serialized_positive_transition.value()); + positive_tagged_end_transition_strings.emplace_back( + optional_serialized_positive_transition.value() + ); } std::string negative_tagged_transition_string; @@ -194,13 +214,15 @@ auto RegexNFAState::serialize( = m_accepting ? 
fmt::format("accepting_tag={},", m_matching_variable_id) : ""; return fmt::format( - "{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_transitions={{" - "{}}},negative_tagged_transition={{{}}}", + "{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_start_" + "transitions={{{}}},positive_tagged_end_transitions={{{}}},negative_tagged_transition={" + "{{}}}", state_ids.at(this), accepting_tag_string, fmt::join(byte_transitions, ","), fmt::join(epsilon_transitions, ","), - fmt::join(positive_tagged_transitions, ","), + fmt::join(positive_tagged_start_transition_strings, ","), + fmt::join(positive_tagged_end_transition_strings, ","), negative_tagged_transition_string ); } diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp index 36e7c3d1..fa04dd44 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -19,6 +19,18 @@ class Tag { public: explicit Tag(std::string name) : m_name{std::move(name)} {} + auto add_start(uint32_t start) -> void { + m_starts.push_back(start); + } + + auto add_end(uint32_t end) -> void { + m_ends.push_back(end); + } + + auto set_unmatched() -> void { + m_starts.clear(); + } + [[nodiscard]] auto get_name() const -> std::string const& { return m_name; } private: diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 8b050190..8b3053c2 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -9,7 +9,6 @@ #include -#include #include namespace log_surgeon::finite_automata { diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 0c379028..aecbb535 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -49,58 +49,71 @@ TEST_CASE("Test NFA", "[NFA]") { // Compare against expected output string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2}," 
"epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "1:byte_transitions={a-->3,b-->3,c-->4,d-->4}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "2:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={}," - "negative_tagged_transition={5[containerID,letter1,letter2,letter]}\n"; + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," + "negative_tagged_transition={5[containerID,letter,letter1,letter2]}\n"; expected_serialized_nfa += "3:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={6[letter1]}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={6[letter1]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "4:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={7[letter2]}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={7[letter2]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "5:accepting_tag=0,byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "6:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={8[letter2]}\n"; expected_serialized_nfa += "7:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," 
"negative_tagged_transition={8[letter1]}\n"; expected_serialized_nfa += "8:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={9[letter]}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={9[letter]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "9:byte_transitions={B-->10}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "10:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" "11,7-->11,8-->11,9-->11}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "11:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" "11,7-->11,8-->11,9-->11}," "epsilon_transitions={}," - "positive_tagged_transitions={12[containerID]}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={12[containerID]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "12:byte_transitions={C-->5}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; // Compare expected and actual line-by-line diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index e369020f..80dd7906 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -152,7 +152,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { "?\\d+" ")C" ")", - U"(Z<~letter2><~containerID><~letter1><~letter>)|(" + U"(Z<~letter2><~containerID><~letter><~letter1>)|(" "A(" "(((a)|(b))<~letter2>)|" "(((c)|(d))<~letter1>)" @@ -196,16 +196,16 @@ TEST_CASE("Test the Schema class", "[Schema]") { "){0,10}" ")", U"(" - U"(<~letterA><~letterB>)|((" + U"(<~letterB><~letterA>)|((" 
U"((a)<~letterB>)|" U"((b)<~letterA>)" U"){1,inf})" - U"<~letterD><~letterC>)|(" - U"(<~letterD><~letterC>)|((" + U"<~letterC><~letterD>)|(" + U"(<~letterC><~letterD>)|((" U"((c)<~letterD>)|" U"((d)<~letterC>)" U"){1,10})" - U"<~letterA><~letterB>)" + U"<~letterB><~letterA>)" // clang-format on ); } From 2bd5d2cf08489e12a2a2488e63320d9c2f8143b3 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 13:11:15 -0400 Subject: [PATCH 152/323] Add functionality to tags to use it for tracking capture positions; Rename tag members to make it clear they are positions. --- src/log_surgeon/finite_automata/RegexAST.hpp | 12 ++++----- src/log_surgeon/finite_automata/RegexNFA.hpp | 8 +++--- .../finite_automata/RegexNFAState.hpp | 4 +-- src/log_surgeon/finite_automata/Tag.hpp | 24 +++++++---------- .../finite_automata/TaggedTransition.hpp | 26 ++++++++++++------- 5 files changed, 38 insertions(+), 36 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 0481081f..1a44de41 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -82,19 +82,19 @@ class RegexAST { */ [[nodiscard]] virtual auto serialize() const -> std::u32string = 0; - [[nodiscard]] auto get_subtree_positive_tags() const -> std::set const& { + [[nodiscard]] auto get_subtree_positive_tags() const -> std::set const& { return m_subtree_positive_tags; } - auto set_subtree_positive_tags(std::set subtree_positive_tags) -> void { + auto set_subtree_positive_tags(std::set subtree_positive_tags) -> void { m_subtree_positive_tags = std::move(subtree_positive_tags); } - auto add_subtree_positive_tags(std::set subtree_positive_tags) -> void { + auto add_subtree_positive_tags(std::set subtree_positive_tags) -> void { m_subtree_positive_tags.merge(subtree_positive_tags); } - auto set_negative_tags(std::set negative_tags) -> void { + auto set_negative_tags(std::set negative_tags) -> void { 
m_negative_tags = std::move(negative_tags); } @@ -142,8 +142,8 @@ class RegexAST { } private: - std::set m_subtree_positive_tags; - std::set m_negative_tags; + std::set m_subtree_positive_tags; + std::set m_negative_tags; }; /** diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index da672911..ec3ca9da 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -39,7 +39,7 @@ class RegexNFA { * @return NFAStateType* */ [[nodiscard]] auto new_state_with_positive_tagged_transition( - Tag const* tag, + Tag* tag, NFAStateType const* dest_state ) -> NFAStateType*; @@ -51,7 +51,7 @@ class RegexNFA { * @return NFAStateType* */ [[nodiscard]] auto new_state_with_negative_tagged_transition( - std::set tags, + std::set tags, NFAStateType const* dest_state ) -> NFAStateType*; @@ -99,7 +99,7 @@ auto RegexNFA::new_state() -> NFAStateType* { template auto RegexNFA::new_state_with_positive_tagged_transition( - Tag const* tag, + Tag* tag, NFAStateType const* dest_state ) -> NFAStateType* { m_states.emplace_back(std::make_unique(tag, dest_state)); @@ -108,7 +108,7 @@ auto RegexNFA::new_state_with_positive_tagged_transition( template auto RegexNFA::new_state_with_negative_tagged_transition( - std::set tags, + std::set tags, NFAStateType const* dest_state ) -> NFAStateType* { m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index baa992c1..c72e791c 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -25,10 +25,10 @@ class RegexNFAState { RegexNFAState() = default; - RegexNFAState(Tag const* tag, RegexNFAState const* dest_state) + RegexNFAState(Tag* tag, RegexNFAState const* dest_state) : m_positive_tagged_end_transitions{{tag, dest_state}} {} - RegexNFAState(std::set tags, 
RegexNFAState const* dest_state) + RegexNFAState(std::set tags, RegexNFAState const* dest_state) : m_negative_tagged_transition{std::move(tags), dest_state} {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp index fa04dd44..1c91e61c 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -8,35 +8,29 @@ namespace log_surgeon::finite_automata { /** - * This class represents a tag that is associated with matches of a capture group. If `m_starts` is - * empty, it indicates that the capture group was unmatched. + * This class represents a tag that is associated with matches of a capture group. If + * `m_start_positions` is empty, it indicates that the capture group was unmatched. * * Since capture group regex can be contained within repetition regex, - * (e.g., "((user_id=(?\d+),)+"), `m_starts` and `m_ends` are vectors that track the locations - * of each occurrence of the capture group. + * (e.g., "((user_id=(?\d+),)+"), `m_start_positions` and `m_end_positions` are vectors that + * track the locations of each occurrence of the capture group. 
*/ class Tag { public: explicit Tag(std::string name) : m_name{std::move(name)} {} - auto add_start(uint32_t start) -> void { - m_starts.push_back(start); - } + auto add_start_pos(uint32_t start_pos) -> void { m_start_positions.push_back(start_pos); } - auto add_end(uint32_t end) -> void { - m_ends.push_back(end); - } + auto add_end_pos(uint32_t end_pos) -> void { m_end_positions.push_back(end_pos); } - auto set_unmatched() -> void { - m_starts.clear(); - } + auto set_unmatched() -> void { m_start_positions.clear(); } [[nodiscard]] auto get_name() const -> std::string const& { return m_name; } private: std::string const m_name; - std::vector m_starts; - std::vector m_ends; + std::vector m_start_positions; + std::vector m_end_positions; }; } // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 8b3053c2..dd77627a 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -15,12 +15,16 @@ namespace log_surgeon::finite_automata { template class PositiveTaggedTransition { public: - PositiveTaggedTransition(Tag const* tag, NFAStateType const* dest_state) + PositiveTaggedTransition(Tag* tag, NFAStateType const* dest_state) : m_tag{tag}, m_dest_state{dest_state} {} [[nodiscard]] auto get_dest_state() const -> NFAStateType const* { return m_dest_state; } + auto add_tag_start_pos(uint32_t start_pos) const -> void { m_tag->add_start_pos(start_pos); } + + auto add_tag_end_pos(uint32_t end_pos) const -> void { m_tag->add_end_pos(end_pos); } + /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the positive tagged transition on success. 
@@ -30,7 +34,7 @@ class PositiveTaggedTransition { ) const -> std::optional; private: - Tag const* m_tag; + Tag* m_tag; NFAStateType const* m_dest_state; }; @@ -39,12 +43,18 @@ class NegativeTaggedTransition { public: NegativeTaggedTransition() = default; - NegativeTaggedTransition(std::set tags, NFAStateType const* dest_state) + NegativeTaggedTransition(std::set tags, NFAStateType const* dest_state) : m_tags{std::move(tags)}, m_dest_state{dest_state} {} [[nodiscard]] auto get_dest_state() const -> NFAStateType const* { return m_dest_state; } + auto negate_tag() const -> void { + for (auto* tag : m_tags) { + tag->set_unmatched(); + } + } + /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the negative tagged transition on success. @@ -54,7 +64,7 @@ class NegativeTaggedTransition { ) const -> std::optional; private: - std::set const m_tags; + std::set const m_tags; NFAStateType const* m_dest_state{nullptr}; }; @@ -78,11 +88,9 @@ auto NegativeTaggedTransition::serialize( return std::nullopt; } - auto const tag_names - = m_tags - | std::ranges::views::transform([](Tag const* tag) { - return tag->get_name(); - }); + auto const tag_names = m_tags | std::ranges::views::transform([](Tag const* tag) { + return tag->get_name(); + }); return fmt::format("{}[{}]", state_id_it->second, fmt::join(tag_names, ",")); } } // namespace log_surgeon::finite_automata From 2d0157e190446280d4c1dc4cc3ab65eb361b062f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 13:19:21 -0400 Subject: [PATCH 153/323] Reduce indentation of epsilon closure by using continue. 
--- src/log_surgeon/Lexer.tpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 8ae8e1f0..fe2c7919 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -396,22 +396,23 @@ auto Lexer::epsilon_closure(NFAStateType const* stat while (!stack.empty()) { auto const* current_state = stack.top(); stack.pop(); - if (closure_set.insert(current_state).second) { - for (auto const* dest_state : current_state->get_epsilon_transitions()) { - stack.push(dest_state); - } + if (false == closure_set.insert(current_state).second) { + continue; + } + for (auto const* dest_state : current_state->get_epsilon_transitions()) { + stack.push(dest_state); + } - // TODO: currently treat tagged transitions as epsilon transitions - for (auto const& positive_tagged_transition : - current_state->get_positive_tagged_transitions()) - { - stack.push(positive_tagged_transition.get_dest_state()); - } - auto const* negative_dest_state - = current_state->get_negative_tagged_transition().get_dest_state(); - if (nullptr != negative_dest_state) { - stack.push(negative_dest_state); - } + // TODO: currently treat tagged transitions as epsilon transitions + for (auto const& positive_tagged_transition : + current_state->get_positive_tagged_transitions()) + { + stack.push(positive_tagged_transition.get_dest_state()); + } + auto const* negative_dest_state + = current_state->get_negative_tagged_transition().get_dest_state(); + if (nullptr != negative_dest_state) { + stack.push(negative_dest_state); } } return closure_set; From 1cabafd76eee22401d975908ca4d158ca609687e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 31 Oct 2024 14:35:02 -0400 Subject: [PATCH 154/323] Use optional for negative transitions in RegexNFAState. 
--- src/log_surgeon/Lexer.tpp | 8 +++--- src/log_surgeon/finite_automata/RegexNFA.hpp | 28 ++++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index fe2c7919..1a86ea97 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -409,10 +409,10 @@ auto Lexer::epsilon_closure(NFAStateType const* stat { stack.push(positive_tagged_transition.get_dest_state()); } - auto const* negative_dest_state - = current_state->get_negative_tagged_transition().get_dest_state(); - if (nullptr != negative_dest_state) { - stack.push(negative_dest_state); + auto const optional_negative_tagged_transition + = current_state->get_optional_negative_tagged_transition(); + if (optional_negative_tagged_transition.has_value()) { + stack.push(optional_negative_tagged_transition.value().get_dest_state()); } } return closure_set; diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index e5d6792a..1270712d 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -60,8 +60,6 @@ class PositiveTaggedTransition { template class NegativeTaggedTransition { public: - NegativeTaggedTransition() = default; - NegativeTaggedTransition(std::set tags, NFAStateType const* dest_state) : m_tags{std::move(tags)}, m_dest_state{dest_state} {} @@ -80,7 +78,7 @@ class NegativeTaggedTransition { private: std::set m_tags; - NFAStateType const* m_dest_state{nullptr}; + NFAStateType const* m_dest_state; }; template @@ -94,7 +92,9 @@ class RegexNFAState { : m_positive_tagged_transitions{{tag, dest_state}} {} RegexNFAState(std::set tags, RegexNFAState const* dest_state) - : m_negative_tagged_transition{std::move(tags), dest_state} {} + : m_optional_negative_tagged_transition{ + NegativeTaggedTransition{std::move(tags), dest_state} + } {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } @@ -113,9 
+113,9 @@ class RegexNFAState { return m_positive_tagged_transitions; } - [[nodiscard]] auto get_negative_tagged_transition( - ) const -> NegativeTaggedTransition const& { - return m_negative_tagged_transition; + [[nodiscard]] auto get_optional_negative_tagged_transition( + ) const -> std::optional> const& { + return m_optional_negative_tagged_transition; } auto add_epsilon_transition(RegexNFAState* epsilon_transition) -> void { @@ -160,7 +160,7 @@ class RegexNFAState { bool m_accepting{false}; uint32_t m_matching_variable_id{0}; std::vector> m_positive_tagged_transitions; - NegativeTaggedTransition m_negative_tagged_transition; + std::optional> m_optional_negative_tagged_transition; std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; // NOTE: We don't need m_tree_transitions for the `stateType == @@ -330,9 +330,9 @@ auto RegexNFAState::serialize( } std::string negative_tagged_transition_string; - if (nullptr != m_negative_tagged_transition.get_dest_state()) { + if (m_optional_negative_tagged_transition.has_value()) { auto const optional_serialized_negative_transition - = m_negative_tagged_transition.serialize(state_ids); + = m_optional_negative_tagged_transition.value().serialize(state_ids); if (false == optional_serialized_negative_transition.has_value()) { return std::nullopt; } @@ -420,10 +420,10 @@ auto RegexNFA::get_bfs_traversal_order() const -> std::vectorget_negative_tagged_transition().get_dest_state(); - if (nullptr != negative_dest_state) { - add_to_queue_and_visited(negative_dest_state); + auto const optional_negative_tagged_transition + = current_state->get_optional_negative_tagged_transition(); + if (optional_negative_tagged_transition.has_value()) { + add_to_queue_and_visited(optional_negative_tagged_transition.value().get_dest_state()); } } return visited_order; From dc2c637f94f66b1efe3917fe0966a1a8e041e411 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 1 Nov 2024 06:37:15 -0400 Subject: [PATCH 155/323] Add 
missing headers; Remove unused headers. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 1270712d..9dc5823f 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -8,7 +8,8 @@ #include #include #include -#include +#include +#include #include #include #include From 7c5cfc0f77022fc38add40692a651c0e65238507 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Fri, 1 Nov 2024 06:40:36 -0400 Subject: [PATCH 156/323] Assign optional_negative_tagged_transition to a reference. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/Lexer.tpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 1a86ea97..3cae1a64 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -409,7 +409,7 @@ auto Lexer::epsilon_closure(NFAStateType const* stat { stack.push(positive_tagged_transition.get_dest_state()); } - auto const optional_negative_tagged_transition + auto const& optional_negative_tagged_transition = current_state->get_optional_negative_tagged_transition(); if (optional_negative_tagged_transition.has_value()) { stack.push(optional_negative_tagged_transition.value().get_dest_state()); From 4e8d290398e331b65143855a4cf2378abc1a1ff0 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Fri, 1 Nov 2024 06:40:58 -0400 Subject: [PATCH 157/323] Assign optional_negative_tagged_transition to a reference again. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 9dc5823f..06a0bc54 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -421,7 +421,7 @@ auto RegexNFA::get_bfs_traversal_order() const -> std::vectorget_optional_negative_tagged_transition(); if (optional_negative_tagged_transition.has_value()) { add_to_queue_and_visited(optional_negative_tagged_transition.value().get_dest_state()); From fde9037021bbd45333b7e9e9f8c584ffb914bb8d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 1 Nov 2024 06:49:54 -0400 Subject: [PATCH 158/323] Add to Lexer.tpp. --- src/log_surgeon/Lexer.tpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 3cae1a64..26f8570b 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -2,6 +2,7 @@ #define LOG_SURGEON_LEXER_TPP #include +#include #include #include From e63637e62244d14742e0c89441a873eb5bf91a4c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 1 Nov 2024 06:51:38 -0400 Subject: [PATCH 159/323] Fix comment grammar. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 06a0bc54..eb6e67f6 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -198,7 +198,7 @@ class RegexNFA { ) -> NFAStateType*; /** - * Creates a unique_ptr for an NFA state with negative tagged transition and adds it to + * Creates a unique_ptr for an NFA state with a negative tagged transition and adds it to * `m_states`. 
* @param tags * @param dest_state From b8c8f77011d42736d7669b773c02c820dce73575 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 1 Nov 2024 10:09:25 -0400 Subject: [PATCH 160/323] Store negative tags in a vector instead of set so that the order is deterministic. This is fine as tags are unique across disjoint subtrees. --- src/log_surgeon/finite_automata/RegexAST.hpp | 25 +++++++++++-------- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 +-- .../finite_automata/RegexNFAState.hpp | 2 +- .../finite_automata/TaggedTransition.hpp | 4 +-- tests/test-NFA.cpp | 2 +- tests/test-lexer.cpp | 6 ++--- 6 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 1a44de41..2d58222c 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -82,19 +82,23 @@ class RegexAST { */ [[nodiscard]] virtual auto serialize() const -> std::u32string = 0; - [[nodiscard]] auto get_subtree_positive_tags() const -> std::set const& { + [[nodiscard]] auto get_subtree_positive_tags() const -> std::vector const& { return m_subtree_positive_tags; } - auto set_subtree_positive_tags(std::set subtree_positive_tags) -> void { + auto set_subtree_positive_tags(std::vector subtree_positive_tags) -> void { m_subtree_positive_tags = std::move(subtree_positive_tags); } - auto add_subtree_positive_tags(std::set subtree_positive_tags) -> void { - m_subtree_positive_tags.merge(subtree_positive_tags); + auto add_subtree_positive_tags(std::vector subtree_positive_tags) -> void { + m_subtree_positive_tags.insert( + m_subtree_positive_tags.end(), + std::make_move_iterator(subtree_positive_tags.begin()), + std::make_move_iterator(subtree_positive_tags.end()) + ); } - auto set_negative_tags(std::set negative_tags) -> void { + auto set_negative_tags(std::vector negative_tags) -> void { m_negative_tags = std::move(negative_tags); } @@ -128,10 +132,9 @@ class 
RegexAST { } auto const transformed_negative_tags - = m_negative_tags - | std::ranges::views::transform([](Tag const* tag) { - return fmt::format("<~{}>", tag->get_name()); - }); + = m_negative_tags | std::ranges::views::transform([](Tag const* tag) { + return fmt::format("<~{}>", tag->get_name()); + }); auto const negative_tags_string = fmt::format("{}", fmt::join(transformed_negative_tags, "")); @@ -142,8 +145,8 @@ class RegexAST { } private: - std::set m_subtree_positive_tags; - std::set m_negative_tags; + std::vector m_subtree_positive_tags; + std::vector m_negative_tags; }; /** diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index bf95c64f..c618d8dd 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -54,7 +54,7 @@ class RegexNFA { * @return NFAStateType* */ [[nodiscard]] auto new_state_with_negative_tagged_transition( - std::set tags, + std::vector tags, NFAStateType const* dest_state ) -> NFAStateType*; @@ -111,7 +111,7 @@ auto RegexNFA::new_state_with_positive_tagged_transition( template auto RegexNFA::new_state_with_negative_tagged_transition( - std::set tags, + std::vector tags, NFAStateType const* dest_state ) -> NFAStateType* { m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 8dc368c8..831bd072 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -28,7 +28,7 @@ class RegexNFAState { RegexNFAState(Tag* tag, RegexNFAState const* dest_state) : m_positive_tagged_end_transitions{{tag, dest_state}} {} - RegexNFAState(std::set tags, RegexNFAState const* dest_state) + RegexNFAState(std::vector tags, RegexNFAState const* dest_state) : m_optional_negative_tagged_transition{ NegativeTaggedTransition{std::move(tags), dest_state} } {} diff 
--git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 9e9d6a67..1c32bf70 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -41,7 +41,7 @@ class PositiveTaggedTransition { template class NegativeTaggedTransition { public: - NegativeTaggedTransition(std::set tags, NFAStateType const* dest_state) + NegativeTaggedTransition(std::vector tags, NFAStateType const* dest_state) : m_tags{std::move(tags)}, m_dest_state{dest_state} {} @@ -62,7 +62,7 @@ class NegativeTaggedTransition { ) const -> std::optional; private: - std::set const m_tags; + std::vector const m_tags; NFAStateType const* m_dest_state; }; diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index aecbb535..f92e7a37 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -62,7 +62,7 @@ TEST_CASE("Test NFA", "[NFA]") { "epsilon_transitions={}," "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," - "negative_tagged_transition={5[containerID,letter,letter1,letter2]}\n"; + "negative_tagged_transition={5[letter1,letter2,letter,containerID]}\n"; expected_serialized_nfa += "3:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 80dd7906..8906be8d 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -152,7 +152,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { "?\\d+" ")C" ")", - U"(Z<~letter2><~containerID><~letter><~letter1>)|(" + U"(Z<~letter1><~letter2><~letter><~containerID>)|(" "A(" "(((a)|(b))<~letter2>)|" "(((c)|(d))<~letter1>)" @@ -196,7 +196,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { "){0,10}" ")", U"(" - U"(<~letterB><~letterA>)|((" + U"(<~letterA><~letterB>)|((" U"((a)<~letterB>)|" U"((b)<~letterA>)" U"){1,inf})" @@ -205,7 +205,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { U"((c)<~letterD>)|" 
U"((d)<~letterC>)" U"){1,10})" - U"<~letterB><~letterA>)" + U"<~letterA><~letterB>)" // clang-format on ); } From ef9506193a92a535a244b31a9810507945b4d8d4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 1 Nov 2024 22:39:24 -0400 Subject: [PATCH 161/323] Sync with previous PR. --- src/log_surgeon/Lexer.tpp | 2 +- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- src/log_surgeon/finite_automata/RegexNFAState.hpp | 14 ++++++-------- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 26f8570b..c7dab9db 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -411,7 +411,7 @@ auto Lexer::epsilon_closure(NFAStateType const* stat stack.push(positive_tagged_transition.get_dest_state()); } auto const& optional_negative_tagged_transition - = current_state->get_optional_negative_tagged_transition(); + = current_state->get_negative_tagged_transition(); if (optional_negative_tagged_transition.has_value()) { stack.push(optional_negative_tagged_transition.value().get_dest_state()); } diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 6474dc06..54ac5bdc 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -149,7 +149,7 @@ auto RegexNFA::get_bfs_traversal_order() const -> std::vectorget_optional_negative_tagged_transition(); + = current_state->get_negative_tagged_transition(); if (optional_negative_tagged_transition.has_value()) { add_to_queue_and_visited(optional_negative_tagged_transition.value().get_dest_state()); } diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 0a116469..2b6cb624 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -29,9 +29,7 @@ class RegexNFAState { : m_positive_tagged_transitions{{tag, dest_state}} {} 
RegexNFAState(std::set tags, RegexNFAState const* dest_state) - : m_optional_negative_tagged_transition{ - NegativeTaggedTransition{std::move(tags), dest_state} - } {} + : m_negative_tagged_transition{NegativeTaggedTransition{std::move(tags), dest_state}} {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } @@ -50,9 +48,9 @@ class RegexNFAState { return m_positive_tagged_transitions; } - [[nodiscard]] auto get_optional_negative_tagged_transition( + [[nodiscard]] auto get_negative_tagged_transition( ) const -> std::optional> const& { - return m_optional_negative_tagged_transition; + return m_negative_tagged_transition; } auto add_epsilon_transition(RegexNFAState* epsilon_transition) -> void { @@ -97,7 +95,7 @@ class RegexNFAState { bool m_accepting{false}; uint32_t m_matching_variable_id{0}; std::vector> m_positive_tagged_transitions; - std::optional> m_optional_negative_tagged_transition; + std::optional> m_negative_tagged_transition; std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; // NOTE: We don't need m_tree_transitions for the `stateType == @@ -183,9 +181,9 @@ auto RegexNFAState::serialize( } std::string negative_tagged_transition_string; - if (m_optional_negative_tagged_transition.has_value()) { + if (m_negative_tagged_transition.has_value()) { auto const optional_serialized_negative_transition - = m_optional_negative_tagged_transition.value().serialize(state_ids); + = m_negative_tagged_transition.value().serialize(state_ids); if (false == optional_serialized_negative_transition.has_value()) { return std::nullopt; } From 7cc8c5250966ddb24c172e2c2999ac6869bad6c5 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 2 Nov 2024 04:37:44 -0400 Subject: [PATCH 162/323] Add start tags to NFA. 
--- src/log_surgeon/finite_automata/RegexAST.hpp | 7 +++++++ src/log_surgeon/finite_automata/RegexNFAState.hpp | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 2d58222c..745a0fd8 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -892,9 +892,16 @@ template template void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const { + NFAStateType* root = nfa->get_root(); + auto* capture_group_start_state = nfa->new_state(); + root->add_positive_tagged_start_transition(m_tag.get(), capture_group_start_state); + auto* state_with_positive_tagged_transition = nfa->new_state_with_positive_tagged_transition(m_tag.get(), end_state); + nfa->set_root(capture_group_start_state); m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, state_with_positive_tagged_transition); + + nfa->set_root(root); } template diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 99f71652..3a5f70f7 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -43,6 +43,10 @@ class RegexNFAState { return m_matching_variable_id; } + auto add_positive_tagged_start_transition(Tag* tag, RegexNFAState* dest_state) -> void { + m_positive_tagged_start_transitions.emplace_back(tag, dest_state); + } + [[nodiscard]] auto get_positive_tagged_start_transitions( ) const -> std::vector> const& { return m_positive_tagged_start_transitions; From b1a9300f70bc5500cfcc29207024c2067e909a45 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 2 Nov 2024 04:54:17 -0400 Subject: [PATCH 163/323] Update unit-test to handle start transitions. 
--- tests/test-NFA.cpp | 60 ++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index f92e7a37..6a92f4bb 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -52,9 +52,9 @@ TEST_CASE("Test NFA", "[NFA]") { "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "1:byte_transitions={a-->3,b-->3,c-->4,d-->4}," + expected_serialized_nfa += "1:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_start_transitions={}," + "positive_tagged_start_transitions={3[letter]}," "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; expected_serialized_nfa @@ -62,55 +62,75 @@ TEST_CASE("Test NFA", "[NFA]") { "epsilon_transitions={}," "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," - "negative_tagged_transition={5[letter1,letter2,letter,containerID]}\n"; + "negative_tagged_transition={4[letter1,letter2,letter,containerID]}\n"; expected_serialized_nfa += "3:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={6[letter1]}," + "positive_tagged_start_transitions={5[letter1],6[letter2]}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "4:byte_transitions={}," + expected_serialized_nfa += "4:accepting_tag=0,byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={7[letter2]}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "5:accepting_tag=0,byte_transitions={}," + expected_serialized_nfa += "5:byte_transitions={a-->7,b-->7}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa 
+= "6:byte_transitions={}," + expected_serialized_nfa += "6:byte_transitions={c-->8,d-->8}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," - "negative_tagged_transition={8[letter2]}\n"; + "negative_tagged_transition={}\n"; expected_serialized_nfa += "7:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={}," - "negative_tagged_transition={8[letter1]}\n"; + "positive_tagged_end_transitions={9[letter1]}," + "negative_tagged_transition={}\n"; expected_serialized_nfa += "8:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={9[letter]}," + "positive_tagged_end_transitions={10[letter2]}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "9:byte_transitions={B-->10}," + expected_serialized_nfa += "9:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," + "negative_tagged_transition={11[letter2]}\n"; + expected_serialized_nfa += "10:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," + "negative_tagged_transition={11[letter1]}\n"; + expected_serialized_nfa += "11:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={12[letter]}," + "negative_tagged_transition={}\n"; + expected_serialized_nfa += "12:byte_transitions={B-->13}," + "epsilon_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," + "negative_tagged_transition={}\n"; + expected_serialized_nfa += "13:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_start_transitions={14[containerID]}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += 
"10:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" - "11,7-->11,8-->11,9-->11}," + expected_serialized_nfa += "14:byte_transitions={0-->15,1-->15,2-->15,3-->15,4-->15,5-->15,6-->" + "15,7-->15,8-->15,9-->15}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "11:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" - "11,7-->11,8-->11,9-->11}," + expected_serialized_nfa += "15:byte_transitions={0-->15,1-->15,2-->15,3-->15,4-->15,5-->15,6-->" + "15,7-->15,8-->15,9-->15}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={12[containerID]}," + "positive_tagged_end_transitions={16[containerID]}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "12:byte_transitions={C-->5}," + expected_serialized_nfa += "16:byte_transitions={C-->4}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," From b451651c6854a56ccc4662e734de1204064e1fc5 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 6 Nov 2024 10:08:17 -0500 Subject: [PATCH 164/323] Move RegexNFAXState typedef into RegexNFAState.hpp --- src/log_surgeon/finite_automata/RegexNFAState.hpp | 3 +++ src/log_surgeon/finite_automata/RegexNFAStateType.hpp | 6 ------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 2b6cb624..4b8ce0a9 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -204,6 +204,9 @@ auto RegexNFAState::serialize( negative_tagged_transition_string ); } + +using RegexNFAByteState = RegexNFAState; +using RegexNFAUTF8State = RegexNFAState; } // namespace log_surgeon::finite_automata #endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE diff --git 
a/src/log_surgeon/finite_automata/RegexNFAStateType.hpp b/src/log_surgeon/finite_automata/RegexNFAStateType.hpp index e190e387..24ef2153 100644 --- a/src/log_surgeon/finite_automata/RegexNFAStateType.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAStateType.hpp @@ -8,12 +8,6 @@ enum class RegexNFAStateType : uint8_t { Byte, UTF8 }; - -template -class RegexNFAState; - -using RegexNFAByteState = RegexNFAState; -using RegexNFAUTF8State = RegexNFAState; } // namespace log_surgeon::finite_automata #endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE_TYPE From f71348bb504346c46586352d5657cfb3101fb2d0 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 6 Nov 2024 10:09:13 -0500 Subject: [PATCH 165/323] Switch void to auto -> void. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexNFAState.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 2b6cb624..2a03100d 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -106,7 +106,7 @@ class RegexNFAState { }; template -void RegexNFAState::add_interval(Interval interval, RegexNFAState* dest_state) { +auto RegexNFAState::add_interval(Interval interval, RegexNFAState* dest_state) -> void { if (interval.first < cSizeOfByte) { uint32_t const bound = std::min(interval.second, cSizeOfByte - 1); for (uint32_t i = interval.first; i <= bound; i++) { From 4576d7dd0def9712017c900f38e0b270b830d03a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 6 Nov 2024 10:14:36 -0500 Subject: [PATCH 166/323] Move short functions into the class definition; Move RegexNFAXState typedefs to the top of the file to fix compilation error. 
--- .../finite_automata/RegexNFAState.hpp | 9 +++-- .../finite_automata/TaggedTransition.hpp | 38 +++++++------------ 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 54d4f9df..f2a27898 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -18,6 +18,12 @@ #include namespace log_surgeon::finite_automata { +template +class RegexNFAState; + +using RegexNFAByteState = RegexNFAState; +using RegexNFAUTF8State = RegexNFAState; + template class RegexNFAState { public: @@ -204,9 +210,6 @@ auto RegexNFAState::serialize( negative_tagged_transition_string ); } - -using RegexNFAByteState = RegexNFAState; -using RegexNFAUTF8State = RegexNFAState; } // namespace log_surgeon::finite_automata #endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 29d9fd23..614841a7 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -29,7 +29,13 @@ class PositiveTaggedTransition { * @return std::nullopt if `m_dest_state` is not in `state_ids`. */ [[nodiscard]] auto serialize(std::unordered_map const& state_ids - ) const -> std::optional; + ) const -> std::optional { + auto const state_id_it = state_ids.find(m_dest_state); + if (state_id_it == state_ids.end()) { + return std::nullopt; + } + return fmt::format("{}[{}]", state_id_it->second, m_tag); + } private: uint32_t m_tag; @@ -53,34 +59,18 @@ class NegativeTaggedTransition { * @return std::nullopt if `m_dest_state` is not in `state_ids`. 
*/ [[nodiscard]] auto serialize(std::unordered_map const& state_ids - ) const -> std::optional; + ) const -> std::optional { + auto const state_id_it = state_ids.find(m_dest_state); + if (state_id_it == state_ids.end()) { + return std::nullopt; + } + return fmt::format("{}[{}]", state_id_it->second, fmt::join(m_tags, ",")); + } private: std::set m_tags; NFAStateType const* m_dest_state; }; - -template -auto PositiveTaggedTransition::serialize( - std::unordered_map const& state_ids -) const -> std::optional { - auto const state_id_it = state_ids.find(m_dest_state); - if (state_id_it == state_ids.end()) { - return std::nullopt; - } - return fmt::format("{}[{}]", state_id_it->second, m_tag); -} - -template -auto NegativeTaggedTransition::serialize( - std::unordered_map const& state_ids -) const -> std::optional { - auto const state_id_it = state_ids.find(m_dest_state); - if (state_id_it == state_ids.end()) { - return std::nullopt; - } - return fmt::format("{}[{}]", state_id_it->second, fmt::join(m_tags, ",")); -} } // namespace log_surgeon::finite_automata #endif // LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION From 5abe90625cd87d7dae6a1993e6b86f39724565a5 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 7 Nov 2024 02:43:36 -0500 Subject: [PATCH 167/323] Auto format. 
--- src/log_surgeon/finite_automata/TaggedTransition.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 26b24762..71336de2 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -62,11 +62,9 @@ class NegativeTaggedTransition { return std::nullopt; } - auto const tag_names - = m_tags - | std::ranges::views::transform([](Tag const* tag) { - return tag->get_name(); - }); + auto const tag_names = m_tags | std::ranges::views::transform([](Tag const* tag) { + return tag->get_name(); + }); return fmt::format("{}[{}]", state_id_it->second, fmt::join(tag_names, ",")); } From bb0bd2e54af48dd18117f4d771338fb89d82029b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 7 Nov 2024 02:49:12 -0500 Subject: [PATCH 168/323] Remove unused lambda; Auto format. --- src/log_surgeon/SchemaParser.cpp | 2 +- src/log_surgeon/finite_automata/RegexAST.hpp | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index 4558a064..56760262 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -622,7 +622,7 @@ void SchemaParser::add_productions() { add_production( "Literal", {"Lparen", "QuestionMark", "Langle", "Identifier", "Rangle", "Regex", "Rparen"}, - [this](NonTerminal* m) { return regex_capture_rule(m); } + regex_capture_rule ); add_production("Literal", {"Lparen", "Regex", "Rparen"}, regex_middle_identity_rule); for (auto const& [special_regex_char, special_regex_name] : m_special_regex_characters) { diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 0481081f..cebae88c 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -128,10 +128,9 @@ 
class RegexAST { } auto const transformed_negative_tags - = m_negative_tags - | std::ranges::views::transform([](Tag const* tag) { - return fmt::format("<~{}>", tag->get_name()); - }); + = m_negative_tags | std::ranges::views::transform([](Tag const* tag) { + return fmt::format("<~{}>", tag->get_name()); + }); auto const negative_tags_string = fmt::format("{}", fmt::join(transformed_negative_tags, "")); From a36bb90ce2d3e9d7e446ce971cd7079677674f91 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 7 Nov 2024 02:59:03 -0500 Subject: [PATCH 169/323] Add test case for Tag class. --- tests/CMakeLists.txt | 2 +- tests/test-tag.cpp | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 tests/test-tag.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e15ec233..d150252f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -21,7 +21,7 @@ set( ../src/log_surgeon/Token.hpp ) -set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp) +set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-tag.cpp) add_executable(unit-test ${SOURCES_LOG_SURGEON} ${SOURCES_TESTS}) target_link_libraries(unit-test PRIVATE Catch2::Catch2WithMain log_surgeon::log_surgeon) diff --git a/tests/test-tag.cpp b/tests/test-tag.cpp new file mode 100644 index 00000000..fdfff4c1 --- /dev/null +++ b/tests/test-tag.cpp @@ -0,0 +1,10 @@ +#include + +#include + +using log_surgeon::finite_automata::Tag; + +TEST_CASE("Test Tag class", "[Tag]") { + Tag const tag("uID"); + REQUIRE("uID" == tag.get_name()); +} From 59cc6cd2dd19064e4bf47c55d3843039b7f55c4c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 7 Nov 2024 03:07:00 -0500 Subject: [PATCH 170/323] Add nullptr checks. 
--- src/log_surgeon/finite_automata/TaggedTransition.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 71336de2..3efe7bc6 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -30,7 +30,7 @@ class PositiveTaggedTransition { [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional { auto const state_id_it = state_ids.find(m_dest_state); - if (state_id_it == state_ids.end()) { + if (state_id_it == state_ids.end() || nullptr == m_tag) { return std::nullopt; } return fmt::format("{}[{}]", state_id_it->second, m_tag->get_name()); @@ -62,9 +62,11 @@ class NegativeTaggedTransition { return std::nullopt; } - auto const tag_names = m_tags | std::ranges::views::transform([](Tag const* tag) { - return tag->get_name(); - }); + if (std::ranges::any_of(m_tags, [](Tag const* tag) { return tag == nullptr; })) { + return std::nullopt; + } + auto const tag_names = m_tags | std::ranges::views::transform(&Tag::get_name); + return fmt::format("{}[{}]", state_id_it->second, fmt::join(tag_names, ",")); } From 9fc41c09ff61022d819f0760fb5369a15ef4c3a0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 7 Nov 2024 03:48:38 -0500 Subject: [PATCH 171/323] Change Tag class functionality to reflect how registers will be used. 
--- src/log_surgeon/finite_automata/Tag.hpp | 10 ++++++---- .../finite_automata/TaggedTransition.hpp | 15 ++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp index 1c91e61c..e37fe33f 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -19,11 +19,13 @@ class Tag { public: explicit Tag(std::string name) : m_name{std::move(name)} {} - auto add_start_pos(uint32_t start_pos) -> void { m_start_positions.push_back(start_pos); } + auto set_start_positions(std::vector start_positions) -> void { + m_start_positions = std::move(start_positions); + } - auto add_end_pos(uint32_t end_pos) -> void { m_end_positions.push_back(end_pos); } - - auto set_unmatched() -> void { m_start_positions.clear(); } + auto set_end_positions(std::vector end_positions) -> void { + m_end_positions = std::move(end_positions); + } [[nodiscard]] auto get_name() const -> std::string const& { return m_name; } diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 9370e947..8a4c3c33 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -3,7 +3,6 @@ #include #include -#include #include #include @@ -21,9 +20,13 @@ class PositiveTaggedTransition { [[nodiscard]] auto get_dest_state() const -> NFAStateType const* { return m_dest_state; } - auto add_tag_start_pos(uint32_t start_pos) const -> void { m_tag->add_start_pos(start_pos); } + auto set_tag_start_positions(std::vector start_positions) const -> void { + m_tag->set_start_positions(std::move(start_positions)); + } - auto add_tag_end_pos(uint32_t end_pos) const -> void { m_tag->add_end_pos(end_pos); } + auto set_tag_end_positions(std::vector end_positions) const -> void { + m_tag->set_end_positions(std::move(end_positions)); + } /** * @param state_ids A map 
of states to their unique identifiers. @@ -53,12 +56,6 @@ class NegativeTaggedTransition { [[nodiscard]] auto get_dest_state() const -> NFAStateType const* { return m_dest_state; } - auto negate_tag() const -> void { - for (auto* tag : m_tags) { - tag->set_unmatched(); - } - } - /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the negative tagged transition on success. From 6e5c9684a7f13e9f84672d1902326dddc4260241 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 7 Nov 2024 08:56:45 -0500 Subject: [PATCH 172/323] Add register class. --- CMakeLists.txt | 1 + src/log_surgeon/finite_automata/Register.hpp | 35 ++++++++++++++++++++ tests/CMakeLists.txt | 1 + 3 files changed, 37 insertions(+) create mode 100644 src/log_surgeon/finite_automata/Register.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e76ecb8c..20326d33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,7 @@ set(SOURCE_FILES src/log_surgeon/finite_automata/RegexNFA.hpp src/log_surgeon/finite_automata/RegexNFAState.hpp src/log_surgeon/finite_automata/RegexNFAStateType.hpp + src/log_surgeon/finite_automata/Register.hpp src/log_surgeon/finite_automata/Tag.hpp src/log_surgeon/finite_automata/TaggedTransition.hpp src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp diff --git a/src/log_surgeon/finite_automata/Register.hpp b/src/log_surgeon/finite_automata/Register.hpp new file mode 100644 index 00000000..d0be4f15 --- /dev/null +++ b/src/log_surgeon/finite_automata/Register.hpp @@ -0,0 +1,35 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER +#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER + +#include + +#include + +namespace log_surgeon::finite_automata { +class Register { +public: + explicit Register(Tag* tag) : m_tag{tag} {} + + auto add_pos(uint32_t const pos) -> void { positions.push_back(pos); } + + auto update_last_position(uint32_t const pos) -> void { positions.back() = pos; } + + auto negate_last_position() -> void { 
positions.pop_back(); } + + auto negate_all_positions() -> void { positions.clear(); } + + [[nodiscard]] auto get_tag() const -> Tag* { return m_tag; } + + [[nodiscard]] auto get_last_position() const -> uint32_t { return positions.back(); } + + [[nodiscard]] auto get_all_positions() const -> std::vector const& { + return positions; + } + +private: + Tag* m_tag; + std::vector positions; +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d150252f..e911ff58 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -6,6 +6,7 @@ set( ../src/log_surgeon/finite_automata/RegexNFA.hpp ../src/log_surgeon/finite_automata/RegexNFAState.hpp ../src/log_surgeon/finite_automata/RegexNFAStateType.hpp + ../src/log_surgeon/finite_automata/Register.hpp ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp ../src/log_surgeon/LALR1Parser.cpp From d060bc60ad047c2768e83ad23fdb02b6958207d7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 11 Nov 2024 10:58:39 -0500 Subject: [PATCH 173/323] Temp fix for unit-test until future PR where Tag ptrs are stored in vector instead of set. 
--- tests/test-lexer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index e369020f..09b8bb0c 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -152,7 +152,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { "?\\d+" ")C" ")", - U"(Z<~letter2><~containerID><~letter1><~letter>)|(" + U"(Z<~containerID><~letter><~letter1><~letter2>)|(" "A(" "(((a)|(b))<~letter2>)|" "(((c)|(d))<~letter1>)" @@ -196,7 +196,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { "){0,10}" ")", U"(" - U"(<~letterA><~letterB>)|((" + U"(<~letterB><~letterA>)|((" U"((a)<~letterB>)|" U"((b)<~letterA>)" U"){1,inf})" @@ -205,7 +205,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { U"((c)<~letterD>)|" U"((d)<~letterC>)" U"){1,10})" - U"<~letterA><~letterB>)" + U"<~letterB><~letterA>)" // clang-format on ); } From f041a373392ac6f2f7233c104560e43f88eb97bd Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 11 Nov 2024 11:17:04 -0500 Subject: [PATCH 174/323] Swap from set to vector to tag pointers to ensure determinism. 
--- src/log_surgeon/finite_automata/RegexAST.hpp | 18 +++++++++++------- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- .../finite_automata/RegexNFAState.hpp | 2 +- .../finite_automata/TaggedTransition.hpp | 4 ++-- tests/test-NFA.cpp | 2 +- tests/test-lexer.cpp | 10 +++++----- 6 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index cebae88c..0a4431cf 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -82,19 +82,23 @@ class RegexAST { */ [[nodiscard]] virtual auto serialize() const -> std::u32string = 0; - [[nodiscard]] auto get_subtree_positive_tags() const -> std::set const& { + [[nodiscard]] auto get_subtree_positive_tags() const -> std::vector const& { return m_subtree_positive_tags; } - auto set_subtree_positive_tags(std::set subtree_positive_tags) -> void { + auto set_subtree_positive_tags(std::vector subtree_positive_tags) -> void { m_subtree_positive_tags = std::move(subtree_positive_tags); } - auto add_subtree_positive_tags(std::set subtree_positive_tags) -> void { - m_subtree_positive_tags.merge(subtree_positive_tags); + auto add_subtree_positive_tags(std::vector subtree_positive_tags) -> void { + m_subtree_positive_tags.insert( + m_subtree_positive_tags.end(), + subtree_positive_tags.begin(), + subtree_positive_tags.end() + ); } - auto set_negative_tags(std::set negative_tags) -> void { + auto set_negative_tags(std::vector negative_tags) -> void { m_negative_tags = std::move(negative_tags); } @@ -141,8 +145,8 @@ class RegexAST { } private: - std::set m_subtree_positive_tags; - std::set m_negative_tags; + std::vector m_subtree_positive_tags; + std::vector m_negative_tags; }; /** diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index edbbf43a..7919a0c6 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ 
b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -54,7 +54,7 @@ class RegexNFA { * @return NFAStateType* */ [[nodiscard]] auto new_state_with_negative_tagged_transition( - std::set tags, + std::vector tags, NFAStateType const* dest_state ) -> NFAStateType*; @@ -111,7 +111,7 @@ auto RegexNFA::new_state_with_positive_tagged_transition( template auto RegexNFA::new_state_with_negative_tagged_transition( - std::set tags, + std::vector tags, NFAStateType const* dest_state ) -> NFAStateType* { m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index f6d18d3c..dd21557b 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -34,7 +34,7 @@ class RegexNFAState { RegexNFAState(Tag const* tag, RegexNFAState const* dest_state) : m_positive_tagged_transitions{{tag, dest_state}} {} - RegexNFAState(std::set tags, RegexNFAState const* dest_state) + RegexNFAState(std::vector tags, RegexNFAState const* dest_state) : m_negative_tagged_transition{NegativeTaggedTransition{std::move(tags), dest_state}} {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 3efe7bc6..2c238275 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -44,7 +44,7 @@ class PositiveTaggedTransition { template class NegativeTaggedTransition { public: - NegativeTaggedTransition(std::set tags, NFAStateType const* dest_state) + NegativeTaggedTransition(std::vector tags, NFAStateType const* dest_state) : m_tags{std::move(tags)}, m_dest_state{dest_state} {} @@ -71,7 +71,7 @@ class NegativeTaggedTransition { } private: - std::set const m_tags; + std::vector const m_tags; NFAStateType const* m_dest_state; 
}; } // namespace log_surgeon::finite_automata diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 0c379028..c7a599b2 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -59,7 +59,7 @@ TEST_CASE("Test NFA", "[NFA]") { += "2:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={}," - "negative_tagged_transition={5[containerID,letter1,letter2,letter]}\n"; + "negative_tagged_transition={5[letter1,letter2,letter,containerID]}\n"; expected_serialized_nfa += "3:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_transitions={6[letter1]}," diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 09b8bb0c..8906be8d 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -152,7 +152,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { "?\\d+" ")C" ")", - U"(Z<~containerID><~letter><~letter1><~letter2>)|(" + U"(Z<~letter1><~letter2><~letter><~containerID>)|(" "A(" "(((a)|(b))<~letter2>)|" "(((c)|(d))<~letter1>)" @@ -196,16 +196,16 @@ TEST_CASE("Test the Schema class", "[Schema]") { "){0,10}" ")", U"(" - U"(<~letterB><~letterA>)|((" + U"(<~letterA><~letterB>)|((" U"((a)<~letterB>)|" U"((b)<~letterA>)" U"){1,inf})" - U"<~letterD><~letterC>)|(" - U"(<~letterD><~letterC>)|((" + U"<~letterC><~letterD>)|(" + U"(<~letterC><~letterD>)|((" U"((c)<~letterD>)|" U"((d)<~letterC>)" U"){1,10})" - U"<~letterB><~letterA>)" + U"<~letterA><~letterB>)" // clang-format on ); } From f72e1205cce4204926edb1b72bff5f273f69b005 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 11 Nov 2024 11:20:56 -0500 Subject: [PATCH 175/323] Better test coverage for tag class. 
--- tests/test-tag.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/test-tag.cpp b/tests/test-tag.cpp index fdfff4c1..7c5394fc 100644 --- a/tests/test-tag.cpp +++ b/tests/test-tag.cpp @@ -4,7 +4,19 @@ using log_surgeon::finite_automata::Tag; -TEST_CASE("Test Tag class", "[Tag]") { - Tag const tag("uID"); - REQUIRE("uID" == tag.get_name()); +TEST_CASE("Tag operations", "[Tag]") { + SECTION("Basic name retrieval works correctly") { + Tag const tag("uID"); + REQUIRE("uID" == tag.get_name()); + } + + SECTION("Empty tag name is handled correctly") { + Tag const empty_tag(""); + REQUIRE(empty_tag.get_name().empty()); + } + + SECTION("Special characters in tag names are preserved") { + Tag const special_tag("user.id-123_@"); + REQUIRE("user.id-123_@" == special_tag.get_name()); + } } From d5ac1adac2bb80b11dbb5ecd7c0ec90a55e8ae3c Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Tue, 12 Nov 2024 08:53:36 -0500 Subject: [PATCH 176/323] Use constant iterators for elements that should not change. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexAST.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 0a4431cf..906c4edf 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -93,8 +93,8 @@ class RegexAST { auto add_subtree_positive_tags(std::vector subtree_positive_tags) -> void { m_subtree_positive_tags.insert( m_subtree_positive_tags.end(), - subtree_positive_tags.begin(), - subtree_positive_tags.end() + subtree_positive_tags.cbegin(), + subtree_positive_tags.cend() ); } From 30f03ede7b13846fe57eed23c4233753005b7b6d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 12 Nov 2024 08:54:37 -0500 Subject: [PATCH 177/323] Use braced intiailization in test-tag.cpp. 
--- tests/test-tag.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-tag.cpp b/tests/test-tag.cpp index 7c5394fc..05a812cd 100644 --- a/tests/test-tag.cpp +++ b/tests/test-tag.cpp @@ -6,17 +6,17 @@ using log_surgeon::finite_automata::Tag; TEST_CASE("Tag operations", "[Tag]") { SECTION("Basic name retrieval works correctly") { - Tag const tag("uID"); + Tag const tag{"uID"}; REQUIRE("uID" == tag.get_name()); } SECTION("Empty tag name is handled correctly") { - Tag const empty_tag(""); + Tag const empty_tag{""}; REQUIRE(empty_tag.get_name().empty()); } SECTION("Special characters in tag names are preserved") { - Tag const special_tag("user.id-123_@"); + Tag const special_tag{"user.id-123_@"}; REQUIRE("user.id-123_@" == special_tag.get_name()); } } From d386fc053c8fad1142be343bb298d105aa4d0806 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Tue, 12 Nov 2024 08:58:50 -0500 Subject: [PATCH 178/323] Use const& for insertion function that can't use move semantics. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 906c4edf..0ee544eb 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -90,7 +90,7 @@ class RegexAST { m_subtree_positive_tags = std::move(subtree_positive_tags); } - auto add_subtree_positive_tags(std::vector subtree_positive_tags) -> void { + auto add_subtree_positive_tags(std::vector const& subtree_positive_tags) -> void { m_subtree_positive_tags.insert( m_subtree_positive_tags.end(), subtree_positive_tags.cbegin(), From 4024c3eec05f60782db9ac9085604d75a9c6565a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 13 Nov 2024 14:23:46 -0500 Subject: [PATCH 179/323] Have get_name() return string_view; Update headers. 
--- src/log_surgeon/finite_automata/RegexAST.hpp | 3 ++- src/log_surgeon/finite_automata/Tag.hpp | 3 ++- tests/test-lexer.cpp | 4 +--- tests/test-tag.cpp | 7 +++++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 0ee544eb..dd390cc9 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -699,7 +700,7 @@ class RegexASTCapture : public RegexAST { [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto get_group_name() const -> std::string const& { return m_tag->get_name(); } + [[nodiscard]] auto get_group_name() const -> std::string_view { return m_tag->get_name(); } [[nodiscard]] auto get_group_regex_ast( ) const -> std::unique_ptr> const& { diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp index 36e7c3d1..5a30071e 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -19,7 +20,7 @@ class Tag { public: explicit Tag(std::string name) : m_name{std::move(name)} {} - [[nodiscard]] auto get_name() const -> std::string const& { return m_name; } + [[nodiscard]] auto get_name() const -> std::string_view { return m_name; } private: std::string const m_name; diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 8906be8d..6c0ee042 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -1,7 +1,5 @@ #include -#include #include -#include #include #include #include @@ -118,7 +116,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { auto* regex_ast_capture = dynamic_cast(regex_ast_cat_ptr->get_right()); REQUIRE(nullptr != regex_ast_capture); - REQUIRE("uID" == regex_ast_capture->get_group_name()); + REQUIRE("uID" == 
string{regex_ast_capture->get_group_name()}); auto* regex_ast_multiplication_ast = dynamic_cast( regex_ast_capture->get_group_regex_ast().get() diff --git a/tests/test-tag.cpp b/tests/test-tag.cpp index 05a812cd..fa7f6b9f 100644 --- a/tests/test-tag.cpp +++ b/tests/test-tag.cpp @@ -2,12 +2,15 @@ #include +#include + using log_surgeon::finite_automata::Tag; +using std::string; TEST_CASE("Tag operations", "[Tag]") { SECTION("Basic name retrieval works correctly") { Tag const tag{"uID"}; - REQUIRE("uID" == tag.get_name()); + REQUIRE("uID" == string{tag.get_name()}); } SECTION("Empty tag name is handled correctly") { @@ -17,6 +20,6 @@ TEST_CASE("Tag operations", "[Tag]") { SECTION("Special characters in tag names are preserved") { Tag const special_tag{"user.id-123_@"}; - REQUIRE("user.id-123_@" == special_tag.get_name()); + REQUIRE("user.id-123_@" == string{special_tag.get_name()}); } } From 22c3b8200a2fd8722e2fb68ea54b4cf43ef4351f Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 13 Nov 2024 14:25:06 -0500 Subject: [PATCH 180/323] Remove const from member variable. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/Tag.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp index 5a30071e..f552ecc6 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -23,7 +23,7 @@ class Tag { [[nodiscard]] auto get_name() const -> std::string_view { return m_name; } private: - std::string const m_name; + std::string m_name; std::vector m_starts; std::vector m_ends; }; From ed5553431ddc7cfe8f0eeb0863c384f21865738e Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 13 Nov 2024 14:25:18 -0500 Subject: [PATCH 181/323] Remove const from member variable. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/TaggedTransition.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 2c238275..effd88c6 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -71,7 +71,7 @@ class NegativeTaggedTransition { } private: - std::vector const m_tags; + std::vector m_tags; NFAStateType const* m_dest_state; }; } // namespace log_surgeon::finite_automata From 534afce1f12491dbc22c5a0fe949028cf9170178 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 13 Nov 2024 14:26:58 -0500 Subject: [PATCH 182/323] Run linter. --- tests/test-tag.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-tag.cpp b/tests/test-tag.cpp index fa7f6b9f..28d4fedd 100644 --- a/tests/test-tag.cpp +++ b/tests/test-tag.cpp @@ -1,9 +1,9 @@ +#include + #include #include -#include - using log_surgeon::finite_automata::Tag; using std::string; From 61fdb5dabdca7d185006a49feae2ffcde5a66914 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 13 Nov 2024 14:42:30 -0500 Subject: [PATCH 183/323] Add move semantic test cases. 
--- tests/test-tag.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test-tag.cpp b/tests/test-tag.cpp index 28d4fedd..a8b35e99 100644 --- a/tests/test-tag.cpp +++ b/tests/test-tag.cpp @@ -22,4 +22,14 @@ TEST_CASE("Tag operations", "[Tag]") { Tag const special_tag{"user.id-123_@"}; REQUIRE("user.id-123_@" == string{special_tag.get_name()}); } + + SECTION("Move semantics work correctly") { + Tag original_tag{"source"}; + Tag moved_tag{std::move(original_tag)}; + REQUIRE("source" == string{moved_tag.get_name()}); + + Tag assign_tag{"target"}; + assign_tag = Tag{"new_source"}; + REQUIRE("new_source" == string{assign_tag.get_name()}); + } } From 78e5fe8ef050d534374aff688e75af7ef2dd5375 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 13 Nov 2024 14:54:58 -0500 Subject: [PATCH 184/323] Add PositiveTaggedTransition docstring and make m_tag throw if ever null. --- src/log_surgeon/finite_automata/TaggedTransition.hpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index effd88c6..571c9425 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -13,11 +13,18 @@ #include namespace log_surgeon::finite_automata { + +/** + * Represents an NFA transition indicating a capture group has been matched. + * `m_tag` is always expected to be non-null. + * @throw std::invalid_argument Thrown when a null tag is passed into the constructor. + * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). + */ template class PositiveTaggedTransition { public: PositiveTaggedTransition(Tag const* tag, NFAStateType const* dest_state) - : m_tag{tag}, + : m_tag{nullptr == tag ? 
throw std::invalid_argument("tag cannot be null") : tag}, m_dest_state{dest_state} {} [[nodiscard]] auto get_dest_state() const -> NFAStateType const* { return m_dest_state; } @@ -30,7 +37,7 @@ class PositiveTaggedTransition { [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional { auto const state_id_it = state_ids.find(m_dest_state); - if (state_id_it == state_ids.end() || nullptr == m_tag) { + if (state_id_it == state_ids.end()) { return std::nullopt; } return fmt::format("{}[{}]", state_id_it->second, m_tag->get_name()); From 630d882b025f5e5234e232eeaafeb1f780956103 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 13 Nov 2024 15:14:24 -0500 Subject: [PATCH 185/323] Delete unused operators. --- src/log_surgeon/finite_automata/RegexAST.hpp | 22 +++----------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index dd390cc9..6be339b9 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -123,9 +123,9 @@ class RegexAST { protected: RegexAST(RegexAST const& rhs) = default; - auto operator=(RegexAST const& rhs) -> RegexAST& = default; - RegexAST(RegexAST&& rhs) noexcept = default; - auto operator=(RegexAST&& rhs) noexcept -> RegexAST& = default; + auto operator=(RegexAST const& rhs) -> RegexAST& = delete; + RegexAST(RegexAST&& rhs) noexcept = delete; + auto operator=(RegexAST&& rhs) noexcept -> RegexAST& = delete; [[nodiscard]] auto serialize_negative_tags() const -> std::u32string { if (m_negative_tags.empty()) { @@ -439,10 +439,6 @@ class RegexASTOr : public RegexAST { m_left(std::unique_ptr>(rhs.m_left->clone())), m_right(std::unique_ptr>(rhs.m_right->clone())) {} - auto operator=(RegexASTOr const& rhs) -> RegexASTOr& = default; - RegexASTOr(RegexASTOr&& rhs) noexcept = default; - auto operator=(RegexASTOr&& rhs) noexcept -> RegexASTOr& = default; - /** * 
Used for cloning a unique_pointer of type RegexASTOr * @return RegexASTOr* @@ -506,10 +502,6 @@ class RegexASTCat : public RegexAST { m_left(std::unique_ptr>(rhs.m_left->clone())), m_right(std::unique_ptr>(rhs.m_right->clone())) {} - auto operator=(RegexASTCat const& rhs) -> RegexASTCat& = default; - RegexASTCat(RegexASTCat&& rhs) noexcept = default; - auto operator=(RegexASTCat&& rhs) noexcept -> RegexASTCat& = default; - /** * Used for cloning a unique_pointer of type RegexASTCat * @return RegexASTCat* @@ -575,10 +567,6 @@ class RegexASTMultiplication : public RegexAST { m_min(rhs.m_min), m_max(rhs.m_max) {} - auto operator=(RegexASTMultiplication const& rhs) -> RegexASTMultiplication& = default; - RegexASTMultiplication(RegexASTMultiplication&& rhs) noexcept = default; - auto operator=(RegexASTMultiplication&& rhs) noexcept -> RegexASTMultiplication& = default; - /** * Used for cloning a unique_pointer of type RegexASTMultiplication * @return RegexASTMultiplication* @@ -659,10 +647,6 @@ class RegexASTCapture : public RegexAST { RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); } - auto operator=(RegexASTCapture const& rhs) -> RegexASTCapture& = default; - RegexASTCapture(RegexASTCapture&& rhs) noexcept = default; - auto operator=(RegexASTCapture&& rhs) noexcept -> RegexASTCapture& = default; - /** * Used for cloning a `unique_pointer` of type `RegexASTCapture`. * @return RegexASTCapture* From 543f8af0cfcbaef4905fe4d0b7517a486a864405 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 13 Nov 2024 15:24:47 -0500 Subject: [PATCH 186/323] Move null check into intiailizer list for NegativeTaggedTransition constructor; Add docstring to NegativeTaggedTransition. 
--- .../finite_automata/TaggedTransition.hpp | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 571c9425..30d48942 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -15,9 +15,9 @@ namespace log_surgeon::finite_automata { /** - * Represents an NFA transition indicating a capture group has been matched. + * Represents an NFA transition indicating that a capture group has been matched. * `m_tag` is always expected to be non-null. - * @throw std::invalid_argument Thrown when a null tag is passed into the constructor. + * @throw std::invalid_argument Thrown if a null tag is passed into the constructor. * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). */ template @@ -48,11 +48,22 @@ class PositiveTaggedTransition { NFAStateType const* m_dest_state; }; +/** + * Represents an NFA transition indicating that a capture group has been unmatched. + * All tags in `m_tags` are always expected to be non-null. + * @throw std::invalid_argument Thrown if any tag passed into the constructor is null. + * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). 
+ */ template class NegativeTaggedTransition { public: NegativeTaggedTransition(std::vector tags, NFAStateType const* dest_state) - : m_tags{std::move(tags)}, + : m_tags{[&tags] { + if (std::ranges::any_of(tags, [](Tag const* tag) { return nullptr == tag; })) { + throw std::invalid_argument("tags cannot contain null elements"); + } + return std::move(tags); + }()}, m_dest_state{dest_state} {} [[nodiscard]] auto get_dest_state() const -> NFAStateType const* { return m_dest_state; } @@ -69,9 +80,6 @@ class NegativeTaggedTransition { return std::nullopt; } - if (std::ranges::any_of(m_tags, [](Tag const* tag) { return tag == nullptr; })) { - return std::nullopt; - } auto const tag_names = m_tags | std::ranges::views::transform(&Tag::get_name); return fmt::format("{}[{}]", state_id_it->second, fmt::join(tag_names, ",")); From ec342fc34036ef783739010ca562e5b10a9c6b43 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 13 Nov 2024 16:12:15 -0500 Subject: [PATCH 187/323] Remove position vectors from Tag, as they arent used in the AST. --- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- src/log_surgeon/finite_automata/Tag.hpp | 12 ------------ 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 6be339b9..3fd2b104 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -643,7 +643,7 @@ class RegexASTCapture : public RegexAST { m_group_regex_ast{ std::unique_ptr>(rhs.m_group_regex_ast->clone()) }, - m_tag{rhs.m_tag ? 
std::make_unique(*rhs.m_tag) : nullptr} { + m_tag{std::make_unique(*rhs.m_tag)} { RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); } diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp index f552ecc6..3a3b4d7f 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -1,21 +1,11 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_TAG #define LOG_SURGEON_FINITE_AUTOMATA_TAG -#include #include #include #include -#include namespace log_surgeon::finite_automata { -/** - * This class represents a tag that is associated with matches of a capture group. If `m_starts` is - * empty, it indicates that the capture group was unmatched. - * - * Since capture group regex can be contained within repetition regex, - * (e.g., "((user_id=(?\d+),)+"), `m_starts` and `m_ends` are vectors that track the locations - * of each occurrence of the capture group. - */ class Tag { public: explicit Tag(std::string name) : m_name{std::move(name)} {} @@ -24,8 +14,6 @@ class Tag { private: std::string m_name; - std::vector m_starts; - std::vector m_ends; }; } // namespace log_surgeon::finite_automata From af86281948843e8f0ae8389e87d07e3fe5b77bda Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 13 Nov 2024 17:14:19 -0500 Subject: [PATCH 188/323] RegexASTCapture enforces non-null arguments; Add docstring to RegexASTCapture; Use cbegin() and cend(). --- src/log_surgeon/finite_automata/RegexAST.hpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 3fd2b104..d78a8e76 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -621,6 +621,14 @@ class RegexASTMultiplication : public RegexAST { uint32_t m_max; }; +/** + * Represents a capture group AST node. + * `m_tag` is always expected to be non-null. 
+ * `m_group_regex_ast` is always expected to be non-null. + * @throw std::invalid_argument Thrown if a null tag or group regex AST is passed into the + * constructor. + * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). + */ template class RegexASTCapture : public RegexAST { public: @@ -630,8 +638,9 @@ class RegexASTCapture : public RegexAST { std::unique_ptr> group_regex_ast, std::unique_ptr tag ) - : m_group_regex_ast{std::move(group_regex_ast)}, - m_tag{std::move(tag)} { + : m_group_regex_ast{nullptr == group_regex_ast ? throw std::invalid_argument("group regex AST cannot be null") : std::move(group_regex_ast)}, + m_tag{nullptr == tag ? throw std::invalid_argument("tag cannot be null") + : std::move(tag)} { RegexAST::set_subtree_positive_tags( m_group_regex_ast->get_subtree_positive_tags() ); @@ -884,10 +893,10 @@ void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAS template [[nodiscard]] auto RegexASTCapture::serialize() const -> std::u32string { - auto const tag_name_u32 = std::u32string(m_tag->get_name().begin(), m_tag->get_name().end()); + auto const tag_name_u32 = std::u32string(m_tag->get_name().cbegin(), m_tag->get_name().cend()); return fmt::format( U"({})<{}>{}", - nullptr != m_group_regex_ast ? m_group_regex_ast->serialize() : U"null", + m_group_regex_ast->serialize(), tag_name_u32, RegexAST::serialize_negative_tags() ); From 738becd7481dea631c246a4b67ecc237db976c44 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 13 Nov 2024 17:17:48 -0500 Subject: [PATCH 189/323] Capitalize exceptions. 
--- src/log_surgeon/finite_automata/RegexAST.hpp | 4 ++-- src/log_surgeon/finite_automata/TaggedTransition.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index d78a8e76..d37067ec 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -638,8 +638,8 @@ class RegexASTCapture : public RegexAST { std::unique_ptr> group_regex_ast, std::unique_ptr tag ) - : m_group_regex_ast{nullptr == group_regex_ast ? throw std::invalid_argument("group regex AST cannot be null") : std::move(group_regex_ast)}, - m_tag{nullptr == tag ? throw std::invalid_argument("tag cannot be null") + : m_group_regex_ast{nullptr == group_regex_ast ? throw std::invalid_argument("Group regex AST cannot be null") : std::move(group_regex_ast)}, + m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") : std::move(tag)} { RegexAST::set_subtree_positive_tags( m_group_regex_ast->get_subtree_positive_tags() diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 30d48942..309fc97d 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -24,7 +24,7 @@ template class PositiveTaggedTransition { public: PositiveTaggedTransition(Tag const* tag, NFAStateType const* dest_state) - : m_tag{nullptr == tag ? throw std::invalid_argument("tag cannot be null") : tag}, + : m_tag{nullptr == tag ? 
throw std::invalid_argument("Tag cannot be null") : tag}, m_dest_state{dest_state} {} [[nodiscard]] auto get_dest_state() const -> NFAStateType const* { return m_dest_state; } @@ -60,7 +60,7 @@ class NegativeTaggedTransition { NegativeTaggedTransition(std::vector tags, NFAStateType const* dest_state) : m_tags{[&tags] { if (std::ranges::any_of(tags, [](Tag const* tag) { return nullptr == tag; })) { - throw std::invalid_argument("tags cannot contain null elements"); + throw std::invalid_argument("Tags cannot contain null elements"); } return std::move(tags); }()}, From 789263ea958b724a85ad9fd0f0dc7dbfd0300241 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 13 Nov 2024 17:32:47 -0500 Subject: [PATCH 190/323] Use () to fix linting issue. --- src/log_surgeon/finite_automata/RegexAST.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index d37067ec..57a8ae15 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -638,7 +638,11 @@ class RegexASTCapture : public RegexAST { std::unique_ptr> group_regex_ast, std::unique_ptr tag ) - : m_group_regex_ast{nullptr == group_regex_ast ? throw std::invalid_argument("Group regex AST cannot be null") : std::move(group_regex_ast)}, + : m_group_regex_ast{( + nullptr == group_regex_ast + ? throw std::invalid_argument("Group regex AST cannot be null") + : std::move(group_regex_ast) + )}, m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") : std::move(tag)} { RegexAST::set_subtree_positive_tags( From 1f15ca712709244a7d61f56e7a31c192e3471934 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 14 Nov 2024 10:08:39 -0500 Subject: [PATCH 191/323] Keep default copy assignment. 
--- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 57a8ae15..ed23ece5 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -123,7 +123,7 @@ class RegexAST { protected: RegexAST(RegexAST const& rhs) = default; - auto operator=(RegexAST const& rhs) -> RegexAST& = delete; + auto operator=(RegexAST const& rhs) -> RegexAST& = default; RegexAST(RegexAST&& rhs) noexcept = delete; auto operator=(RegexAST&& rhs) noexcept -> RegexAST& = delete; From 7688c24442930a5e6aad9af8cd2fe0c532138c79 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 14 Nov 2024 10:15:31 -0500 Subject: [PATCH 192/323] Move @throw to constructor docstrings. --- src/log_surgeon/finite_automata/RegexAST.hpp | 7 +++++-- src/log_surgeon/finite_automata/TaggedTransition.hpp | 12 ++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index ed23ece5..acb9ce39 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -625,8 +625,6 @@ class RegexASTMultiplication : public RegexAST { * Represents a capture group AST node. * `m_tag` is always expected to be non-null. * `m_group_regex_ast` is always expected to be non-null. - * @throw std::invalid_argument Thrown if a null tag or group regex AST is passed into the - * constructor. * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). */ template @@ -634,6 +632,11 @@ class RegexASTCapture : public RegexAST { public: ~RegexASTCapture() override = default; + /** + * @param group_regex_ast + * @param tag + * @throw std::invalid_argument if `group_regex_ast` or `tag` are `nullptr`. 
+ */ RegexASTCapture( std::unique_ptr> group_regex_ast, std::unique_ptr tag diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 309fc97d..f1460aa7 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -17,12 +17,16 @@ namespace log_surgeon::finite_automata { /** * Represents an NFA transition indicating that a capture group has been matched. * `m_tag` is always expected to be non-null. - * @throw std::invalid_argument Thrown if a null tag is passed into the constructor. * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). */ template class PositiveTaggedTransition { public: + /** + * @param tag + * @param dest_state + * @throw std::invalid_argument if `tag` is `nullptr`. + */ PositiveTaggedTransition(Tag const* tag, NFAStateType const* dest_state) : m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") : tag}, m_dest_state{dest_state} {} @@ -51,12 +55,16 @@ class PositiveTaggedTransition { /** * Represents an NFA transition indicating that a capture group has been unmatched. * All tags in `m_tags` are always expected to be non-null. - * @throw std::invalid_argument Thrown if any tag passed into the constructor is null. * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). */ template class NegativeTaggedTransition { public: + /** + * @param tags + * @param dest_state + * @throw std::invalid_argument if any elements in `tags` is `nullptr`. + */ NegativeTaggedTransition(std::vector tags, NFAStateType const* dest_state) : m_tags{[&tags] { if (std::ranges::any_of(tags, [](Tag const* tag) { return nullptr == tag; })) { From 486190a0fedd66f70a3715c610670f22ab401c6f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 14 Nov 2024 11:12:34 -0500 Subject: [PATCH 193/323] Do string_viee comparisomn in lexer test. 
--- tests/test-lexer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 6c0ee042..dd305a76 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -116,7 +116,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { auto* regex_ast_capture = dynamic_cast(regex_ast_cat_ptr->get_right()); REQUIRE(nullptr != regex_ast_capture); - REQUIRE("uID" == string{regex_ast_capture->get_group_name()}); + REQUIRE("uID" == regex_ast_capture->get_group_name()); auto* regex_ast_multiplication_ast = dynamic_cast( regex_ast_capture->get_group_regex_ast().get() From ac75909319c2c2cfba0f32e9afc01889c40407dd Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 14 Nov 2024 11:17:16 -0500 Subject: [PATCH 194/323] Use string_view compares in tag tests. --- tests/test-tag.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/test-tag.cpp b/tests/test-tag.cpp index a8b35e99..90264939 100644 --- a/tests/test-tag.cpp +++ b/tests/test-tag.cpp @@ -1,16 +1,13 @@ -#include - #include #include using log_surgeon::finite_automata::Tag; -using std::string; TEST_CASE("Tag operations", "[Tag]") { SECTION("Basic name retrieval works correctly") { Tag const tag{"uID"}; - REQUIRE("uID" == string{tag.get_name()}); + REQUIRE("uID" == tag.get_name()); } SECTION("Empty tag name is handled correctly") { @@ -20,16 +17,16 @@ TEST_CASE("Tag operations", "[Tag]") { SECTION("Special characters in tag names are preserved") { Tag const special_tag{"user.id-123_@"}; - REQUIRE("user.id-123_@" == string{special_tag.get_name()}); + REQUIRE("user.id-123_@" == special_tag.get_name()); } SECTION("Move semantics work correctly") { Tag original_tag{"source"}; Tag moved_tag{std::move(original_tag)}; - REQUIRE("source" == string{moved_tag.get_name()}); + REQUIRE("source" == moved_tag.get_name()); Tag assign_tag{"target"}; assign_tag = Tag{"new_source"}; - REQUIRE("new_source" == string{assign_tag.get_name()}); + 
REQUIRE("new_source" == assign_tag.get_name()); } } From 090f18cae3ead36721fd32a85cdb3e21fec863f0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 15 Nov 2024 11:15:24 -0500 Subject: [PATCH 195/323] Update headers in TaggedTransition.hpp. --- src/log_surgeon/finite_automata/TaggedTransition.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index f1460aa7..c4cfb76a 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -3,13 +3,12 @@ #include #include -#include +#include #include #include #include -#include #include namespace log_surgeon::finite_automata { From c7cfc10b914e54e23085282160bb7de66690ec63 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 15 Nov 2024 11:16:06 -0500 Subject: [PATCH 196/323] Seperate copy and move constructor unit-tests. --- tests/test-tag.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test-tag.cpp b/tests/test-tag.cpp index 90264939..41f8a2ef 100644 --- a/tests/test-tag.cpp +++ b/tests/test-tag.cpp @@ -20,13 +20,15 @@ TEST_CASE("Tag operations", "[Tag]") { REQUIRE("user.id-123_@" == special_tag.get_name()); } - SECTION("Move semantics work correctly") { - Tag original_tag{"source"}; - Tag moved_tag{std::move(original_tag)}; - REQUIRE("source" == moved_tag.get_name()); - + SECTION("Copy constructor works correctly") { Tag assign_tag{"target"}; assign_tag = Tag{"new_source"}; REQUIRE("new_source" == assign_tag.get_name()); } + + SECTION("Move constructor works correctly") { + Tag original_tag{"source"}; + Tag moved_tag{std::move(original_tag)}; + REQUIRE("source" == moved_tag.get_name()); + } } From 91b8b515eacd07ec77f37469c5303daa4175fa38 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Fri, 15 Nov 2024 11:17:37 -0500 Subject: [PATCH 197/323] Use NOTE for class requirements. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexAST.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index acb9ce39..c0c6b04f 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -623,8 +623,9 @@ class RegexASTMultiplication : public RegexAST { /** * Represents a capture group AST node. - * `m_tag` is always expected to be non-null. - * `m_group_regex_ast` is always expected to be non-null. + * NOTE: + * - `m_tag` is always expected to be non-null. + * - `m_group_regex_ast` is always expected to be non-null. * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). */ template From fcb1a76fb34e2f2c08d065dbe29dc7620f22d791 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Fri, 15 Nov 2024 11:17:44 -0500 Subject: [PATCH 198/323] Use NOTE for class requirements. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/TaggedTransition.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index c4cfb76a..2d530d7b 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -15,7 +15,7 @@ namespace log_surgeon::finite_automata { /** * Represents an NFA transition indicating that a capture group has been matched. - * `m_tag` is always expected to be non-null. + * NOTE: `m_tag` is always expected to be non-null. * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). 
*/ template From 9b09e1991c70206793f809661e319f7ad8edfeae Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Fri, 15 Nov 2024 11:17:50 -0500 Subject: [PATCH 199/323] Use NOTE for class requirements. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/TaggedTransition.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 2d530d7b..86fe7a39 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -53,7 +53,7 @@ class PositiveTaggedTransition { /** * Represents an NFA transition indicating that a capture group has been unmatched. - * All tags in `m_tags` are always expected to be non-null. + * NOTE: All tags in `m_tags` are always expected to be non-null. * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). */ template From 75aecc44a4b3d5300dd1444c57ac259de70109c4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 18 Nov 2024 10:41:52 -0500 Subject: [PATCH 200/323] Update install-catch2.sh to compile catch2 with c++17. --- tools/deps-install/ubuntu/install-catch2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/deps-install/ubuntu/install-catch2.sh b/tools/deps-install/ubuntu/install-catch2.sh index bb5ebfbe..aa063d72 100755 --- a/tools/deps-install/ubuntu/install-catch2.sh +++ b/tools/deps-install/ubuntu/install-catch2.sh @@ -69,7 +69,7 @@ fi # Build cd "$extracted_dir" -cmake -B build -S . -DBUILD_TESTING=OFF +cmake -B build -S . -DBUILD_TESTING=OFF -DCMAKE_CXX_STANDARD=17 cmake --build build --parallel "$num_cpus" # Check if checkinstall is installed From 507a7d3fa0a9ce9b52204c6f4efa5349ae5ee755 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 18 Nov 2024 11:10:37 -0500 Subject: [PATCH 201/323] Loop over end_transitions correctly. 
--- src/log_surgeon/Lexer.tpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 43750036..bdac76a1 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -411,7 +411,7 @@ auto Lexer::epsilon_closure(NFAStateType const* stat stack.push(positive_tagged_start_transition.get_dest_state()); } for (auto const& positive_tagged_end_transition : - current_state->get_positive_tagged_start_transitions()) + current_state->get_positive_tagged_end_transitions()) { stack.push(positive_tagged_end_transition.get_dest_state()); } From 34c227b74268e0ccea2847187c86dacfc408b89a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 18 Nov 2024 11:29:33 -0500 Subject: [PATCH 202/323] Add TagPositions class. --- src/log_surgeon/finite_automata/Tag.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp index 3a3b4d7f..7fa5ceca 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -15,6 +15,16 @@ class Tag { private: std::string m_name; }; + +class TagPositions { +public: + explicit TagPositions(Tag const* tag) : m_tag{tag} {} + +private: + Tag const* m_tag; + std::vector start_positions; + std::vector end_positions; +}; } // namespace log_surgeon::finite_automata #endif // LOG_SURGEON_FINITE_AUTOMATA_TAG From 27c8560a578e7d5dc5b8e5496c393c43798a5b0a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 18 Nov 2024 11:38:32 -0500 Subject: [PATCH 203/323] Remove new class, going to add it later. 
--- src/log_surgeon/finite_automata/Tag.hpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Tag.hpp index 7fa5ceca..3a3b4d7f 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Tag.hpp @@ -15,16 +15,6 @@ class Tag { private: std::string m_name; }; - -class TagPositions { -public: - explicit TagPositions(Tag const* tag) : m_tag{tag} {} - -private: - Tag const* m_tag; - std::vector start_positions; - std::vector end_positions; -}; } // namespace log_surgeon::finite_automata #endif // LOG_SURGEON_FINITE_AUTOMATA_TAG From 86caa9bff25fe8a44c24bdafc699baf479fdaceb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 18 Nov 2024 11:43:18 -0500 Subject: [PATCH 204/323] Add const back in. --- src/log_surgeon/finite_automata/RegexAST.hpp | 10 +++++----- src/log_surgeon/finite_automata/RegexNFA.hpp | 16 ++++++++-------- .../finite_automata/RegexNFAState.hpp | 6 +++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 5b5a82db..6c98109f 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -87,11 +87,11 @@ class RegexAST { return m_subtree_positive_tags; } - auto set_subtree_positive_tags(std::vector subtree_positive_tags) -> void { + auto set_subtree_positive_tags(std::vector subtree_positive_tags) -> void { m_subtree_positive_tags = std::move(subtree_positive_tags); } - auto add_subtree_positive_tags(std::vector const& subtree_positive_tags) -> void { + auto add_subtree_positive_tags(std::vector const& subtree_positive_tags) -> void { m_subtree_positive_tags.insert( m_subtree_positive_tags.end(), subtree_positive_tags.cbegin(), @@ -99,7 +99,7 @@ class RegexAST { ); } - auto set_negative_tags(std::vector negative_tags) -> void { + auto set_negative_tags(std::vector negative_tags) -> 
void { m_negative_tags = std::move(negative_tags); } @@ -146,8 +146,8 @@ class RegexAST { } private: - std::vector m_subtree_positive_tags; - std::vector m_negative_tags; + std::vector m_subtree_positive_tags; + std::vector m_negative_tags; }; /** diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 9fa0112b..c35aa83a 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -42,8 +42,8 @@ class RegexNFA { * @return NFAStateType* */ [[nodiscard]] auto new_state_with_positive_tagged_transition( - Tag* tag, - NFAStateType* dest_state + Tag const* tag, + NFAStateType const* dest_state ) -> NFAStateType*; /** @@ -54,8 +54,8 @@ class RegexNFA { * @return NFAStateType* */ [[nodiscard]] auto new_state_with_negative_tagged_transition( - std::vector tags, - NFAStateType* dest_state + std::vector tags, + NFAStateType const* dest_state ) -> NFAStateType*; /** @@ -102,8 +102,8 @@ auto RegexNFA::new_state() -> NFAStateType* { template auto RegexNFA::new_state_with_positive_tagged_transition( - Tag* tag, - NFAStateType* dest_state + Tag const* tag, + NFAStateType const* dest_state ) -> NFAStateType* { m_states.emplace_back(std::make_unique(tag, dest_state)); return m_states.back().get(); @@ -111,8 +111,8 @@ auto RegexNFA::new_state_with_positive_tagged_transition( template auto RegexNFA::new_state_with_negative_tagged_transition( - std::vector tags, - NFAStateType* dest_state + std::vector tags, + NFAStateType const* dest_state ) -> NFAStateType* { m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); return m_states.back().get(); diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 139b179b..94df249f 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -31,10 +31,10 @@ class RegexNFAState { RegexNFAState() = default; 
- RegexNFAState(Tag* tag, RegexNFAState* dest_state) + RegexNFAState(Tag const* tag, RegexNFAState* dest_state) : m_positive_tagged_end_transitions{{tag, dest_state}} {} - RegexNFAState(std::vector tags, RegexNFAState* dest_state) + RegexNFAState(std::vector tags, RegexNFAState* dest_state) : m_negative_tagged_transition{NegativeTaggedTransition{std::move(tags), dest_state}} {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } @@ -49,7 +49,7 @@ class RegexNFAState { return m_matching_variable_id; } - auto add_positive_tagged_start_transition(Tag* tag, RegexNFAState* dest_state) -> void { + auto add_positive_tagged_start_transition(Tag const* tag, RegexNFAState* dest_state) -> void { m_positive_tagged_start_transitions.emplace_back(tag, dest_state); } From 338638e0514d88f063754b66b3fce172d9425d42 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 18 Nov 2024 11:45:53 -0500 Subject: [PATCH 205/323] Add more const back in. --- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- src/log_surgeon/finite_automata/TaggedTransition.hpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 6c98109f..21691d4b 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -83,7 +83,7 @@ class RegexAST { */ [[nodiscard]] virtual auto serialize() const -> std::u32string = 0; - [[nodiscard]] auto get_subtree_positive_tags() const -> std::vector const& { + [[nodiscard]] auto get_subtree_positive_tags() const -> std::vector const& { return m_subtree_positive_tags; } diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index f04143da..f1686c80 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -26,7 +26,7 @@ class PositiveTaggedTransition { * @param 
dest_state * @throw std::invalid_argument if `tag` is `nullptr`. */ - PositiveTaggedTransition(Tag* tag, NFAStateType const* dest_state) + PositiveTaggedTransition(Tag const* tag, NFAStateType const* dest_state) : m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") : tag}, m_dest_state{dest_state} {} @@ -47,7 +47,7 @@ class PositiveTaggedTransition { } private: - Tag* m_tag; + Tag const* m_tag; NFAStateType const* m_dest_state; }; @@ -64,7 +64,7 @@ class NegativeTaggedTransition { * @param dest_state * @throw std::invalid_argument if any elements in `tags` is `nullptr`. */ - NegativeTaggedTransition(std::vector tags, NFAStateType* dest_state) + NegativeTaggedTransition(std::vector tags, NFAStateType* dest_state) : m_tags{[&tags] { if (std::ranges::any_of(tags, [](Tag const* tag) { return nullptr == tag; })) { throw std::invalid_argument("Tags cannot contain null elements"); @@ -93,7 +93,7 @@ class NegativeTaggedTransition { } private: - std::vector m_tags; + std::vector m_tags; NFAStateType* m_dest_state; }; } // namespace log_surgeon::finite_automata From a742601f28162bb010a87ee9c2d3e75c9f020702 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 18 Nov 2024 11:46:31 -0500 Subject: [PATCH 206/323] Add more const back in. 
--- src/log_surgeon/finite_automata/TaggedTransition.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index f1686c80..beedd423 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -94,7 +94,7 @@ class NegativeTaggedTransition { private: std::vector m_tags; - NFAStateType* m_dest_state; + NFAStateType const* m_dest_state; }; } // namespace log_surgeon::finite_automata From d3587134daa5765ed3a61f336a8cdec7525a05cd Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 18 Nov 2024 11:47:58 -0500 Subject: [PATCH 207/323] Linter. --- src/log_surgeon/Lexer.tpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index bdac76a1..d72f11ef 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -411,7 +411,7 @@ auto Lexer::epsilon_closure(NFAStateType const* stat stack.push(positive_tagged_start_transition.get_dest_state()); } for (auto const& positive_tagged_end_transition : - current_state->get_positive_tagged_end_transitions()) + current_state->get_positive_tagged_end_transitions()) { stack.push(positive_tagged_end_transition.get_dest_state()); } From 43870ea84bd6bb4d602082c541d07a869634d1c7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 18 Nov 2024 11:49:42 -0500 Subject: [PATCH 208/323] Add more const back in. 
--- src/log_surgeon/finite_automata/RegexNFAState.hpp | 4 ++-- src/log_surgeon/finite_automata/TaggedTransition.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 94df249f..61ab42c4 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -31,10 +31,10 @@ class RegexNFAState { RegexNFAState() = default; - RegexNFAState(Tag const* tag, RegexNFAState* dest_state) + RegexNFAState(Tag const* tag, RegexNFAState const* dest_state) : m_positive_tagged_end_transitions{{tag, dest_state}} {} - RegexNFAState(std::vector tags, RegexNFAState* dest_state) + RegexNFAState(std::vector tags, RegexNFAState const* dest_state) : m_negative_tagged_transition{NegativeTaggedTransition{std::move(tags), dest_state}} {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index beedd423..86fe7a39 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -64,7 +64,7 @@ class NegativeTaggedTransition { * @param dest_state * @throw std::invalid_argument if any elements in `tags` is `nullptr`. */ - NegativeTaggedTransition(std::vector tags, NFAStateType* dest_state) + NegativeTaggedTransition(std::vector tags, NFAStateType const* dest_state) : m_tags{[&tags] { if (std::ranges::any_of(tags, [](Tag const* tag) { return nullptr == tag; })) { throw std::invalid_argument("Tags cannot contain null elements"); From f94160720d593a7d62137a3e25ab17b704e5a7aa Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Tue, 19 Nov 2024 15:10:54 -0500 Subject: [PATCH 209/323] Use `auto`. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 21691d4b..0a081b56 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -894,7 +894,7 @@ template template void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const { - NFAStateType* root = nfa->get_root(); + auto* root = nfa->get_root(); auto* capture_group_start_state = nfa->new_state(); root->add_positive_tagged_start_transition(m_tag.get(), capture_group_start_state); From aad9eb39b2a2ad149385557b3203caf0ce165dff Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Tue, 19 Nov 2024 15:11:49 -0500 Subject: [PATCH 210/323] Fix spacing. Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexAST.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 0a081b56..221c77be 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -902,7 +902,6 @@ void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAS = nfa->new_state_with_positive_tagged_transition(m_tag.get(), end_state); nfa->set_root(capture_group_start_state); m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, state_with_positive_tagged_transition); - nfa->set_root(root); } From a801bf89ba263e597939b587bf489d2f41befb4f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 19 Nov 2024 16:23:51 -0500 Subject: [PATCH 211/323] Add diagram for capture group NFA. 
--- src/log_surgeon/finite_automata/RegexAST.hpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 221c77be..4eb4a21e 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -894,14 +894,20 @@ template template void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const { + // root --(pos_tagged_start_transition)--> capture_group_start_state --> + // [inner capture group NFA] --(neg_tagged_transition)--> neg_state --> + // state_with_positive_tagged_end_transition --(pos_tagged_end_transition)--> end_state auto* root = nfa->get_root(); auto* capture_group_start_state = nfa->new_state(); root->add_positive_tagged_start_transition(m_tag.get(), capture_group_start_state); - auto* state_with_positive_tagged_transition + auto* state_with_positive_tagged_end_transition = nfa->new_state_with_positive_tagged_transition(m_tag.get(), end_state); nfa->set_root(capture_group_start_state); - m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, state_with_positive_tagged_transition); + m_group_regex_ast->add_to_nfa_with_negative_tags( + nfa, + state_with_positive_tagged_end_transition + ); nfa->set_root(root); } From 08b7548b40cd2342ba61bc5e57d299d457dccb39 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 19 Nov 2024 16:27:42 -0500 Subject: [PATCH 212/323] Add const for consitency with constructor. 
--- src/log_surgeon/finite_automata/RegexNFAState.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 61ab42c4..bf47011b 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -49,7 +49,8 @@ class RegexNFAState { return m_matching_variable_id; } - auto add_positive_tagged_start_transition(Tag const* tag, RegexNFAState* dest_state) -> void { + auto + add_positive_tagged_start_transition(Tag const* tag, RegexNFAState const* dest_state) -> void { m_positive_tagged_start_transitions.emplace_back(tag, dest_state); } From 449133e40fa019fdd284171d4fb2b6da655481d0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 19 Nov 2024 16:44:55 -0500 Subject: [PATCH 213/323] Update positive end transition to be optional instead of a vector. --- src/log_surgeon/Lexer.tpp | 9 +++--- src/log_surgeon/finite_automata/RegexNFA.hpp | 11 ++++--- .../finite_automata/RegexNFAState.hpp | 30 +++++++++---------- 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index d72f11ef..45524ed3 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -410,11 +410,12 @@ auto Lexer::epsilon_closure(NFAStateType const* stat { stack.push(positive_tagged_start_transition.get_dest_state()); } - for (auto const& positive_tagged_end_transition : - current_state->get_positive_tagged_end_transitions()) - { - stack.push(positive_tagged_end_transition.get_dest_state()); + auto const& optional_positive_tagged_end_transition + = current_state->get_positive_tagged_end_transitions(); + if (optional_positive_tagged_end_transition.has_value()) { + stack.push(optional_positive_tagged_end_transition.value().get_dest_state()); } + auto const& optional_negative_tagged_transition = current_state->get_negative_tagged_transition(); if 
(optional_negative_tagged_transition.has_value()) { diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index c35aa83a..374542cd 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -152,11 +152,14 @@ auto RegexNFA::get_bfs_traversal_order() const -> std::vectorget_positive_tagged_end_transitions()) - { - add_to_queue_and_visited(positive_tagged_end_transition.get_dest_state()); + + auto const& optional_positive_tagged_end_transition + = current_state->get_positive_tagged_end_transitions(); + if (optional_positive_tagged_end_transition.has_value()) { + add_to_queue_and_visited(optional_positive_tagged_end_transition.value().get_dest_state( + )); } + auto const& optional_negative_tagged_transition = current_state->get_negative_tagged_transition(); if (optional_negative_tagged_transition.has_value()) { diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index bf47011b..5d440551 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -32,7 +32,7 @@ class RegexNFAState { RegexNFAState() = default; RegexNFAState(Tag const* tag, RegexNFAState const* dest_state) - : m_positive_tagged_end_transitions{{tag, dest_state}} {} + : m_positive_tagged_end_transition{PositiveTaggedTransition{tag, dest_state}} {} RegexNFAState(std::vector tags, RegexNFAState const* dest_state) : m_negative_tagged_transition{NegativeTaggedTransition{std::move(tags), dest_state}} {} @@ -60,8 +60,8 @@ class RegexNFAState { } [[nodiscard]] auto get_positive_tagged_end_transitions( - ) const -> std::vector> const& { - return m_positive_tagged_end_transitions; + ) const -> std::optional> const& { + return m_positive_tagged_end_transition; } [[nodiscard]] auto get_negative_tagged_transition( @@ -111,7 +111,7 @@ class RegexNFAState { bool m_accepting{false}; uint32_t 
m_matching_variable_id{0}; std::vector> m_positive_tagged_start_transitions; - std::vector> m_positive_tagged_end_transitions; + std::optional> m_positive_tagged_end_transition; std::optional> m_negative_tagged_transition; std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; @@ -189,26 +189,24 @@ auto RegexNFAState::serialize( std::vector positive_tagged_start_transition_strings; for (auto const& positive_tagged_start_transition : m_positive_tagged_start_transitions) { - auto const optional_serialized_positive_transition + auto const optional_serialized_positive_start_transition = positive_tagged_start_transition.serialize(state_ids); - if (false == optional_serialized_positive_transition.has_value()) { + if (false == optional_serialized_positive_start_transition.has_value()) { return std::nullopt; } positive_tagged_start_transition_strings.emplace_back( - optional_serialized_positive_transition.value() + optional_serialized_positive_start_transition.value() ); } - std::vector positive_tagged_end_transition_strings; - for (auto const& positive_tagged_end_transition : m_positive_tagged_end_transitions) { - auto const optional_serialized_positive_transition - = positive_tagged_end_transition.serialize(state_ids); - if (false == optional_serialized_positive_transition.has_value()) { + std::string positive_tagged_end_transition_string; + if (m_positive_tagged_end_transition.has_value()) { + auto const optional_serialized_positive_end_transition + = m_positive_tagged_end_transition.value().serialize(state_ids); + if (false == optional_serialized_positive_end_transition.has_value()) { return std::nullopt; } - positive_tagged_end_transition_strings.emplace_back( - optional_serialized_positive_transition.value() - ); + positive_tagged_end_transition_string = optional_serialized_positive_end_transition.value(); } std::string negative_tagged_transition_string; @@ -233,7 +231,7 @@ auto RegexNFAState::serialize( fmt::join(byte_transitions, ","), 
fmt::join(epsilon_transitions, ","), fmt::join(positive_tagged_start_transition_strings, ","), - fmt::join(positive_tagged_end_transition_strings, ","), + positive_tagged_end_transition_string, negative_tagged_transition_string ); } From 7b837bf1580f2d51a5c3e9bb269dcfed838c2b7f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 19 Nov 2024 17:38:56 -0500 Subject: [PATCH 214/323] Rename new_state function correctly. --- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 4eb4a21e..48ea0313 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -902,7 +902,7 @@ void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAS root->add_positive_tagged_start_transition(m_tag.get(), capture_group_start_state); auto* state_with_positive_tagged_end_transition - = nfa->new_state_with_positive_tagged_transition(m_tag.get(), end_state); + = nfa->new_state_with_positive_tagged_end_transition(m_tag.get(), end_state); nfa->set_root(capture_group_start_state); m_group_regex_ast->add_to_nfa_with_negative_tags( nfa, diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 374542cd..7fb87d5b 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -41,7 +41,7 @@ class RegexNFA { * @param dest_state * @return NFAStateType* */ - [[nodiscard]] auto new_state_with_positive_tagged_transition( + [[nodiscard]] auto new_state_with_positive_tagged_end_transition( Tag const* tag, NFAStateType const* dest_state ) -> NFAStateType*; @@ -101,7 +101,7 @@ auto RegexNFA::new_state() -> NFAStateType* { } template -auto RegexNFA::new_state_with_positive_tagged_transition( +auto RegexNFA::new_state_with_positive_tagged_end_transition( 
Tag const* tag, NFAStateType const* dest_state ) -> NFAStateType* { From f0eb56b1f3d06fd51ec14844b8f6a25631293975 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 19 Nov 2024 17:50:32 -0500 Subject: [PATCH 215/323] Update capture group AST state creation. --- src/log_surgeon/finite_automata/RegexAST.hpp | 6 +++--- src/log_surgeon/finite_automata/RegexNFA.hpp | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 48ea0313..6e7dceeb 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -897,13 +897,13 @@ void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAS // root --(pos_tagged_start_transition)--> capture_group_start_state --> // [inner capture group NFA] --(neg_tagged_transition)--> neg_state --> // state_with_positive_tagged_end_transition --(pos_tagged_end_transition)--> end_state + auto* capture_group_start_state = nfa->new_capture_group_start_state(m_tag.get()); + auto* root = nfa->get_root(); - auto* capture_group_start_state = nfa->new_state(); - root->add_positive_tagged_start_transition(m_tag.get(), capture_group_start_state); + nfa->set_root(capture_group_start_state); auto* state_with_positive_tagged_end_transition = nfa->new_state_with_positive_tagged_end_transition(m_tag.get(), end_state); - nfa->set_root(capture_group_start_state); m_group_regex_ast->add_to_nfa_with_negative_tags( nfa, state_with_positive_tagged_end_transition diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 7fb87d5b..fdb289ae 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -35,7 +35,7 @@ class RegexNFA { [[nodiscard]] auto new_state() -> NFAStateType*; /** - * Creates a unique_ptr for an NFA state with a positive tagged transition and adds it to + * Creates a unique_ptr 
for an NFA state with a positive tagged end transition and adds it to * `m_states`. * @param tag * @param dest_state @@ -46,6 +46,13 @@ class RegexNFA { NFAStateType const* dest_state ) -> NFAStateType*; + /** + * Add an NFA state with in incoming positive tagged start transition from `m_root`. + * @param tag + * @return NFAStateType* + */ + [[nodiscard]] auto new_capture_group_start_state(Tag const* tag) -> NFAStateType*; + /** * Creates a unique_ptr for an NFA state with a negative tagged transition and adds it to * `m_states`. @@ -100,6 +107,13 @@ auto RegexNFA::new_state() -> NFAStateType* { return m_states.back().get(); } +template +auto RegexNFA::new_capture_group_start_state(Tag const* tag) -> NFAStateType* { + auto* state = new_state(); + m_root->add_positive_tagged_start_transition(tag, state); + return state; +} + template auto RegexNFA::new_state_with_positive_tagged_end_transition( Tag const* tag, From a9459154a075e6ce73537e7db8028febf8413b66 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 19 Nov 2024 18:27:34 -0500 Subject: [PATCH 216/323] Encapsulate new state for capture group. --- src/log_surgeon/finite_automata/RegexAST.hpp | 28 ++++++-------- src/log_surgeon/finite_automata/RegexNFA.hpp | 40 +++++++++++++------- 2 files changed, 38 insertions(+), 30 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 6e7dceeb..440c3a89 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -693,11 +693,11 @@ class RegexASTCapture : public RegexAST { /** * Adds the needed `RegexNFA::states` to the passed in nfa to handle a - * `RegexASTCapture` before transitioning to an accepting `end_state`. + * `RegexASTCapture` before transitioning to a `dest_state`. 
* @param nfa - * @param end_state + * @param dest_state */ - auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* dest_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; @@ -892,23 +892,19 @@ template } template -void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) - const { +void RegexASTCapture::add_to_nfa( + RegexNFA* nfa, + NFAStateType* dest_state +) const { // root --(pos_tagged_start_transition)--> capture_group_start_state --> // [inner capture group NFA] --(neg_tagged_transition)--> neg_state --> // state_with_positive_tagged_end_transition --(pos_tagged_end_transition)--> end_state - auto* capture_group_start_state = nfa->new_capture_group_start_state(m_tag.get()); - - auto* root = nfa->get_root(); - nfa->set_root(capture_group_start_state); + auto [start_state, end_state] = nfa->new_capture_group_start_states(m_tag.get(), dest_state); - auto* state_with_positive_tagged_end_transition - = nfa->new_state_with_positive_tagged_end_transition(m_tag.get(), end_state); - m_group_regex_ast->add_to_nfa_with_negative_tags( - nfa, - state_with_positive_tagged_end_transition - ); - nfa->set_root(root); + auto* initial_root = nfa->get_root(); + nfa->set_root(start_state); + m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, end_state); + nfa->set_root(initial_root); } template diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index fdb289ae..95d18615 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -46,13 +46,6 @@ class RegexNFA { NFAStateType const* dest_state ) -> NFAStateType*; - /** - * Add an NFA state with in incoming positive tagged start transition from `m_root`. 
- * @param tag - * @return NFAStateType* - */ - [[nodiscard]] auto new_capture_group_start_state(Tag const* tag) -> NFAStateType*; - /** * Creates a unique_ptr for an NFA state with a negative tagged transition and adds it to * `m_states`. @@ -65,6 +58,19 @@ class RegexNFA { NFAStateType const* dest_state ) -> NFAStateType*; + /** + * Add two NFA states for a capture group: + * 1. A start state: `m_root` --(start `tag`)--> start_state. + * 2. An end state: end_state --(end `tag`)--> `dest_state`. + * @param tag + * @param dest_state + * @return std::pair + */ + [[nodiscard]] auto new_capture_group_start_states( + Tag const* tag, + NFAStateType const* dest_state + ) -> std::pair; + /** * @return A vector representing the traversal order of the NFA states using breadth-first * search (BFS). @@ -107,13 +113,6 @@ auto RegexNFA::new_state() -> NFAStateType* { return m_states.back().get(); } -template -auto RegexNFA::new_capture_group_start_state(Tag const* tag) -> NFAStateType* { - auto* state = new_state(); - m_root->add_positive_tagged_start_transition(tag, state); - return state; -} - template auto RegexNFA::new_state_with_positive_tagged_end_transition( Tag const* tag, @@ -132,6 +131,19 @@ auto RegexNFA::new_state_with_negative_tagged_transition( return m_states.back().get(); } +template +auto RegexNFA::new_capture_group_start_states( + Tag const* tag, + NFAStateType const* dest_state +) -> std::pair { + auto* start_state = new_state(); + m_root->add_positive_tagged_start_transition(tag, start_state); + + auto* end_state = new_state_with_positive_tagged_transition(tag, dest_state); + + return {start_state, end_state}; +} + template auto RegexNFA::get_bfs_traversal_order() const -> std::vector { std::queue state_queue; From c757deda4859a787908c1e899213de3fcb423699 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 19 Nov 2024 18:30:40 -0500 Subject: [PATCH 217/323] Fix compiler error. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 95d18615..e9638f12 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -139,7 +139,7 @@ auto RegexNFA::new_capture_group_start_states( auto* start_state = new_state(); m_root->add_positive_tagged_start_transition(tag, start_state); - auto* end_state = new_state_with_positive_tagged_transition(tag, dest_state); + auto* end_state = new_state_with_positive_tagged_end_transition(tag, dest_state); return {start_state, end_state}; } From 2eb74772b267aabd72782dd7bd193418088ceb97 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 20 Nov 2024 04:31:47 -0500 Subject: [PATCH 218/323] Use singular for end transition getter function. --- src/log_surgeon/Lexer.tpp | 2 +- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- src/log_surgeon/finite_automata/RegexNFAState.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 45524ed3..8a8aeb33 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -411,7 +411,7 @@ auto Lexer::epsilon_closure(NFAStateType const* stat stack.push(positive_tagged_start_transition.get_dest_state()); } auto const& optional_positive_tagged_end_transition - = current_state->get_positive_tagged_end_transitions(); + = current_state->get_positive_tagged_end_transition(); if (optional_positive_tagged_end_transition.has_value()) { stack.push(optional_positive_tagged_end_transition.value().get_dest_state()); } diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index e9638f12..1dbd8810 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -180,7 +180,7 @@ auto 
RegexNFA::get_bfs_traversal_order() const -> std::vectorget_positive_tagged_end_transitions(); + = current_state->get_positive_tagged_end_transition(); if (optional_positive_tagged_end_transition.has_value()) { add_to_queue_and_visited(optional_positive_tagged_end_transition.value().get_dest_state( )); diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 5d440551..a28d35a5 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -59,7 +59,7 @@ class RegexNFAState { return m_positive_tagged_start_transitions; } - [[nodiscard]] auto get_positive_tagged_end_transitions( + [[nodiscard]] auto get_positive_tagged_end_transition( ) const -> std::optional> const& { return m_positive_tagged_end_transition; } From 08060ed2c241c9d9a91fe73ab300a7303385a5a7 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 20 Nov 2024 05:09:26 -0500 Subject: [PATCH 219/323] Void to auto -> void. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegexAST.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 440c3a89..56ff2123 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -892,10 +892,10 @@ template } template -void RegexASTCapture::add_to_nfa( +auto RegexASTCapture::add_to_nfa( RegexNFA* nfa, NFAStateType* dest_state -) const { +) const -> void { // root --(pos_tagged_start_transition)--> capture_group_start_state --> // [inner capture group NFA] --(neg_tagged_transition)--> neg_state --> // state_with_positive_tagged_end_transition --(pos_tagged_end_transition)--> end_state From 0c2c1d1df227c188200fb1c35ad6ebbed2549df6 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 20 Nov 2024 05:16:05 -0500 Subject: [PATCH 220/323] Update new_capture_group_start_states to new_capture_group_states to reflect functionality change. 
--- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 56ff2123..75d1b220 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -899,7 +899,7 @@ auto RegexASTCapture::add_to_nfa( // root --(pos_tagged_start_transition)--> capture_group_start_state --> // [inner capture group NFA] --(neg_tagged_transition)--> neg_state --> // state_with_positive_tagged_end_transition --(pos_tagged_end_transition)--> end_state - auto [start_state, end_state] = nfa->new_capture_group_start_states(m_tag.get(), dest_state); + auto [start_state, end_state] = nfa->new_capture_group_states(m_tag.get(), dest_state); auto* initial_root = nfa->get_root(); nfa->set_root(start_state); diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 1dbd8810..0fd2b9af 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -66,7 +66,7 @@ class RegexNFA { * @param dest_state * @return std::pair */ - [[nodiscard]] auto new_capture_group_start_states( + [[nodiscard]] auto new_capture_group_states( Tag const* tag, NFAStateType const* dest_state ) -> std::pair; @@ -132,7 +132,7 @@ auto RegexNFA::new_state_with_negative_tagged_transition( } template -auto RegexNFA::new_capture_group_start_states( +auto RegexNFA::new_capture_group_states( Tag const* tag, NFAStateType const* dest_state ) -> std::pair { From b0b951a57ad218a1261c15093cac1d6c701b9506 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 20 Nov 2024 05:17:17 -0500 Subject: [PATCH 221/323] Linter. 
--- src/log_surgeon/finite_automata/RegexNFA.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 0fd2b9af..05ba6cb9 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -66,10 +66,8 @@ class RegexNFA { * @param dest_state * @return std::pair */ - [[nodiscard]] auto new_capture_group_states( - Tag const* tag, - NFAStateType const* dest_state - ) -> std::pair; + [[nodiscard]] auto new_capture_group_states(Tag const* tag, NFAStateType const* dest_state) + -> std::pair; /** * @return A vector representing the traversal order of the NFA states using breadth-first From 3c2a2abde7ee23b29e0c7b471bbcee9ee4c7c4c3 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 20 Nov 2024 15:11:45 -0500 Subject: [PATCH 222/323] Update docstring for . --- src/log_surgeon/finite_automata/RegexNFA.hpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 05ba6cb9..74847c04 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -59,12 +59,12 @@ class RegexNFA { ) -> NFAStateType*; /** - * Add two NFA states for a capture group: - * 1. A start state: `m_root` --(start `tag`)--> start_state. - * 2. An end state: end_state --(end `tag`)--> `dest_state`. - * @param tag + * Creates the start and end states for a capture group. + * @param tag The tag associated with the capture group. * @param dest_state - * @return std::pair + * @return A pair of states: + * - A new state with a positive tagged start transition from `m_root`. + * - A new state with a positive tagged end transition to `dest_state`. 
*/ [[nodiscard]] auto new_capture_group_states(Tag const* tag, NFAStateType const* dest_state) -> std::pair; @@ -138,7 +138,6 @@ auto RegexNFA::new_capture_group_states( m_root->add_positive_tagged_start_transition(tag, start_state); auto* end_state = new_state_with_positive_tagged_end_transition(tag, dest_state); - return {start_state, end_state}; } From 98c5b95db954405945794701213d5d37a90dac10 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 20 Nov 2024 16:53:04 -0500 Subject: [PATCH 223/323] Rename to new_start_and_end_states_with_positively_tagged_transitions. --- src/log_surgeon/finite_automata/RegexAST.hpp | 6 +++++- src/log_surgeon/finite_automata/RegexNFA.hpp | 8 +++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 75d1b220..6247c558 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -899,7 +899,11 @@ auto RegexASTCapture::add_to_nfa( // root --(pos_tagged_start_transition)--> capture_group_start_state --> // [inner capture group NFA] --(neg_tagged_transition)--> neg_state --> // state_with_positive_tagged_end_transition --(pos_tagged_end_transition)--> end_state - auto [start_state, end_state] = nfa->new_capture_group_states(m_tag.get(), dest_state); + auto [start_state, end_state] + = nfa->new_start_and_end_states_with_positively_tagged_transitions( + m_tag.get(), + dest_state + ); auto* initial_root = nfa->get_root(); nfa->set_root(start_state); diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 74847c04..0e425a64 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -66,8 +66,10 @@ class RegexNFA { * - A new state with a positive tagged start transition from `m_root`. * - A new state with a positive tagged end transition to `dest_state`. 
*/ - [[nodiscard]] auto new_capture_group_states(Tag const* tag, NFAStateType const* dest_state) - -> std::pair; + [[nodiscard]] auto new_start_and_end_states_with_positively_tagged_transitions( + Tag const* tag, + NFAStateType const* dest_state + ) -> std::pair; /** * @return A vector representing the traversal order of the NFA states using breadth-first @@ -130,7 +132,7 @@ auto RegexNFA::new_state_with_negative_tagged_transition( } template -auto RegexNFA::new_capture_group_states( +auto RegexNFA::new_start_and_end_states_with_positively_tagged_transitions( Tag const* tag, NFAStateType const* dest_state ) -> std::pair { From f59cf41ccf2b51fbadd8b47a18e2b16532d2d3f0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 20 Nov 2024 17:06:43 -0500 Subject: [PATCH 224/323] Rename to capture_X_state. --- src/log_surgeon/finite_automata/RegexAST.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 6247c558..3b432e96 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -899,15 +899,15 @@ auto RegexASTCapture::add_to_nfa( // root --(pos_tagged_start_transition)--> capture_group_start_state --> // [inner capture group NFA] --(neg_tagged_transition)--> neg_state --> // state_with_positive_tagged_end_transition --(pos_tagged_end_transition)--> end_state - auto [start_state, end_state] + auto [capture_start_state, capture_end_state] = nfa->new_start_and_end_states_with_positively_tagged_transitions( m_tag.get(), dest_state ); auto* initial_root = nfa->get_root(); - nfa->set_root(start_state); - m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, end_state); + nfa->set_root(capture_start_state); + m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, capture_end_state); nfa->set_root(initial_root); } From 85a2d69de00aee0bc8b009b658b879636cd8433b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: 
Wed, 20 Nov 2024 17:16:03 -0500 Subject: [PATCH 225/323] Update docstring. --- src/log_surgeon/finite_automata/RegexNFA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 0e425a64..7f6ebbaf 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -39,7 +39,7 @@ class RegexNFA { * `m_states`. * @param tag * @param dest_state - * @return NFAStateType* + * @return A new state with a positive tagged end transition to `dest_state`. */ [[nodiscard]] auto new_state_with_positive_tagged_end_transition( Tag const* tag, From 4c602d485027a7c03b912dc7ffe0b87b18024cf4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 20 Nov 2024 17:52:20 -0500 Subject: [PATCH 226/323] Updated diagram to match vars used in code. --- src/log_surgeon/finite_automata/RegexAST.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 3b432e96..b9a8329e 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -896,9 +896,8 @@ auto RegexASTCapture::add_to_nfa( RegexNFA* nfa, NFAStateType* dest_state ) const -> void { - // root --(pos_tagged_start_transition)--> capture_group_start_state --> - // [inner capture group NFA] --(neg_tagged_transition)--> neg_state --> - // state_with_positive_tagged_end_transition --(pos_tagged_end_transition)--> end_state + // root --(`m_tag` start)--> capture_start_state --> [`m_group_regex_ast` NFA] + // --(`m_negative_tags`)--> capture_end_state --(`m_tag` end)--> dest_state auto [capture_start_state, capture_end_state] = nfa->new_start_and_end_states_with_positively_tagged_transitions( m_tag.get(), From 2b0143334e5d00f74c7ef3d8f8efe00f91be2774 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 20 Nov 2024 17:56:25 -0500 
Subject: [PATCH 227/323] Rename vars to serialized_X. --- src/log_surgeon/finite_automata/RegexNFAState.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index a28d35a5..0fd59c50 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -187,26 +187,26 @@ auto RegexNFAState::serialize( epsilon_transitions.emplace_back(std::to_string(state_ids.at(dest_state))); } - std::vector positive_tagged_start_transition_strings; + std::vector serialized_positive_tagged_start_transitions; for (auto const& positive_tagged_start_transition : m_positive_tagged_start_transitions) { auto const optional_serialized_positive_start_transition = positive_tagged_start_transition.serialize(state_ids); if (false == optional_serialized_positive_start_transition.has_value()) { return std::nullopt; } - positive_tagged_start_transition_strings.emplace_back( + serialized_positive_tagged_start_transitions.emplace_back( optional_serialized_positive_start_transition.value() ); } - std::string positive_tagged_end_transition_string; + std::string serialized_positive_tagged_end_transition; if (m_positive_tagged_end_transition.has_value()) { auto const optional_serialized_positive_end_transition = m_positive_tagged_end_transition.value().serialize(state_ids); if (false == optional_serialized_positive_end_transition.has_value()) { return std::nullopt; } - positive_tagged_end_transition_string = optional_serialized_positive_end_transition.value(); + serialized_positive_tagged_end_transition = optional_serialized_positive_end_transition.value(); } std::string negative_tagged_transition_string; @@ -230,8 +230,8 @@ auto RegexNFAState::serialize( accepting_tag_string, fmt::join(byte_transitions, ","), fmt::join(epsilon_transitions, ","), - fmt::join(positive_tagged_start_transition_strings, ","), - 
positive_tagged_end_transition_string, + fmt::join(serialized_positive_tagged_start_transitions, ","), + serialized_positive_tagged_end_transition, negative_tagged_transition_string ); } From e37b29a33e685fa7b7c8bba6df9b139b03f4c932 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 20 Nov 2024 18:05:50 -0500 Subject: [PATCH 228/323] Run Linter. --- src/log_surgeon/finite_automata/RegexNFAState.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index 0fd59c50..8fce8cf7 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -206,7 +206,8 @@ auto RegexNFAState::serialize( if (false == optional_serialized_positive_end_transition.has_value()) { return std::nullopt; } - serialized_positive_tagged_end_transition = optional_serialized_positive_end_transition.value(); + serialized_positive_tagged_end_transition + = optional_serialized_positive_end_transition.value(); } std::string negative_tagged_transition_string; From c5beca321728c9a3885652450f3ca1183d105693 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 20 Nov 2024 18:20:20 -0500 Subject: [PATCH 229/323] Fix typo. 
--- src/log_surgeon/finite_automata/RegexAST.hpp | 2 +- src/log_surgeon/finite_automata/RegexNFA.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index b9a8329e..ef55071f 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -899,7 +899,7 @@ auto RegexASTCapture::add_to_nfa( // root --(`m_tag` start)--> capture_start_state --> [`m_group_regex_ast` NFA] // --(`m_negative_tags`)--> capture_end_state --(`m_tag` end)--> dest_state auto [capture_start_state, capture_end_state] - = nfa->new_start_and_end_states_with_positively_tagged_transitions( + = nfa->new_start_and_end_states_with_positive_tagged_transitions( m_tag.get(), dest_state ); diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 7f6ebbaf..ba9791b1 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -66,7 +66,7 @@ class RegexNFA { * - A new state with a positive tagged start transition from `m_root`. * - A new state with a positive tagged end transition to `dest_state`. */ - [[nodiscard]] auto new_start_and_end_states_with_positively_tagged_transitions( + [[nodiscard]] auto new_start_and_end_states_with_positive_tagged_transitions( Tag const* tag, NFAStateType const* dest_state ) -> std::pair; @@ -132,7 +132,7 @@ auto RegexNFA::new_state_with_negative_tagged_transition( } template -auto RegexNFA::new_start_and_end_states_with_positively_tagged_transitions( +auto RegexNFA::new_start_and_end_states_with_positive_tagged_transitions( Tag const* tag, NFAStateType const* dest_state ) -> std::pair { From fe4a7b33b378b1b07c1743fd3dcdc871303670d1 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 20 Nov 2024 18:51:09 -0500 Subject: [PATCH 230/323] Update diagram for capture group NFA. 
--- src/log_surgeon/finite_automata/RegexAST.hpp | 33 ++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index ef55071f..beeb588e 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -896,8 +896,37 @@ auto RegexASTCapture::add_to_nfa( RegexNFA* nfa, NFAStateType* dest_state ) const -> void { - // root --(`m_tag` start)--> capture_start_state --> [`m_group_regex_ast` NFA] - // --(`m_negative_tags`)--> capture_end_state --(`m_tag` end)--> dest_state + // TODO: move this into a documentation file in the future, and reference it here. + // The NFA constructed for a capture group follows the structure below, with tagged transitions + // explicitly labeled for clarity: + // +---------------------+ + // | `m_root` | + // +---------------------+ + // | `m_tag` start + // | (positive tagged start transition) + // v + // +---------------------+ + // |`capture_start_state`| + // +---------------------+ + // | + // | (epsilon transition) + // v + // +---------------------+ + // | `m_group_regex_ast` | + // | (nested NFA) | + // +---------------------+ + // | `m_negative_tags` + // | (negative tagged transition) + // v + // +---------------------+ + // | `capture_end_state` | + // +---------------------+ + // | `m_tag` end + // | (positive tagged end transition) + // v + // +---------------------+ + // | `dest_state` | + // +---------------------+ auto [capture_start_state, capture_end_state] = nfa->new_start_and_end_states_with_positive_tagged_transitions( m_tag.get(), From 0017512c882a3289ce842100d84b4362792a60a7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 26 Nov 2024 09:45:56 -0500 Subject: [PATCH 231/323] Add register unit-tests, add PrefixTree with unit-tests. 
--- CMakeLists.txt | 2 + .../finite_automata/PrefixTree.cpp | 21 ++++++ .../finite_automata/PrefixTree.hpp | 66 +++++++++++++++++++ src/log_surgeon/finite_automata/Register.hpp | 7 +- tests/CMakeLists.txt | 4 +- tests/test-prefix-tree.cpp | 36 ++++++++++ tests/test-register.cpp | 19 ++++++ 7 files changed, 150 insertions(+), 5 deletions(-) create mode 100644 src/log_surgeon/finite_automata/PrefixTree.cpp create mode 100644 src/log_surgeon/finite_automata/PrefixTree.hpp create mode 100644 tests/test-prefix-tree.cpp create mode 100644 tests/test-register.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 20326d33..93f59208 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,8 @@ set(SOURCE_FILES src/log_surgeon/SchemaParser.hpp src/log_surgeon/Token.cpp src/log_surgeon/Token.hpp + src/log_surgeon/finite_automata/PrefixTree.cpp + src/log_surgeon/finite_automata/PrefixTree.hpp src/log_surgeon/finite_automata/RegexAST.hpp src/log_surgeon/finite_automata/RegexDFA.hpp src/log_surgeon/finite_automata/RegexDFA.tpp diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp new file mode 100644 index 00000000..6b39b1bf --- /dev/null +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -0,0 +1,21 @@ +#include "PrefixTree.hpp" + +#include + +namespace log_surgeon::finite_automata { +[[nodiscard]] auto PrefixTree::get_reversed_positions(uint32_t const index +) const -> std::vector { + if (m_nodes.size() <= index) { + throw std::invalid_argument("Prefix tree index out-of-bounds."); + } + + std::vector reversed_positions; + auto current_index = index; + while(0 < current_index) { + auto const& current_node = m_nodes[current_index]; + reversed_positions.push_back(current_node.get_position()); + current_index = current_node.get_predecessor_index(); + } + return reversed_positions; +} +} // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp 
b/src/log_surgeon/finite_automata/PrefixTree.hpp new file mode 100644 index 00000000..dd6b1229 --- /dev/null +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -0,0 +1,66 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE +#define LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE + +#include +#include +#include +#include + +namespace log_surgeon::finite_automata { + +/** + * A prefix tree node helps a register represent a tag by storing the current position where a tag + * was matched in the lexxed string, as well as the index of the prefix tree node that stores the + * previous time the tag was matched. + * + * Note: m_position is -1 when a tag is + * unmatched. + */ +class PrefixTreeNode { +public: + PrefixTreeNode(uint32_t const predecessor_index, int32_t const position) + : m_predecessor_index(predecessor_index), + m_position(position) {} + + [[nodiscard]] auto get_predecessor_index() const -> uint32_t { return m_predecessor_index; } + + [[nodiscard]] auto get_position() const -> int32_t { return m_position; } + +private: + uint32_t m_predecessor_index; + int32_t m_position; +}; + +/** + * A prefix tree structure to store positions associated with registers. + * + * PrefixTree stores positions at nodes, and each node can represent a part of a position. + * Multiple positions can be stored at each index in the tree. The tree allows for the addition of + * positions and the retrieval of positions by their associated index. + */ +class PrefixTree { +public: + PrefixTree() : m_nodes{{0, -1}} {} + + /** + * @return The index of the newly inserted node in the tree. + */ + uint32_t insert(uint32_t const predecessor_index, int32_t const position) { + m_nodes.emplace_back(predecessor_index, position); + return m_nodes.size() - 1; + } + + /** + * @param index Representing the leaf node of the register's sub-tree. + * @return The positions, in reverse order, at which the register places the tag in the + * lexed string. 
+ */ + [[nodiscard]] auto get_reversed_positions(uint32_t index) const -> std::vector; + +private: + std::vector m_nodes; +}; + +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE diff --git a/src/log_surgeon/finite_automata/Register.hpp b/src/log_surgeon/finite_automata/Register.hpp index d0be4f15..fddb28eb 100644 --- a/src/log_surgeon/finite_automata/Register.hpp +++ b/src/log_surgeon/finite_automata/Register.hpp @@ -2,7 +2,8 @@ #define LOG_SURGEON_FINITE_AUTOMATA_REGISTER #include - +#include +#include #include namespace log_surgeon::finite_automata { @@ -20,9 +21,7 @@ class Register { [[nodiscard]] auto get_tag() const -> Tag* { return m_tag; } - [[nodiscard]] auto get_last_position() const -> uint32_t { return positions.back(); } - - [[nodiscard]] auto get_all_positions() const -> std::vector const& { + [[nodiscard]] auto get_positions() const -> std::vector const& { return positions; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e911ff58..669af769 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,6 +2,8 @@ set( SOURCES_LOG_SURGEON ../src/log_surgeon/FileReader.cpp ../src/log_surgeon/FileReader.hpp + ../src/log_surgeon/finite_automata/PrefixTree.cpp + ../src/log_surgeon/finite_automata/PrefixTree.hpp ../src/log_surgeon/finite_automata/RegexAST.hpp ../src/log_surgeon/finite_automata/RegexNFA.hpp ../src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -22,7 +24,7 @@ set( ../src/log_surgeon/Token.hpp ) -set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-tag.cpp) +set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-prefix-tree.cpp test-register.cpp test-tag.cpp) add_executable(unit-test ${SOURCES_LOG_SURGEON} ${SOURCES_TESTS}) target_link_libraries(unit-test PRIVATE Catch2::Catch2WithMain log_surgeon::log_surgeon) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp new file mode 100644 index 00000000..89d71003 --- /dev/null +++ b/tests/test-prefix-tree.cpp @@ -0,0 
+1,36 @@ +#include + +#include + +#include + +using log_surgeon::finite_automata::PrefixTree; + +TEST_CASE("Prefix tree operations", "[PrefixTree]") { + SECTION("Newly constructed tree works correctly") { + PrefixTree const tree; + + REQUIRE(tree.get_reversed_positions(0).empty()); + } + + SECTION("Adding nodes to the prefix tree works correctly") { + PrefixTree tree; + uint32_t index_1 = tree.insert(0, 4); + REQUIRE(std::vector({4}) == tree.get_reversed_positions(index_1)); + + uint32_t index_2 = tree.insert(index_1, 7); + REQUIRE(std::vector({7, 4}) == tree.get_reversed_positions(index_2)); + + uint32_t index_3 = tree.insert(index_2, 9); + REQUIRE(std::vector({9, 7, 4}) == tree.get_reversed_positions(index_3)); + } + + SECTION("Invalid index access throws correctly") { + PrefixTree tree; + REQUIRE_THROWS_AS(tree.get_reversed_positions(1), std::invalid_argument); + + tree.insert(0, 4); + REQUIRE_THROWS_AS(tree.get_reversed_positions(2), std::invalid_argument); + REQUIRE_THROWS_AS(tree.get_reversed_positions(3), std::invalid_argument); + } +} diff --git a/tests/test-register.cpp b/tests/test-register.cpp new file mode 100644 index 00000000..10b9c0a8 --- /dev/null +++ b/tests/test-register.cpp @@ -0,0 +1,19 @@ +#include + +#include + +#include +#include + +using log_surgeon::finite_automata::Register; +using log_surgeon::finite_automata::Tag; +using std::make_unique; +using std::unique_ptr; + +TEST_CASE("Register operations", "[Register]") { + SECTION("Basic tag retrieval works correctly") { + auto const tag = make_unique("uID"); + Register const reg(tag.get()); + REQUIRE(tag.get() == reg.get_tag()); + } +} From 336f2ae6e426b3bfff6e29617ab16f044b0880af Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 26 Nov 2024 11:04:42 -0500 Subject: [PATCH 232/323] Finished with initial register implementation. 
--- CMakeLists.txt | 2 +- .../finite_automata/PrefixTree.cpp | 2 +- .../finite_automata/PrefixTree.hpp | 20 +++- src/log_surgeon/finite_automata/Register.hpp | 34 ------ .../finite_automata/RegisterHandler.hpp | 112 ++++++++++++++++++ tests/CMakeLists.txt | 4 +- tests/test-prefix-tree.cpp | 22 +++- tests/test-register-handler.cpp | 66 +++++++++++ tests/test-register.cpp | 19 --- 9 files changed, 219 insertions(+), 62 deletions(-) delete mode 100644 src/log_surgeon/finite_automata/Register.hpp create mode 100644 src/log_surgeon/finite_automata/RegisterHandler.hpp create mode 100644 tests/test-register-handler.cpp delete mode 100644 tests/test-register.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 93f59208..117cde51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,7 +101,7 @@ set(SOURCE_FILES src/log_surgeon/finite_automata/RegexNFA.hpp src/log_surgeon/finite_automata/RegexNFAState.hpp src/log_surgeon/finite_automata/RegexNFAStateType.hpp - src/log_surgeon/finite_automata/Register.hpp + src/log_surgeon/finite_automata/RegisterHandler.hpp src/log_surgeon/finite_automata/Tag.hpp src/log_surgeon/finite_automata/TaggedTransition.hpp src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index 6b39b1bf..de52f5be 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -6,7 +6,7 @@ namespace log_surgeon::finite_automata { [[nodiscard]] auto PrefixTree::get_reversed_positions(uint32_t const index ) const -> std::vector { if (m_nodes.size() <= index) { - throw std::invalid_argument("Prefix tree index out-of-bounds."); + throw std::out_of_range("Prefix tree index out-of-bounds."); } std::vector reversed_positions; diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index dd6b1229..09adc915 100644 --- 
a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -19,11 +19,13 @@ namespace log_surgeon::finite_automata { class PrefixTreeNode { public: PrefixTreeNode(uint32_t const predecessor_index, int32_t const position) - : m_predecessor_index(predecessor_index), - m_position(position) {} + : m_predecessor_index{predecessor_index}, + m_position{position} {} [[nodiscard]] auto get_predecessor_index() const -> uint32_t { return m_predecessor_index; } + auto set_position(int32_t const position) -> void { m_position = position; } + [[nodiscard]] auto get_position() const -> int32_t { return m_position; } private: @@ -50,10 +52,24 @@ class PrefixTree { return m_nodes.size() - 1; } + /** + * @param index + * @param position + * @throw std::out_of_range("Prefix tree index out-of-bounds."); + */ + auto set(uint32_t const index, int32_t const position) -> void { + if (m_nodes.size() <= index) { + throw std::out_of_range("Prefix tree index out-of-bounds"); + } + + m_nodes[index].set_position(position); + } + /** * @param index Representing the leaf node of the register's sub-tree. * @return The positions, in reverse order, at which the register places the tag in the * lexed string. 
+ * @throw std::out_of_range("Prefix tree index out-of-bounds."); */ [[nodiscard]] auto get_reversed_positions(uint32_t index) const -> std::vector; diff --git a/src/log_surgeon/finite_automata/Register.hpp b/src/log_surgeon/finite_automata/Register.hpp deleted file mode 100644 index fddb28eb..00000000 --- a/src/log_surgeon/finite_automata/Register.hpp +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER -#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER - -#include -#include -#include -#include - -namespace log_surgeon::finite_automata { -class Register { -public: - explicit Register(Tag* tag) : m_tag{tag} {} - - auto add_pos(uint32_t const pos) -> void { positions.push_back(pos); } - - auto update_last_position(uint32_t const pos) -> void { positions.back() = pos; } - - auto negate_last_position() -> void { positions.pop_back(); } - - auto negate_all_positions() -> void { positions.clear(); } - - [[nodiscard]] auto get_tag() const -> Tag* { return m_tag; } - - [[nodiscard]] auto get_positions() const -> std::vector const& { - return positions; - } - -private: - Tag* m_tag; - std::vector positions; -}; -} // namespace log_surgeon::finite_automata - -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp new file mode 100644 index 00000000..52d464a8 --- /dev/null +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -0,0 +1,112 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER +#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER + +#include +#include +#include + +#include + +namespace log_surgeon::finite_automata { +/** + * A register stores an index in the prefix tree. The index node fully represents the register's + * history. + * + * Note: history refers to the previous tag locations. E.g., given the tagged regex "aaa(1\d2)+", + * after parsing input string "aaa123", a register representing tag 1 would contain the history + * {3,4,5}. 
+ */ +class Register { +public: + explicit Register(uint32_t const index) : m_index{index} {} + + auto set_index(uint32_t const index) -> void { m_index = index; } + + [[nodiscard]] auto get_index() const -> uint32_t { return m_index; } + +private: + uint32_t m_index; +}; + +/** + * The register handler maintains a prefix tree that is sufficient to reperesent all registers. + * The register handler also contains a vector of registers, and performs the set, copy, and append + * operations for these registers. + * + * Note: for efficiency these registers may be re-used, but are not required to be re-initialized. + * It is the responsibility of the DFA to set the register value when needed. + */ +class RegisterHandler { +public: + void add_register(uint32_t const predecessor_index, int32_t const position) { + auto const index = prefix_tree.insert(predecessor_index, position); + m_registers.emplace_back(index); + } + + /** + * + * @param register_index + * @param position + * @throws std::out_of_range("Register index out-of-bounds") + */ + void set_register(uint32_t const register_index, int32_t const position) { + if (m_registers.size() <= register_index) { + throw std::out_of_range("Register index out-of-bounds"); + } + + auto const tree_index = m_registers[register_index].get_index(); + prefix_tree.set(tree_index, position); + } + + /** + * @param dest_register_index + * @param source_register_index + * @throws std::out_of_range("Register index out-of-bounds") + */ + void copy_register(uint32_t const dest_register_index, uint32_t const source_register_index) { + if (m_registers.size() <= source_register_index + || m_registers.size() <= dest_register_index) + { + throw std::out_of_range("Register index out of range"); + } + + m_registers[dest_register_index] = m_registers[source_register_index]; + } + + /** + * @param register_index + * @param position + * @throws std::out_of_range("Register index out-of-bounds") + */ + void append_position(uint32_t register_index, 
int32_t position) { + if (register_index >= m_registers.size()) { + throw std::out_of_range("Register index out of range"); + } + + uint32_t const tree_index = m_registers[register_index].get_index(); + auto const new_index = prefix_tree.insert(tree_index, position); + m_registers[register_index].set_index(new_index); + } + + /** + * @param register_index + * @return Vector of positions representing the history of the given register. + * @throws std::out_of_range("Register index out-of-bounds") + + */ + [[nodiscard]] auto get_reversed_positions(uint32_t const register_index) const -> std::vector { + if (register_index >= m_registers.size()) { + throw std::out_of_range("Register index out of range"); + } + + uint32_t const tree_index = m_registers[register_index].get_index(); + return prefix_tree.get_reversed_positions(tree_index); + } + +private: + PrefixTree prefix_tree; + std::vector m_registers; +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 669af769..ec974e6b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,7 +8,7 @@ set( ../src/log_surgeon/finite_automata/RegexNFA.hpp ../src/log_surgeon/finite_automata/RegexNFAState.hpp ../src/log_surgeon/finite_automata/RegexNFAStateType.hpp - ../src/log_surgeon/finite_automata/Register.hpp + ../src/log_surgeon/finite_automata/RegisterHandler.hpp ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp ../src/log_surgeon/LALR1Parser.cpp @@ -24,7 +24,7 @@ set( ../src/log_surgeon/Token.hpp ) -set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-prefix-tree.cpp test-register.cpp test-tag.cpp) +set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-prefix-tree.cpp test-register-handler.cpp test-tag.cpp) add_executable(unit-test ${SOURCES_LOG_SURGEON} ${SOURCES_TESTS}) target_link_libraries(unit-test PRIVATE Catch2::Catch2WithMain log_surgeon::log_surgeon) diff 
--git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 89d71003..5d143f97 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -27,10 +27,26 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { SECTION("Invalid index access throws correctly") { PrefixTree tree; - REQUIRE_THROWS_AS(tree.get_reversed_positions(1), std::invalid_argument); + REQUIRE_THROWS_AS(tree.get_reversed_positions(1), std::out_of_range); tree.insert(0, 4); - REQUIRE_THROWS_AS(tree.get_reversed_positions(2), std::invalid_argument); - REQUIRE_THROWS_AS(tree.get_reversed_positions(3), std::invalid_argument); + REQUIRE_THROWS_AS(tree.get_reversed_positions(2), std::out_of_range); + REQUIRE_THROWS_AS(tree.get_reversed_positions(3), std::out_of_range); + } + + SECTION("Set position for a valid index works correctly") { + PrefixTree tree; + uint32_t index_1 = tree.insert(0, 4); + tree.set(index_1, 10); + REQUIRE(tree.get_reversed_positions(index_1) == std::vector({10})); + + uint32_t index_2 = tree.insert(index_1, 7); + tree.set(index_2, 12); + REQUIRE(tree.get_reversed_positions(index_2) == std::vector({12, 10})); + } + + SECTION("Set position for an invalid index throws correctly") { + PrefixTree tree; + REQUIRE_THROWS_AS(tree.set(100, 20), std::out_of_range); } } diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp new file mode 100644 index 00000000..d5b6d75a --- /dev/null +++ b/tests/test-register-handler.cpp @@ -0,0 +1,66 @@ +#include + +#include + +#include +#include + +using log_surgeon::finite_automata::Register; +using log_surgeon::finite_automata::RegisterHandler; +using log_surgeon::finite_automata::Tag; +using std::make_unique; +using std::unique_ptr; + +TEST_CASE("Register operations", "[Register]") { + SECTION("Register constructor and getter initializes correctly") { + Register const reg(5); + REQUIRE(reg.get_index() == 5); + } + + SECTION("Register sets index correctly") { + Register reg(5); + reg.set_index(10); + 
REQUIRE(reg.get_index() == 10); + } +} + +TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { + RegisterHandler handler; + + // This example will have 5 registers each be the next's predecessor. Example tagged regex that + // may lead to this would be "(((((1a)+a)+a)+a)+a)+", whereeach regex represents 1 at a + // different layer of repetition. + constexpr uint32_t num_registers = 5; + for (uint32_t i = 0; i < num_registers; i++) { + handler.add_register(i, 0); + } + + SECTION("Set register position correctly") { + handler.set_register(0, 5); + REQUIRE(std::vector{{5}} == handler.get_reversed_positions(0)); + handler.set_register(0, 10); + REQUIRE(std::vector{{10}} == handler.get_reversed_positions(0)); + handler.set_register(1, 15); + REQUIRE(std::vector{{15, 10}} == handler.get_reversed_positions(1)); + } + + SECTION("Copy register index correctly") { + handler.set_register(0, 5); + handler.copy_register(1, 0); + REQUIRE(std::vector{{5}} == handler.get_reversed_positions(1)); + } + + SECTION("append_position appends position correctly") { + handler.set_register(0, 5); + handler.append_position(0, 7); + REQUIRE(std::vector{{7, 5}} == handler.get_reversed_positions(0)); + } + + SECTION("Throws out-of-bounds correctly") { + REQUIRE_THROWS_AS(handler.set_register(10, 5), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(10, 1), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(0, 10), std::out_of_range); + REQUIRE_THROWS_AS(handler.append_position(10, 5), std::out_of_range); + REQUIRE_THROWS_AS(handler.get_reversed_positions(10), std::out_of_range); + } +} diff --git a/tests/test-register.cpp b/tests/test-register.cpp deleted file mode 100644 index 10b9c0a8..00000000 --- a/tests/test-register.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include - -#include - -#include -#include - -using log_surgeon::finite_automata::Register; -using log_surgeon::finite_automata::Tag; -using std::make_unique; -using std::unique_ptr; - -TEST_CASE("Register 
operations", "[Register]") { - SECTION("Basic tag retrieval works correctly") { - auto const tag = make_unique("uID"); - Register const reg(tag.get()); - REQUIRE(tag.get() == reg.get_tag()); - } -} From 3449df26b2a3f4f87555c5693d9d190302876c29 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 26 Nov 2024 11:11:52 -0500 Subject: [PATCH 233/323] Linter. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 09adc915..986d4a83 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -53,7 +53,7 @@ class PrefixTree { } /** - * @param index + * @param index * @param position * @throw std::out_of_range("Prefix tree index out-of-bounds."); */ @@ -61,7 +61,7 @@ class PrefixTree { if (m_nodes.size() <= index) { throw std::out_of_range("Prefix tree index out-of-bounds"); } - + m_nodes[index].set_position(position); } From ef62df17a382ab352f29877bef795a1cc9568c2e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 26 Nov 2024 11:13:38 -0500 Subject: [PATCH 234/323] Linter. 
--- src/log_surgeon/finite_automata/PrefixTree.cpp | 2 +- src/log_surgeon/finite_automata/RegisterHandler.hpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index de52f5be..b3296953 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -11,7 +11,7 @@ namespace log_surgeon::finite_automata { std::vector reversed_positions; auto current_index = index; - while(0 < current_index) { + while (0 < current_index) { auto const& current_node = m_nodes[current_index]; reversed_positions.push_back(current_node.get_position()); current_index = current_node.get_predecessor_index(); diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 52d464a8..7ddfa573 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -94,7 +94,8 @@ class RegisterHandler { * @throws std::out_of_range("Register index out-of-bounds") */ - [[nodiscard]] auto get_reversed_positions(uint32_t const register_index) const -> std::vector { + [[nodiscard]] auto get_reversed_positions(uint32_t const register_index + ) const -> std::vector { if (register_index >= m_registers.size()) { throw std::out_of_range("Register index out of range"); } From a0856501b4eaad56af4c56f432aeadaef6ec53d5 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:04:15 -0500 Subject: [PATCH 235/323] Docstring fixes. 
--- src/log_surgeon/finite_automata/PrefixTree.hpp | 9 +++++---- src/log_surgeon/finite_automata/RegisterHandler.hpp | 8 ++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 986d4a83..2698fed8 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -66,10 +66,11 @@ class PrefixTree { } /** - * @param index Representing the leaf node of the register's sub-tree. - * @return The positions, in reverse order, at which the register places the tag in the - * lexed string. - * @throw std::out_of_range("Prefix tree index out-of-bounds."); + * Retrieves a vector of positions in reverse order by traversing from the given index to the + * root. + * @param index The index of the node to start the tarversal from. + * @return A vector containing positions in reverse order from the given index to root. + * @throw std::out_of_range if the index is out of bounds */ [[nodiscard]] auto get_reversed_positions(uint32_t index) const -> std::vector; diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 7ddfa573..fdbd9052 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -47,7 +47,7 @@ class RegisterHandler { * * @param register_index * @param position - * @throws std::out_of_range("Register index out-of-bounds") + * @throw std::out_of_range if the register index is out of bounds */ void set_register(uint32_t const register_index, int32_t const position) { if (m_registers.size() <= register_index) { @@ -61,7 +61,7 @@ class RegisterHandler { /** * @param dest_register_index * @param source_register_index - * @throws std::out_of_range("Register index out-of-bounds") + * @throw std::out_of_range if the register index is out of bounds */ void copy_register(uint32_t const 
dest_register_index, uint32_t const source_register_index) { if (m_registers.size() <= source_register_index @@ -76,7 +76,7 @@ class RegisterHandler { /** * @param register_index * @param position - * @throws std::out_of_range("Register index out-of-bounds") + * @throw std::out_of_range if the register index is out of bounds */ void append_position(uint32_t register_index, int32_t position) { if (register_index >= m_registers.size()) { @@ -91,7 +91,7 @@ class RegisterHandler { /** * @param register_index * @return Vector of positions representing the history of the given register. - * @throws std::out_of_range("Register index out-of-bounds") + * @throw std::out_of_range if the register index is out of bounds */ [[nodiscard]] auto get_reversed_positions(uint32_t const register_index From 2be06c0b47a95803e33fb7bd16bf22b90014dfd9 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:07:07 -0500 Subject: [PATCH 236/323] Add boundry test case. --- tests/test-prefix-tree.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 5d143f97..cf5b9304 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -32,6 +32,11 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { tree.insert(0, 4); REQUIRE_THROWS_AS(tree.get_reversed_positions(2), std::out_of_range); REQUIRE_THROWS_AS(tree.get_reversed_positions(3), std::out_of_range); + + REQUIRE_THROWS_AS( + tree.get_reversed_positions(std::numeric_limits::max()), + std::out_of_range + ); } SECTION("Set position for a valid index works correctly") { From 9ec01dd8478d2972a5b88e13752afd57819004e1 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:17:39 -0500 Subject: [PATCH 237/323] Improve test cases for setting positions in prefix tree. 
--- tests/test-prefix-tree.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index cf5b9304..415a252d 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -41,13 +41,25 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { SECTION("Set position for a valid index works correctly") { PrefixTree tree; - uint32_t index_1 = tree.insert(0, 4); - tree.set(index_1, 10); - REQUIRE(tree.get_reversed_positions(index_1) == std::vector({10})); + // Test updates to different nodes + uint32_t index_1 = tree.insert(0, 4); uint32_t index_2 = tree.insert(index_1, 7); + tree.set(index_1, 10); tree.set(index_2, 12); + REQUIRE(tree.get_reversed_positions(index_1) == std::vector({10})); REQUIRE(tree.get_reversed_positions(index_2) == std::vector({12, 10})); + + // Test multiple updates to the same node + tree.set(index_2, 15); + tree.set(index_2, 20); + REQUIRE(tree.get_reversed_positions(index_2) == std::vector({20, 10})); + + // Test that updates don't affect unrelated paths + uint32_t index_3 = tree.insert(0, 30); + tree.set(index_3, 25); + REQUIRE(tree.get_reversed_positions(index_1) == std::vector({10})); + REQUIRE(tree.get_reversed_positions(index_2) == std::vector({20, 10})); } SECTION("Set position for an invalid index throws correctly") { From 019e675648953df85cac9fed297baa1ea360a33b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:21:05 -0500 Subject: [PATCH 238/323] Improve test cases for setting invalid positions in prefix tree. 
--- tests/test-prefix-tree.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 415a252d..6b97f9a7 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -64,6 +64,12 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { SECTION("Set position for an invalid index throws correctly") { PrefixTree tree; + + // Test setting position before any insertions REQUIRE_THROWS_AS(tree.set(100, 20), std::out_of_range); + + // Test setting position just beyond valid range + uint32_t index_1 = tree.insert(0, 4); + REQUIRE_THROWS_AS(tree.set(index_1 + 1, 20), std::out_of_range); } } From 83a411a4dc807b58bd637dc788a086d6ba753b15 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:22:44 -0500 Subject: [PATCH 239/323] Remove confusing description; Remove unused include. --- tests/test-register-handler.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index d5b6d75a..9d691c84 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -3,11 +3,9 @@ #include #include -#include using log_surgeon::finite_automata::Register; using log_surgeon::finite_automata::RegisterHandler; -using log_surgeon::finite_automata::Tag; using std::make_unique; using std::unique_ptr; @@ -27,9 +25,6 @@ TEST_CASE("Register operations", "[Register]") { TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { RegisterHandler handler; - // This example will have 5 registers each be the next's predecessor. Example tagged regex that - // may lead to this would be "(((((1a)+a)+a)+a)+a)+", whereeach regex represents 1 at a - // different layer of repetition. 
constexpr uint32_t num_registers = 5; for (uint32_t i = 0; i < num_registers; i++) { handler.add_register(i, 0); From c88fbb59f5e715420731b5bb62972a2046516ff9 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:27:00 -0500 Subject: [PATCH 240/323] Add edge case test to register unit-tests. --- tests/test-register-handler.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 9d691c84..16be1217 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -20,6 +20,14 @@ TEST_CASE("Register operations", "[Register]") { reg.set_index(10); REQUIRE(reg.get_index() == 10); } + + SECTION("Register handles edge cases correctly") { + Register reg(-1); + REQUIRE(reg.get_index() == -1); + + reg.set_index(std::numeric_limits::max()); + REQUIRE(reg.get_index() == std::numeric_limits::max()); + } } TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { From 7c91ddc4c925cd87b922cda9a1a6fbb6c787712d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:27:26 -0500 Subject: [PATCH 241/323] Update docstring for PrefixTreeNode. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 2698fed8..4c3ee8af 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -13,8 +13,7 @@ namespace log_surgeon::finite_automata { * was matched in the lexxed string, as well as the index of the prefix tree node that stores the * previous time the tag was matched. * - * Note: m_position is -1 when a tag is - * unmatched. + * Note: m_position is -1 to indicate that a tag is currently unmatched in the lexed string. 
*/ class PrefixTreeNode { public: From 4c507695bfa49ab29a38774eac9e80429780d935 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:39:30 -0500 Subject: [PATCH 242/323] Add comments to test-case; Add new test case for setting root value. --- tests/test-prefix-tree.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 6b97f9a7..c92f21da 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -10,6 +10,7 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { SECTION("Newly constructed tree works correctly") { PrefixTree const tree; + // A newly constructed tree should return no positions as the root node is ignored REQUIRE(tree.get_reversed_positions(0).empty()); } @@ -41,6 +42,8 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { SECTION("Set position for a valid index works correctly") { PrefixTree tree; + // Test that you can set the root node for sanity, although this value is not used + tree.set(0, 10); // Test updates to different nodes uint32_t index_1 = tree.insert(0, 4); From 98200b47702c9992fbcd75a1b29da8d283464c00 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:46:40 -0500 Subject: [PATCH 243/323] Update docstring to make it clear that any negative value of m_position is for unmatched tags. This makes it better defined what any assigned value means. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 4c3ee8af..46870409 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -13,7 +13,7 @@ namespace log_surgeon::finite_automata { * was matched in the lexxed string, as well as the index of the prefix tree node that stores the * previous time the tag was matched. 
* - * Note: m_position is -1 to indicate that a tag is currently unmatched in the lexed string. + * Note: m_position < 0 indicates that a tag is currently unmatched in the lexed string. */ class PrefixTreeNode { public: From afaf01aa8b2e9a66c27fa33cd3757f3e8c0c5ed0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:52:46 -0500 Subject: [PATCH 244/323] Fix header guard. --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index fdbd9052..c9fbe753 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -1,5 +1,5 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER -#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER +#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER #include #include @@ -110,4 +110,4 @@ class RegisterHandler { }; } // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER From 8dea4769881e729870c9de74fec733927d38acfb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:53:09 -0500 Subject: [PATCH 245/323] Fix typo. --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index c9fbe753..cf785646 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -29,7 +29,7 @@ class Register { }; /** - * The register handler maintains a prefix tree that is sufficient to reperesent all registers. + * The register handler maintains a prefix tree that is sufficient to represent all registers.
* The register handler also contains a vector of registers, and performs the set, copy, and append * operations for these registers. * From dbb1e164bba46251e5f9ed8e72428a069fd06077 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:53:49 -0500 Subject: [PATCH 246/323] Remove newline in docstring. --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index cf785646..b7880e9d 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -44,7 +44,6 @@ class RegisterHandler { } /** - * * @param register_index * @param position * @throw std::out_of_range if the register index is out of bounds From e0548255ec8f2d245af760268d9e0b21d91d4067 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 15:54:51 -0500 Subject: [PATCH 247/323] Improve throw consistency. --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index b7880e9d..28f9960c 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -50,7 +50,7 @@ class RegisterHandler { */ void set_register(uint32_t const register_index, int32_t const position) { if (m_registers.size() <= register_index) { - throw std::out_of_range("Register index out-of-bounds"); + throw std::out_of_range("Register index out of range"); } auto const tree_index = m_registers[register_index].get_index(); From 792ce9618c90f0015b1b5f17d741ed5da42adee6 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 16:01:55 -0500 Subject: [PATCH 248/323] Update prefix tree insertion test cases. 
--- tests/test-prefix-tree.cpp | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index c92f21da..ef2a1882 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -14,16 +14,26 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { REQUIRE(tree.get_reversed_positions(0).empty()); } - SECTION("Adding nodes to the prefix tree works correctly") { + SECTION("Inserting nodes into the prefix tree works correctly") { PrefixTree tree; - uint32_t index_1 = tree.insert(0, 4); - REQUIRE(std::vector({4}) == tree.get_reversed_positions(index_1)); + // Test basic insertions + uint32_t index_1 = tree.insert(0, 4); uint32_t index_2 = tree.insert(index_1, 7); - REQUIRE(std::vector({7, 4}) == tree.get_reversed_positions(index_2)); - uint32_t index_3 = tree.insert(index_2, 9); - REQUIRE(std::vector({9, 7, 4}) == tree.get_reversed_positions(index_3)); + REQUIRE(std::vector{4} == tree.get_reversed_positions(index_1)); + REQUIRE(std::vector{7, 4} == tree.get_reversed_positions(index_2)); + REQUIRE(std::vector{9, 7, 4} == tree.get_reversed_positions(index_3)); + + // Test insertion with large position values + uint32_t index_4 = tree.insert(0, std::numeric_limits::max()); + REQUIRE(std::numeric_limits::max() == tree.get_reversed_positions(index_4)[0]); + + // Test insertion with negative position values + uint32_t index_5 = tree.insert(0, -1); + uint32_t index_6 = tree.insert(index_5, -100); + REQUIRE(std::vector{-1} == tree.get_reversed_positions(index_5)); + REQUIRE(std::vector{-1, -100} == tree.get_reversed_positions(index_6)); } SECTION("Invalid index access throws correctly") { From cab6e811f06c4d5ed5f3ffb0a9d3b203b888f3b5 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 16:02:20 -0500 Subject: [PATCH 249/323] Fix test case. 
--- tests/test-prefix-tree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index ef2a1882..2d8822dc 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -33,7 +33,7 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { uint32_t index_5 = tree.insert(0, -1); uint32_t index_6 = tree.insert(index_5, -100); REQUIRE(std::vector{-1} == tree.get_reversed_positions(index_5)); - REQUIRE(std::vector{-1, -100} == tree.get_reversed_positions(index_6)); + REQUIRE(std::vector{-100, -1} == tree.get_reversed_positions(index_6)); } SECTION("Invalid index access throws correctly") { From ffda5e64e25c381161f28a16667f6c5271685ac7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 16:21:34 -0500 Subject: [PATCH 250/323] Fix @throws docstring for consistency; Improve insert() docstring. --- src/log_surgeon/finite_automata/PrefixTree.cpp | 2 +- src/log_surgeon/finite_automata/PrefixTree.hpp | 15 +++++++++++---- .../finite_automata/RegisterHandler.hpp | 8 ++++---- tests/test-register-handler.cpp | 2 +- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index b3296953..84feccc5 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -6,7 +6,7 @@ namespace log_surgeon::finite_automata { [[nodiscard]] auto PrefixTree::get_reversed_positions(uint32_t const index ) const -> std::vector { if (m_nodes.size() <= index) { - throw std::out_of_range("Prefix tree index out-of-bounds."); + throw std::out_of_range("Prefix tree index out of range"); } std::vector reversed_positions; diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 46870409..66866901 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++
b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -44,9 +44,16 @@ class PrefixTree { PrefixTree() : m_nodes{{0, -1}} {} /** - * @return The index of the newly inserted node in the tree. + * @param predecessor_index Index of the inserted node's predecessor in the prefix tree + * @param position The position in the lexed string + * @return The index of the newly inserted node in the tree + * @throw std::out_of_range if the predecessor index is out of range */ uint32_t insert(uint32_t const predecessor_index, int32_t const position) { + if (m_nodes.size() <= predecessor_index) { + throw std::out_of_range("Predecessor index out of range"); + } + m_nodes.emplace_back(predecessor_index, position); return m_nodes.size() - 1; } @@ -54,11 +61,11 @@ class PrefixTree { /** * @param index * @param position - * @throw std::out_of_range("Prefix tree index out-of-bounds."); + * @throw std::out_of_range if prefix tree index is out of range */ auto set(uint32_t const index, int32_t const position) -> void { if (m_nodes.size() <= index) { - throw std::out_of_range("Prefix tree index out-of-bounds"); + throw std::out_of_range("Prefix tree index out of range"); } m_nodes[index].set_position(position); @@ -69,7 +76,7 @@ class PrefixTree { * root. * @param index The index of the node to start the tarversal from. * @return A vector containing positions in reverse order from the given index to root. 
- * @throw std::out_of_range if the index is out of bounds + * @throw std::out_of_range if the index is out of range */ [[nodiscard]] auto get_reversed_positions(uint32_t index) const -> std::vector; diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 28f9960c..1c52fe2c 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -46,7 +46,7 @@ class RegisterHandler { /** * @param register_index * @param position - * @throw std::out_of_range if the register index is out of bounds + * @throw std::out_of_range if the register index is out of range */ void set_register(uint32_t const register_index, int32_t const position) { if (m_registers.size() <= register_index) { @@ -60,7 +60,7 @@ class RegisterHandler { /** * @param dest_register_index * @param source_register_index - * @throw std::out_of_range if the register index is out of bounds + * @throw std::out_of_range if the register index is out of range */ void copy_register(uint32_t const dest_register_index, uint32_t const source_register_index) { if (m_registers.size() <= source_register_index @@ -75,7 +75,7 @@ class RegisterHandler { /** * @param register_index * @param position - * @throw std::out_of_range if the register index is out of bounds + * @throw std::out_of_range if the register index is out of range */ void append_position(uint32_t register_index, int32_t position) { if (register_index >= m_registers.size()) { @@ -90,7 +90,7 @@ class RegisterHandler { /** * @param register_index * @return Vector of positions representing the history of the given register. 
- * @throw std::out_of_range if the register index is out of bounds + * @throw std::out_of_range if the register index is out of range */ [[nodiscard]] auto get_reversed_positions(uint32_t const register_index diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 16be1217..7d9db03f 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -59,7 +59,7 @@ TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { REQUIRE(std::vector{{7, 5}} == handler.get_reversed_positions(0)); } - SECTION("Throws out-of-bounds correctly") { + SECTION("Throws out of range correctly") { REQUIRE_THROWS_AS(handler.set_register(10, 5), std::out_of_range); REQUIRE_THROWS_AS(handler.copy_register(10, 1), std::out_of_range); REQUIRE_THROWS_AS(handler.copy_register(0, 10), std::out_of_range); From ff1167224b1d6deab572519b108b52b0d9de1330 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 16:28:35 -0500 Subject: [PATCH 251/323] Improve register handler test coverage. 
--- tests/test-register-handler.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 7d9db03f..2876f8d2 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -33,6 +33,10 @@ TEST_CASE("Register operations", "[Register]") { TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { RegisterHandler handler; + SECTION("Initial state is empty") { + REQUIRE_THROWS_AS(handler.get_reversed_positions(0), std::out_of_range); + } + constexpr uint32_t num_registers = 5; for (uint32_t i = 0; i < num_registers; i++) { handler.add_register(i, 0); @@ -40,11 +44,16 @@ TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { SECTION("Set register position correctly") { handler.set_register(0, 5); - REQUIRE(std::vector{{5}} == handler.get_reversed_positions(0)); - handler.set_register(0, 10); - REQUIRE(std::vector{{10}} == handler.get_reversed_positions(0)); - handler.set_register(1, 15); - REQUIRE(std::vector{{15, 10}} == handler.get_reversed_positions(1)); + REQUIRE(std::vector{5} == handler.get_reversed_positions(0)); + } + + SECTION("Register relationships are maintained") { + handler.set_register(0, 5); + handler.set_register(1, 10); + handler.set_register(2, 15); + + auto positions = handler.get_reversed_positions(2); + REQUIRE(std::vector{15, 10, 5} == handler.get_reversed_positions(2)); } SECTION("Copy register index correctly") { From 536b50b3a31c2dcc987889e04178f6340e6291fa Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 16:43:47 -0500 Subject: [PATCH 252/323] Fix == ordering in test-cases; Fix vector initialization to remove redundant braces. 
--- tests/test-prefix-tree.cpp | 10 +++++----- tests/test-register-handler.cpp | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 2d8822dc..4207eec3 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -60,19 +60,19 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { uint32_t index_2 = tree.insert(index_1, 7); tree.set(index_1, 10); tree.set(index_2, 12); - REQUIRE(tree.get_reversed_positions(index_1) == std::vector({10})); - REQUIRE(tree.get_reversed_positions(index_2) == std::vector({12, 10})); + REQUIRE(std::vector{10} == tree.get_reversed_positions(index_1)); + REQUIRE(std::vector{12, 10} == tree.get_reversed_positions(index_2)); // Test multiple updates to the same node tree.set(index_2, 15); tree.set(index_2, 20); - REQUIRE(tree.get_reversed_positions(index_2) == std::vector({20, 10})); + REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(index_2)); // Test that updates don't affect unrelated paths uint32_t index_3 = tree.insert(0, 30); tree.set(index_3, 25); - REQUIRE(tree.get_reversed_positions(index_1) == std::vector({10})); - REQUIRE(tree.get_reversed_positions(index_2) == std::vector({20, 10})); + REQUIRE(std::vector{10} == tree.get_reversed_positions(index_1)); + REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(index_2)); } SECTION("Set position for an invalid index throws correctly") { diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 2876f8d2..74294134 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -59,13 +59,13 @@ TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { SECTION("Copy register index correctly") { handler.set_register(0, 5); handler.copy_register(1, 0); - REQUIRE(std::vector{{5}} == handler.get_reversed_positions(1)); + REQUIRE(std::vector{5} == handler.get_reversed_positions(1)); } SECTION("append_position appends position correctly") 
{ handler.set_register(0, 5); handler.append_position(0, 7); - REQUIRE(std::vector{{7, 5}} == handler.get_reversed_positions(0)); + REQUIRE(std::vector{7, 5} == handler.get_reversed_positions(0)); } SECTION("Throws out of range correctly") { From 77c20f7c0b9cfa09c81b547530f8dda34dea4ad8 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 16:52:55 -0500 Subject: [PATCH 253/323] Add const for consistency. --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 1c52fe2c..7840384b 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -77,7 +77,7 @@ class RegisterHandler { * @param position * @throw std::out_of_range if the register index is out of range */ - void append_position(uint32_t register_index, int32_t position) { + void append_position(uint32_t const register_index, int32_t const position) { if (register_index >= m_registers.size()) { throw std::out_of_range("Register index out of range"); } From f43759c449d0a8168af4a3b39faa4e4f0d34135c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 16:55:10 -0500 Subject: [PATCH 254/323] Add _HPP to header guards; Remove unused include. 
--- src/log_surgeon/finite_automata/PrefixTree.hpp | 7 +++---- src/log_surgeon/finite_automata/RegisterHandler.hpp | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 66866901..2a4c3d65 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -1,9 +1,8 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE -#define LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE +#ifndef LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP #include #include -#include #include namespace log_surgeon::finite_automata { @@ -86,4 +85,4 @@ class PrefixTree { } // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE +#endif // LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 7840384b..8e4b7607 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -1,5 +1,5 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER -#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP #include #include @@ -109,4 +109,4 @@ class RegisterHandler { }; } // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP From 01e8881ce445dac67ef146339e1a0a932a4e0856 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 16:59:46 -0500 Subject: [PATCH 255/323] Fix typo. 
--- src/log_surgeon/finite_automata/PrefixTree.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 2a4c3d65..5cbf7d15 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -73,7 +73,7 @@ class PrefixTree { /** * Retrieves a vector of positions in reverse order by traversing from the given index to the * root. - * @param index The index of the node to start the tarversal from. + * @param index The index of the node to start the traversal from. * @return A vector containing positions in reverse order from the given index to root. * @throw std::out_of_range if the index is out of range */ From fbb3d362e6c1fe6adae353fa0cfbd40680ae8761 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 17:03:21 -0500 Subject: [PATCH 256/323] Remove blank line. --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 8e4b7607..76f6d26d 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -90,8 +90,7 @@ class RegisterHandler { /** * @param register_index * @return Vector of positions representing the history of the given register. - * @throw std::out_of_range if the register index is out of range - + * @throw std::out_of_range if the register index is out of range. */ [[nodiscard]] auto get_reversed_positions(uint32_t const register_index ) const -> std::vector { From e1f2b18ee2469ec81802495d2da504a2ecb6719b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 17:04:36 -0500 Subject: [PATCH 257/323] Rename to m_prefix_tree; Remove unused include. 
--- src/log_surgeon/finite_automata/RegisterHandler.hpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 76f6d26d..ce222f0c 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -2,7 +2,6 @@ #define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP #include -#include #include #include @@ -39,7 +38,7 @@ class Register { class RegisterHandler { public: void add_register(uint32_t const predecessor_index, int32_t const position) { - auto const index = prefix_tree.insert(predecessor_index, position); + auto const index = m_prefix_tree.insert(predecessor_index, position); m_registers.emplace_back(index); } @@ -54,7 +53,7 @@ class RegisterHandler { } auto const tree_index = m_registers[register_index].get_index(); - prefix_tree.set(tree_index, position); + m_prefix_tree.set(tree_index, position); } /** @@ -83,7 +82,7 @@ class RegisterHandler { } uint32_t const tree_index = m_registers[register_index].get_index(); - auto const new_index = prefix_tree.insert(tree_index, position); + auto const new_index = m_prefix_tree.insert(tree_index, position); m_registers[register_index].set_index(new_index); } @@ -99,11 +98,11 @@ class RegisterHandler { } uint32_t const tree_index = m_registers[register_index].get_index(); - return prefix_tree.get_reversed_positions(tree_index); + return m_prefix_tree.get_reversed_positions(tree_index); } private: - PrefixTree prefix_tree; + PrefixTree m_prefix_tree; std::vector m_registers; }; } // namespace log_surgeon::finite_automata From a51b49d7d565df5355b7b02073a759da5384c180 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 17:08:31 -0500 Subject: [PATCH 258/323] Add param descriptions to docstrings. 
--- .../finite_automata/RegisterHandler.hpp | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index ce222f0c..8eba80c2 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -43,9 +43,9 @@ class RegisterHandler { } /** - * @param register_index - * @param position - * @throw std::out_of_range if the register index is out of range + * @param register_index The index of the register to set. + * @param position The position value to set in the register. + * @throw std::out_of_range if the register index is out of range. */ void set_register(uint32_t const register_index, int32_t const position) { if (m_registers.size() <= register_index) { @@ -57,9 +57,9 @@ class RegisterHandler { } /** - * @param dest_register_index - * @param source_register_index - * @throw std::out_of_range if the register index is out of range + * @param dest_register_index The index of the destination register. + * @param source_register_index The index of the source register. + * @throw std::out_of_range if the register index is out of range. */ void copy_register(uint32_t const dest_register_index, uint32_t const source_register_index) { if (m_registers.size() <= source_register_index @@ -72,9 +72,9 @@ class RegisterHandler { } /** - * @param register_index - * @param position - * @throw std::out_of_range if the register index is out of range + * @param register_index The index of the register to append to. + * @param position The position to append to the register's history. + * @throw std::out_of_range if the register index is out of range. 
*/ void append_position(uint32_t const register_index, int32_t const position) { if (register_index >= m_registers.size()) { @@ -87,8 +87,8 @@ class RegisterHandler { } /** - * @param register_index - * @return Vector of positions representing the history of the given register. + * @param register_index The index of the register whose positions are retrieved. + * @return A vector of positions representing the history of the given register. * @throw std::out_of_range if the register index is out of range. */ [[nodiscard]] auto get_reversed_positions(uint32_t const register_index From 002577e4d855ea6307c029b33e9be1575feb60c0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 17:11:15 -0500 Subject: [PATCH 259/323] Improve out of range check to be consistent. --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 8eba80c2..be655384 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -77,7 +77,7 @@ class RegisterHandler { * @throw std::out_of_range if the register index is out of range. */ void append_position(uint32_t const register_index, int32_t const position) { - if (register_index >= m_registers.size()) { + if (m_registers.size() <= register_index) { throw std::out_of_range("Register index out of range"); } @@ -93,7 +93,7 @@ class RegisterHandler { */ [[nodiscard]] auto get_reversed_positions(uint32_t const register_index ) const -> std::vector { - if (register_index >= m_registers.size()) { + if (m_registers.size() <= register_index) { throw std::out_of_range("Register index out of range"); } From 52a155c4761115fdac5493ccbcd91dbd2bcadabf Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 17:13:35 -0500 Subject: [PATCH 260/323] Update set docstring. 
--- src/log_surgeon/finite_automata/PrefixTree.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 5cbf7d15..8cb98fa3 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -58,9 +58,9 @@ class PrefixTree { } /** - * @param index - * @param position - * @throw std::out_of_range if prefix tree index is out of range + * @param index Index of the node to update. + * @param position New position value to set for the node. + * @throw std::out_of_range if prefix tree index is out of range. */ auto set(uint32_t const index, int32_t const position) -> void { if (m_nodes.size() <= index) { From a6beafcaaf0af4d57ea22d88bedac8866c0a685f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 27 Nov 2024 17:19:28 -0500 Subject: [PATCH 261/323] Punctuate docstrings. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 8cb98fa3..128a112a 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -43,10 +43,10 @@ class PrefixTree { PrefixTree() : m_nodes{{0, -1}} {} /** - * @param predecessor_index Index of the inserted node's predecessor in the prefix tree - * @param position The position in the lexed string - * @return The index of the newly inserted node in the tree - * @throw std::out_of_range if the predecessor index is out of range + * @param predecessor_index Index of the inserted node's predecessor in the prefix tree. + * @param position The position in the lexed string. + * @return The index of the newly inserted node in the tree. + * @throw std::out_of_range if the predecessor index is out of range. 
*/ uint32_t insert(uint32_t const predecessor_index, int32_t const position) { if (m_nodes.size() <= predecessor_index) { @@ -75,7 +75,7 @@ class PrefixTree { * root. * @param index The index of the node to start the traversal from. * @return A vector containing positions in reverse order from the given index to root. - * @throw std::out_of_range if the index is out of range + * @throw std::out_of_range if the index is out of range. */ [[nodiscard]] auto get_reversed_positions(uint32_t index) const -> std::vector; From ec1f7571e7a4ac44b68ecde82a72f26277e08107 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 28 Nov 2024 11:05:22 -0500 Subject: [PATCH 262/323] Update PregixTreeNode docstring. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 128a112a..caabdf29 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -8,11 +8,12 @@ namespace log_surgeon::finite_automata { /** - * A prefix tree node helps a register represent a tag by storing the current position where a tag - * was matched in the lexxed string, as well as the index of the prefix tree node that stores the - * previous time the tag was matched. + * Represents a prefix tree node used by a register to track tag matches in a lexed string. + * This node stores the current position where a tag was matched, as well as the index of the prefix + * tree node corresponding to the previous match of the same tag. * - * Note: m_position < 0 indicates that a tag is currently unmatched in the lexed string. + * Note: A value of m_position < 0 indicates that the tag is currently unmatched in the lexed + * string. 
*/ class PrefixTreeNode { public: From f35741f954b7b43fc33b5da85c914c61d6da7515 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 28 Nov 2024 11:09:24 -0500 Subject: [PATCH 263/323] Improve docstring for PrefixTree. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index caabdf29..363eede6 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -33,11 +33,8 @@ class PrefixTreeNode { }; /** - * A prefix tree structure to store positions associated with registers. - * - * PrefixTree stores positions at nodes, and each node can represent a part of a position. - * Multiple positions can be stored at each index in the tree. The tree allows for the addition of - * positions and the retrieval of positions by their associated index. + * A prefix tree for storing registers. + * Each path from the root to an index represents a sequence of matched tag positions. */ class PrefixTree { public: From e8e5e5545a55052ac63e5549340bfa3911a974cf Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 28 Nov 2024 11:21:01 -0500 Subject: [PATCH 264/323] Change to use auto -> void; Punctuate out_of_range throws. 
--- src/log_surgeon/finite_automata/PrefixTree.cpp | 2 +- src/log_surgeon/finite_automata/PrefixTree.hpp | 6 +++--- .../finite_automata/RegisterHandler.hpp | 17 +++++++++-------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index 84feccc5..c92b2a90 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -6,7 +6,7 @@ namespace log_surgeon::finite_automata { [[nodiscard]] auto PrefixTree::get_reversed_positions(uint32_t const index ) const -> std::vector { if (m_nodes.size() <= index) { - throw std::out_of_range("Prefix tree index out of range"); + throw std::out_of_range("Prefix tree index out of range."); } std::vector reversed_positions; diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 363eede6..0bf0eb8c 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -46,9 +46,9 @@ class PrefixTree { * @return The index of the newly inserted node in the tree. * @throw std::out_of_range if the predecessor index is out of range. 
*/ - uint32_t insert(uint32_t const predecessor_index, int32_t const position) { + auto insert(uint32_t const predecessor_index, int32_t const position) -> uint32_t { if (m_nodes.size() <= predecessor_index) { - throw std::out_of_range("Predecessor index out of range"); + throw std::out_of_range("Predecessor index out of range."); } m_nodes.emplace_back(predecessor_index, position); @@ -62,7 +62,7 @@ class PrefixTree { */ auto set(uint32_t const index, int32_t const position) -> void { if (m_nodes.size() <= index) { - throw std::out_of_range("Prefix tree index out of range"); + throw std::out_of_range("Prefix tree index out of range."); } m_nodes[index].set_position(position); diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index be655384..53b2882a 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -37,7 +37,7 @@ class Register { */ class RegisterHandler { public: - void add_register(uint32_t const predecessor_index, int32_t const position) { + auto add_register(uint32_t const predecessor_index, int32_t const position) -> void { auto const index = m_prefix_tree.insert(predecessor_index, position); m_registers.emplace_back(index); } @@ -47,9 +47,9 @@ class RegisterHandler { * @param position The position value to set in the register. * @throw std::out_of_range if the register index is out of range. */ - void set_register(uint32_t const register_index, int32_t const position) { + auto set_register(uint32_t const register_index, int32_t const position) -> void { if (m_registers.size() <= register_index) { - throw std::out_of_range("Register index out of range"); + throw std::out_of_range("Register index out of range."); } auto const tree_index = m_registers[register_index].get_index(); @@ -61,11 +61,12 @@ class RegisterHandler { * @param source_register_index The index of the source register. 
* @throw std::out_of_range if the register index is out of range. */ - void copy_register(uint32_t const dest_register_index, uint32_t const source_register_index) { + auto copy_register(uint32_t const dest_register_index, uint32_t const source_register_index) + -> void { if (m_registers.size() <= source_register_index || m_registers.size() <= dest_register_index) { - throw std::out_of_range("Register index out of range"); + throw std::out_of_range("Register index out of range."); } m_registers[dest_register_index] = m_registers[source_register_index]; @@ -76,9 +77,9 @@ class RegisterHandler { * @param position The position to append to the register's history. * @throw std::out_of_range if the register index is out of range. */ - void append_position(uint32_t const register_index, int32_t const position) { + auto append_position(uint32_t const register_index, int32_t const position) -> void { if (m_registers.size() <= register_index) { - throw std::out_of_range("Register index out of range"); + throw std::out_of_range("Register index out of range."); } uint32_t const tree_index = m_registers[register_index].get_index(); @@ -94,7 +95,7 @@ class RegisterHandler { [[nodiscard]] auto get_reversed_positions(uint32_t const register_index ) const -> std::vector { if (m_registers.size() <= register_index) { - throw std::out_of_range("Register index out of range"); + throw std::out_of_range("Register index out of range."); } uint32_t const tree_index = m_registers[register_index].get_index(); From f1ece306fb4f05b39aa07699c45fb4ccb5d01468 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 28 Nov 2024 11:27:58 -0500 Subject: [PATCH 265/323] Update Register docstring. 
--- src/log_surgeon/finite_automata/RegisterHandler.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 53b2882a..5a5bfc60 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -8,12 +8,11 @@ namespace log_surgeon::finite_automata { /** - * A register stores an index in the prefix tree. The index node fully represents the register's - * history. + * Represents a register that tracks a sequence of positions where a tag was matched in a lexed + * string. * - * Note: history refers to the previous tag locations. E.g., given the tagged regex "aaa(1\d2)+", - * after parsing input string "aaa123", a register representing tag 1 would contain the history - * {3,4,5}. + * To improve efficiency, registers are stored in a prefix tree. This class holds only the index + * of the prefix tree node that represents the current state of the register. */ class Register { public: From 08997aeebf5180ee3ca274fc82570465b63e7f3e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 28 Nov 2024 11:28:59 -0500 Subject: [PATCH 266/323] Update PrefixTree docstring. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 0bf0eb8c..5bc77a56 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -33,7 +33,7 @@ class PrefixTreeNode { }; /** - * A prefix tree for storing registers. + * Represents a prefix tree that stores all data needed by registers. * Each path from the root to an index represents a sequence of matched tag positions. 
*/ class PrefixTree { From 0910c626c606601d8902b5767754428a129544c4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 28 Nov 2024 11:29:49 -0500 Subject: [PATCH 267/323] Grammar fix. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 5bc77a56..1ea9c7af 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -9,8 +9,8 @@ namespace log_surgeon::finite_automata { /** * Represents a prefix tree node used by a register to track tag matches in a lexed string. - * This node stores the current position where a tag was matched, as well as the index of the prefix - * tree node corresponding to the previous match of the same tag. + * This node stores the current position at which a tag was matched, as well as the index of the + * prefix tree node corresponding to the previous match of the same tag. * * Note: A value of m_position < 0 indicates that the tag is currently unmatched in the lexed * string. From ede680e26e051737341591b8ceba2bfd2a0a6cfb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 28 Nov 2024 11:30:48 -0500 Subject: [PATCH 268/323] Grammar fix. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 1ea9c7af..a7e2639b 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -34,6 +34,7 @@ class PrefixTreeNode { /** * Represents a prefix tree that stores all data needed by registers. + * * Each path from the root to an index represents a sequence of matched tag positions. 
*/ class PrefixTree { From c7b047c53a0918dab6ac536f70640883e92373d5 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 28 Nov 2024 11:51:59 -0500 Subject: [PATCH 269/323] Use auto where possible. --- .../finite_automata/RegisterHandler.hpp | 4 ++-- tests/test-prefix-tree.cpp | 20 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 5a5bfc60..f701e9a3 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -81,7 +81,7 @@ class RegisterHandler { throw std::out_of_range("Register index out of range."); } - uint32_t const tree_index = m_registers[register_index].get_index(); + auto const tree_index = m_registers[register_index].get_index(); auto const new_index = m_prefix_tree.insert(tree_index, position); m_registers[register_index].set_index(new_index); } @@ -97,7 +97,7 @@ class RegisterHandler { throw std::out_of_range("Register index out of range."); } - uint32_t const tree_index = m_registers[register_index].get_index(); + auto const tree_index = m_registers[register_index].get_index(); return m_prefix_tree.get_reversed_positions(tree_index); } diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 4207eec3..17a41a49 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -18,20 +18,20 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { PrefixTree tree; // Test basic insertions - uint32_t index_1 = tree.insert(0, 4); - uint32_t index_2 = tree.insert(index_1, 7); - uint32_t index_3 = tree.insert(index_2, 9); + auto const index_1 = tree.insert(0, 4); + auto const index_2 = tree.insert(index_1, 7); + auto const index_3 = tree.insert(index_2, 9); REQUIRE(std::vector{4} == tree.get_reversed_positions(index_1)); REQUIRE(std::vector{7, 4} == tree.get_reversed_positions(index_2)); REQUIRE(std::vector{9, 7, 4} == 
tree.get_reversed_positions(index_3)); // Test insertion with large position values - uint32_t index_4 = tree.insert(0, std::numeric_limits::max()); + auto const index_4 = tree.insert(0, std::numeric_limits::max()); REQUIRE(std::numeric_limits::max() == tree.get_reversed_positions(index_4)[0]); // Test insertion with negative position values - uint32_t index_5 = tree.insert(0, -1); - uint32_t index_6 = tree.insert(index_5, -100); + auto const index_5 = tree.insert(0, -1); + auto const index_6 = tree.insert(index_5, -100); REQUIRE(std::vector{-1} == tree.get_reversed_positions(index_5)); REQUIRE(std::vector{-100, -1} == tree.get_reversed_positions(index_6)); } @@ -56,8 +56,8 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { tree.set(0, 10); // Test updates to different nodes - uint32_t index_1 = tree.insert(0, 4); - uint32_t index_2 = tree.insert(index_1, 7); + auto const index_1 = tree.insert(0, 4); + auto const index_2 = tree.insert(index_1, 7); tree.set(index_1, 10); tree.set(index_2, 12); REQUIRE(std::vector{10} == tree.get_reversed_positions(index_1)); @@ -69,7 +69,7 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(index_2)); // Test that updates don't affect unrelated paths - uint32_t index_3 = tree.insert(0, 30); + auto const index_3 = tree.insert(0, 30); tree.set(index_3, 25); REQUIRE(std::vector{10} == tree.get_reversed_positions(index_1)); REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(index_2)); @@ -82,7 +82,7 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { REQUIRE_THROWS_AS(tree.set(100, 20), std::out_of_range); // Test setting position just beyond valid range - uint32_t index_1 = tree.insert(0, 4); + auto const index_1 = tree.insert(0, 4); REQUIRE_THROWS_AS(tree.set(index_1 + 1, 20), std::out_of_range); } } From 6fa8fcb3be09cc4490dff4d8fb5ff51792f3e32c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 10:39:03 -0500 Subject: [PATCH 270/323] 
Use uniform initialization. --- .../finite_automata/PrefixTree.cpp | 4 ++-- .../finite_automata/RegisterHandler.hpp | 10 +++++----- tests/test-prefix-tree.cpp | 20 +++++++++---------- tests/test-register-handler.cpp | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index c92b2a90..0cd0415c 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -10,9 +10,9 @@ namespace log_surgeon::finite_automata { } std::vector reversed_positions; - auto current_index = index; + auto current_index{index}; while (0 < current_index) { - auto const& current_node = m_nodes[current_index]; + auto const& current_node{m_nodes[current_index]}; reversed_positions.push_back(current_node.get_position()); current_index = current_node.get_predecessor_index(); } diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index f701e9a3..4fc864a1 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -37,7 +37,7 @@ class Register { class RegisterHandler { public: auto add_register(uint32_t const predecessor_index, int32_t const position) -> void { - auto const index = m_prefix_tree.insert(predecessor_index, position); + auto const index{m_prefix_tree.insert(predecessor_index, position)}; m_registers.emplace_back(index); } @@ -51,7 +51,7 @@ class RegisterHandler { throw std::out_of_range("Register index out of range."); } - auto const tree_index = m_registers[register_index].get_index(); + auto const tree_index{m_registers[register_index].get_index()}; m_prefix_tree.set(tree_index, position); } @@ -81,8 +81,8 @@ class RegisterHandler { throw std::out_of_range("Register index out of range."); } - auto const tree_index = m_registers[register_index].get_index(); - auto const new_index = 
m_prefix_tree.insert(tree_index, position); + auto const tree_index{m_registers[register_index].get_index()}; + auto const new_index{m_prefix_tree.insert(tree_index, position)}; m_registers[register_index].set_index(new_index); } @@ -97,7 +97,7 @@ class RegisterHandler { throw std::out_of_range("Register index out of range."); } - auto const tree_index = m_registers[register_index].get_index(); + auto const tree_index{m_registers[register_index].get_index()}; return m_prefix_tree.get_reversed_positions(tree_index); } diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 17a41a49..22ad8029 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -18,20 +18,20 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { PrefixTree tree; // Test basic insertions - auto const index_1 = tree.insert(0, 4); - auto const index_2 = tree.insert(index_1, 7); - auto const index_3 = tree.insert(index_2, 9); + auto const index_1{tree.insert(0, 4)}; + auto const index_2{tree.insert(index_1, 7)}; + auto const index_3{tree.insert(index_2, 9)}; REQUIRE(std::vector{4} == tree.get_reversed_positions(index_1)); REQUIRE(std::vector{7, 4} == tree.get_reversed_positions(index_2)); REQUIRE(std::vector{9, 7, 4} == tree.get_reversed_positions(index_3)); // Test insertion with large position values - auto const index_4 = tree.insert(0, std::numeric_limits::max()); + auto const index_4{tree.insert(0, std::numeric_limits::max())}; REQUIRE(std::numeric_limits::max() == tree.get_reversed_positions(index_4)[0]); // Test insertion with negative position values - auto const index_5 = tree.insert(0, -1); - auto const index_6 = tree.insert(index_5, -100); + auto const index_5{tree.insert(0, -1)}; + auto const index_6{tree.insert(index_5, -100)}; REQUIRE(std::vector{-1} == tree.get_reversed_positions(index_5)); REQUIRE(std::vector{-100, -1} == tree.get_reversed_positions(index_6)); } @@ -56,8 +56,8 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { tree.set(0, 
10); // Test updates to different nodes - auto const index_1 = tree.insert(0, 4); - auto const index_2 = tree.insert(index_1, 7); + auto const index_1{tree.insert(0, 4)}; + auto const index_2{tree.insert(index_1, 7)}; tree.set(index_1, 10); tree.set(index_2, 12); REQUIRE(std::vector{10} == tree.get_reversed_positions(index_1)); @@ -69,7 +69,7 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(index_2)); // Test that updates don't affect unrelated paths - auto const index_3 = tree.insert(0, 30); + auto const index_3{tree.insert(0, 30)}; tree.set(index_3, 25); REQUIRE(std::vector{10} == tree.get_reversed_positions(index_1)); REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(index_2)); @@ -82,7 +82,7 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { REQUIRE_THROWS_AS(tree.set(100, 20), std::out_of_range); // Test setting position just beyond valid range - auto const index_1 = tree.insert(0, 4); + auto const index_1{tree.insert(0, 4)}; REQUIRE_THROWS_AS(tree.set(index_1 + 1, 20), std::out_of_range); } } diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 74294134..b55bed40 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -37,7 +37,7 @@ TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { REQUIRE_THROWS_AS(handler.get_reversed_positions(0), std::out_of_range); } - constexpr uint32_t num_registers = 5; + constexpr uint32_t num_registers{5}; for (uint32_t i = 0; i < num_registers; i++) { handler.add_register(i, 0); } @@ -52,7 +52,7 @@ TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { handler.set_register(1, 10); handler.set_register(2, 15); - auto positions = handler.get_reversed_positions(2); + auto positions{handler.get_reversed_positions(2)}; REQUIRE(std::vector{15, 10, 5} == handler.get_reversed_positions(2)); } From 18b91604e837dd702701216d1457fddebf923855 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: 
Mon, 2 Dec 2024 10:43:46 -0500 Subject: [PATCH 271/323] Add missing header. --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 4fc864a1..a3a5eb56 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -2,6 +2,7 @@ #define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP #include +#include #include #include From 3f08fa3484e588f7d0b064eb0ff37443e569aebf Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 10:44:54 -0500 Subject: [PATCH 272/323] Linter. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index a7e2639b..ed53d835 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -34,7 +34,7 @@ class PrefixTreeNode { /** * Represents a prefix tree that stores all data needed by registers. - * + * * Each path from the root to an index represents a sequence of matched tag positions. */ class PrefixTree { From e281f043528598944a64e4bb202bdf9f16e35fa7 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Mon, 2 Dec 2024 10:45:16 -0500 Subject: [PATCH 273/323] Fix spacing. 
Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/PrefixTree.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index ed53d835..21244eac 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -6,7 +6,6 @@ #include namespace log_surgeon::finite_automata { - /** * Represents a prefix tree node used by a register to track tag matches in a lexed string. * This node stores the current position at which a tag was matched, as well as the index of the From a03734e88c82fbcda477c880f00a379d72a7dfdf Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 10:50:03 -0500 Subject: [PATCH 274/323] Make Node a member of PrefixTree. --- .../finite_automata/PrefixTree.hpp | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 21244eac..0bf01c78 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -7,36 +7,37 @@ namespace log_surgeon::finite_automata { /** - * Represents a prefix tree node used by a register to track tag matches in a lexed string. - * This node stores the current position at which a tag was matched, as well as the index of the - * prefix tree node corresponding to the previous match of the same tag. + * Represents a prefix tree that stores all data needed by the TDFA registers. * - * Note: A value of m_position < 0 indicates that the tag is currently unmatched in the lexed - * string. + * Each path from the root to an index represents a sequence of matched tag positions. 
*/ -class PrefixTreeNode { -public: - PrefixTreeNode(uint32_t const predecessor_index, int32_t const position) - : m_predecessor_index{predecessor_index}, - m_position{position} {} +class PrefixTree { + /** + * Represents a prefix tree node. A node stores a potential value for a TDFA register. + * + * A node stores the current position at which a tag was matched, as well as the index of the + * prefix tree node corresponding to the previous match of the same tag. + * + * Note: A value of m_position < 0 indicates that the tag is currently unmatched in the lexed + * string. + */ + class Node { + public: + Node(uint32_t const predecessor_index, int32_t const position) + : m_predecessor_index{predecessor_index}, + m_position{position} {} - [[nodiscard]] auto get_predecessor_index() const -> uint32_t { return m_predecessor_index; } + [[nodiscard]] auto get_predecessor_index() const -> uint32_t { return m_predecessor_index; } - auto set_position(int32_t const position) -> void { m_position = position; } + auto set_position(int32_t const position) -> void { m_position = position; } - [[nodiscard]] auto get_position() const -> int32_t { return m_position; } + [[nodiscard]] auto get_position() const -> int32_t { return m_position; } -private: - uint32_t m_predecessor_index; - int32_t m_position; -}; + private: + uint32_t m_predecessor_index; + int32_t m_position; + }; -/** - * Represents a prefix tree that stores all data needed by registers. - * - * Each path from the root to an index represents a sequence of matched tag positions. 
- */ -class PrefixTree { public: PrefixTree() : m_nodes{{0, -1}} {} @@ -78,7 +79,7 @@ class PrefixTree { [[nodiscard]] auto get_reversed_positions(uint32_t index) const -> std::vector; private: - std::vector m_nodes; + std::vector m_nodes; }; } // namespace log_surgeon::finite_automata From 9123c7ac191f933241615d3ba96da72220ec1b3e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 10:55:28 -0500 Subject: [PATCH 275/323] Rename index to prefix_tree_node_id. --- .../finite_automata/RegisterHandler.hpp | 17 +++++++++-------- tests/test-register-handler.cpp | 12 ++++++------ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index a3a5eb56..39c17054 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -17,14 +17,15 @@ namespace log_surgeon::finite_automata { */ class Register { public: - explicit Register(uint32_t const index) : m_index{index} {} + explicit Register(uint32_t const prefix_tree_node_id) + : m_prefix_tree_node_id{prefix_tree_node_id} {} - auto set_index(uint32_t const index) -> void { m_index = index; } + auto set_prefix_tree_node_id(uint32_t const index) -> void { m_prefix_tree_node_id = index; } - [[nodiscard]] auto get_index() const -> uint32_t { return m_index; } + [[nodiscard]] auto get_prefix_tree_node_id() const -> uint32_t { return m_prefix_tree_node_id; } private: - uint32_t m_index; + uint32_t m_prefix_tree_node_id; }; /** @@ -52,7 +53,7 @@ class RegisterHandler { throw std::out_of_range("Register index out of range."); } - auto const tree_index{m_registers[register_index].get_index()}; + auto const tree_index{m_registers[register_index].get_prefix_tree_node_id()}; m_prefix_tree.set(tree_index, position); } @@ -82,9 +83,9 @@ class RegisterHandler { throw std::out_of_range("Register index out of range."); } - auto const 
tree_index{m_registers[register_index].get_index()}; + auto const tree_index{m_registers[register_index].get_prefix_tree_node_id()}; auto const new_index{m_prefix_tree.insert(tree_index, position)}; - m_registers[register_index].set_index(new_index); + m_registers[register_index].set_prefix_tree_node_id(new_index); } /** @@ -98,7 +99,7 @@ class RegisterHandler { throw std::out_of_range("Register index out of range."); } - auto const tree_index{m_registers[register_index].get_index()}; + auto const tree_index{m_registers[register_index].get_prefix_tree_node_id()}; return m_prefix_tree.get_reversed_positions(tree_index); } diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index b55bed40..90de0edf 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -12,21 +12,21 @@ using std::unique_ptr; TEST_CASE("Register operations", "[Register]") { SECTION("Register constructor and getter initializes correctly") { Register const reg(5); - REQUIRE(reg.get_index() == 5); + REQUIRE(reg.get_prefix_tree_node_id() == 5); } SECTION("Register sets index correctly") { Register reg(5); - reg.set_index(10); - REQUIRE(reg.get_index() == 10); + reg.set_prefix_tree_node_id(10); + REQUIRE(reg.get_prefix_tree_node_id() == 10); } SECTION("Register handles edge cases correctly") { Register reg(-1); - REQUIRE(reg.get_index() == -1); + REQUIRE(reg.get_prefix_tree_node_id() == -1); - reg.set_index(std::numeric_limits::max()); - REQUIRE(reg.get_index() == std::numeric_limits::max()); + reg.set_prefix_tree_node_id(std::numeric_limits::max()); + REQUIRE(reg.get_prefix_tree_node_id() == std::numeric_limits::max()); } } From fe35fe0cf2759dcd0450f99d56af8fbc3a5c8ce3 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 19:55:10 -0500 Subject: [PATCH 276/323] Make it clear indicies in add_register are refering to prefix_tree nodes. 
--- src/log_surgeon/finite_automata/RegisterHandler.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 39c17054..c05fb509 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -38,9 +38,9 @@ class Register { */ class RegisterHandler { public: - auto add_register(uint32_t const predecessor_index, int32_t const position) -> void { - auto const index{m_prefix_tree.insert(predecessor_index, position)}; - m_registers.emplace_back(index); + auto add_register(uint32_t const prefix_tree_parent_node_id, int32_t const position) -> void { + auto const prefix_tree_node_id{m_prefix_tree.insert(prefix_tree_parent_node_id, position)}; + m_registers.emplace_back(prefix_tree_node_id); } /** From de58e088669b2700397d1035733571a83e2e2fab Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 19:56:28 -0500 Subject: [PATCH 277/323] Linter. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 0bf01c78..6bae551e 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -14,7 +14,7 @@ namespace log_surgeon::finite_automata { class PrefixTree { /** * Represents a prefix tree node. A node stores a potential value for a TDFA register. - * + * * A node stores the current position at which a tag was matched, as well as the index of the * prefix tree node corresponding to the previous match of the same tag. * From 1426179489108a497594b83f12dd4b98970d98b9 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 20:05:02 -0500 Subject: [PATCH 278/323] rename to reg_id. 
--- src/log_surgeon/finite_automata/RegisterHandler.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index c05fb509..f2bc4d30 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -44,16 +44,16 @@ class RegisterHandler { } /** - * @param register_index The index of the register to set. + * @param reg_id The index of the register to set. * @param position The position value to set in the register. * @throw std::out_of_range if the register index is out of range. */ - auto set_register(uint32_t const register_index, int32_t const position) -> void { - if (m_registers.size() <= register_index) { + auto set_register(uint32_t const reg_id, int32_t const position) -> void { + if (m_registers.size() <= reg_id) { throw std::out_of_range("Register index out of range."); } - auto const tree_index{m_registers[register_index].get_prefix_tree_node_id()}; + auto const tree_index{m_registers[reg_id].get_prefix_tree_node_id()}; m_prefix_tree.set(tree_index, position); } From 3301f14a35ca102b4277b6c51fca3990ad7eda73 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 20:12:03 -0500 Subject: [PATCH 279/323] Rename to reg_id. --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index f2bc4d30..c9989af4 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -62,15 +62,15 @@ class RegisterHandler { * @param source_register_index The index of the source register. * @throw std::out_of_range if the register index is out of range. 
*/ - auto copy_register(uint32_t const dest_register_index, uint32_t const source_register_index) + auto copy_register(uint32_t const dest_reg_id, uint32_t const source_reg_id) -> void { - if (m_registers.size() <= source_register_index - || m_registers.size() <= dest_register_index) + if (m_registers.size() <= source_reg_id + || m_registers.size() <= dest_reg_id) { throw std::out_of_range("Register index out of range."); } - m_registers[dest_register_index] = m_registers[source_register_index]; + m_registers[dest_reg_id] = m_registers[source_reg_id]; } /** From c9b1369fa312543cef381b3b976279d2672da332 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 20:36:21 -0500 Subject: [PATCH 280/323] Use at(). --- src/log_surgeon/finite_automata/PrefixTree.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 6bae551e..f874a2bb 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -62,11 +62,7 @@ class PrefixTree { * @throw std::out_of_range if prefix tree index is out of range. */ auto set(uint32_t const index, int32_t const position) -> void { - if (m_nodes.size() <= index) { - throw std::out_of_range("Prefix tree index out of range."); - } - - m_nodes[index].set_position(position); + m_nodes.at(index).set_position(position); } /** From e2aee661ea58e02abceff79e466b8f27f69f682e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 21:14:35 -0500 Subject: [PATCH 281/323] Remove Register class and use uint32_t instead; Rename vers to xxx_reg_id; Remove error checking in favor of using .at(). 
--- .../finite_automata/RegisterHandler.hpp | 79 ++----------------- tests/test-register-handler.cpp | 22 ------ 2 files changed, 8 insertions(+), 93 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index c9989af4..e736c26a 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -8,26 +8,6 @@ #include namespace log_surgeon::finite_automata { -/** - * Represents a register that tracks a sequence of positions where a tag was matched in a lexed - * string. - * - * To improve efficiency, registers are stored in a prefix tree. This class holds only the index - * of the prefix tree node that represents the current state of the register. - */ -class Register { -public: - explicit Register(uint32_t const prefix_tree_node_id) - : m_prefix_tree_node_id{prefix_tree_node_id} {} - - auto set_prefix_tree_node_id(uint32_t const index) -> void { m_prefix_tree_node_id = index; } - - [[nodiscard]] auto get_prefix_tree_node_id() const -> uint32_t { return m_prefix_tree_node_id; } - -private: - uint32_t m_prefix_tree_node_id; -}; - /** * The register handler maintains a prefix tree that is sufficient to represent all registers. * The register handler also contains a vector of registers, and performs the set, copy, and append @@ -43,69 +23,26 @@ class RegisterHandler { m_registers.emplace_back(prefix_tree_node_id); } - /** - * @param reg_id The index of the register to set. - * @param position The position value to set in the register. - * @throw std::out_of_range if the register index is out of range. 
- */ auto set_register(uint32_t const reg_id, int32_t const position) -> void { - if (m_registers.size() <= reg_id) { - throw std::out_of_range("Register index out of range."); - } - - auto const tree_index{m_registers[reg_id].get_prefix_tree_node_id()}; - m_prefix_tree.set(tree_index, position); + m_prefix_tree.set(m_registers.at(reg_id), position); } - /** - * @param dest_register_index The index of the destination register. - * @param source_register_index The index of the source register. - * @throw std::out_of_range if the register index is out of range. - */ - auto copy_register(uint32_t const dest_reg_id, uint32_t const source_reg_id) - -> void { - if (m_registers.size() <= source_reg_id - || m_registers.size() <= dest_reg_id) - { - throw std::out_of_range("Register index out of range."); - } - - m_registers[dest_reg_id] = m_registers[source_reg_id]; + auto copy_register(uint32_t const dest_reg_id, uint32_t const source_reg_id) -> void { + m_registers.at(dest_reg_id) = m_registers.at(source_reg_id); } - /** - * @param register_index The index of the register to append to. - * @param position The position to append to the register's history. - * @throw std::out_of_range if the register index is out of range. - */ auto append_position(uint32_t const register_index, int32_t const position) -> void { - if (m_registers.size() <= register_index) { - throw std::out_of_range("Register index out of range."); - } - - auto const tree_index{m_registers[register_index].get_prefix_tree_node_id()}; - auto const new_index{m_prefix_tree.insert(tree_index, position)}; - m_registers[register_index].set_prefix_tree_node_id(new_index); + auto& reg{m_registers.at(register_index)}; + reg = m_prefix_tree.insert(reg, position); } - /** - * @param register_index The index of the register whose positions are retrieved. - * @return A vector of positions representing the history of the given register. - * @throw std::out_of_range if the register index is out of range. 
- */ - [[nodiscard]] auto get_reversed_positions(uint32_t const register_index - ) const -> std::vector { - if (m_registers.size() <= register_index) { - throw std::out_of_range("Register index out of range."); - } - - auto const tree_index{m_registers[register_index].get_prefix_tree_node_id()}; - return m_prefix_tree.get_reversed_positions(tree_index); + [[nodiscard]] auto get_reversed_positions(uint32_t const reg_id) const -> std::vector { + return m_prefix_tree.get_reversed_positions(m_registers.at(reg_id)); } private: PrefixTree m_prefix_tree; - std::vector m_registers; + std::vector m_registers; }; } // namespace log_surgeon::finite_automata diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 90de0edf..67fa3d95 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -4,32 +4,10 @@ #include -using log_surgeon::finite_automata::Register; using log_surgeon::finite_automata::RegisterHandler; using std::make_unique; using std::unique_ptr; -TEST_CASE("Register operations", "[Register]") { - SECTION("Register constructor and getter initializes correctly") { - Register const reg(5); - REQUIRE(reg.get_prefix_tree_node_id() == 5); - } - - SECTION("Register sets index correctly") { - Register reg(5); - reg.set_prefix_tree_node_id(10); - REQUIRE(reg.get_prefix_tree_node_id() == 10); - } - - SECTION("Register handles edge cases correctly") { - Register reg(-1); - REQUIRE(reg.get_prefix_tree_node_id() == -1); - - reg.set_prefix_tree_node_id(std::numeric_limits::max()); - REQUIRE(reg.get_prefix_tree_node_id() == std::numeric_limits::max()); - } -} - TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { RegisterHandler handler; From 36c1810779b2e7ee404a3999130acf632170b036 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 21:20:41 -0500 Subject: [PATCH 282/323] Rename to reg_id. 
--- src/log_surgeon/finite_automata/RegisterHandler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index e736c26a..e78ba58f 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -31,8 +31,8 @@ class RegisterHandler { m_registers.at(dest_reg_id) = m_registers.at(source_reg_id); } - auto append_position(uint32_t const register_index, int32_t const position) -> void { - auto& reg{m_registers.at(register_index)}; + auto append_position(uint32_t const reg_id, int32_t const position) -> void { + auto& reg{m_registers.at(reg_id)}; reg = m_prefix_tree.insert(reg, position); } From 48df8b0ff25d7f2ea77a5cdce81d2bb7d87c62a2 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 21:22:11 -0500 Subject: [PATCH 283/323] Remove unused header. --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index e78ba58f..feb9f83b 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -2,7 +2,6 @@ #define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP #include -#include #include #include From a8605fc8fbb589147165875c717a4661fcecb295 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 21:43:21 -0500 Subject: [PATCH 284/323] Change pred index to be optional and nullopt for root. 
--- src/log_surgeon/finite_automata/PrefixTree.cpp | 7 +++---- src/log_surgeon/finite_automata/PrefixTree.hpp | 12 +++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index 0cd0415c..a57bac04 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -10,11 +10,10 @@ namespace log_surgeon::finite_automata { } std::vector reversed_positions; - auto current_index{index}; - while (0 < current_index) { - auto const& current_node{m_nodes[current_index]}; + auto current_node{m_nodes[index]}; + while (current_node.get_predecessor_index().has_value()) { reversed_positions.push_back(current_node.get_position()); - current_index = current_node.get_predecessor_index(); + current_node = m_nodes[current_node.get_predecessor_index().value()]; } return reversed_positions; } diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index f874a2bb..6a84bf2a 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -2,6 +2,7 @@ #define LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP #include +#include #include #include @@ -23,23 +24,25 @@ class PrefixTree { */ class Node { public: - Node(uint32_t const predecessor_index, int32_t const position) + Node(std::optional const predecessor_index, int32_t const position) : m_predecessor_index{predecessor_index}, m_position{position} {} - [[nodiscard]] auto get_predecessor_index() const -> uint32_t { return m_predecessor_index; } + [[nodiscard]] auto get_predecessor_index() const -> std::optional { + return m_predecessor_index; + } auto set_position(int32_t const position) -> void { m_position = position; } [[nodiscard]] auto get_position() const -> int32_t { return m_position; } private: - uint32_t m_predecessor_index; + std::optional m_predecessor_index; int32_t 
m_position; }; public: - PrefixTree() : m_nodes{{0, -1}} {} + PrefixTree() : m_nodes{{std::nullopt, -1}} {} /** * @param predecessor_index Index of the inserted node's predecessor in the prefix tree. @@ -59,7 +62,6 @@ class PrefixTree { /** * @param index Index of the node to update. * @param position New position value to set for the node. - * @throw std::out_of_range if prefix tree index is out of range. */ auto set(uint32_t const index, int32_t const position) -> void { m_nodes.at(index).set_position(position); From 15cb1b6be4006cdb81dfaff66eb2d8a6cceef5d1 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 21:49:38 -0500 Subject: [PATCH 285/323] Add and use node_id_t. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 16 ++++++++-------- .../finite_automata/RegisterHandler.hpp | 7 +++++-- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 6a84bf2a..39e1c3da 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -13,6 +13,10 @@ namespace log_surgeon::finite_automata { * Each path from the root to an index represents a sequence of matched tag positions. */ class PrefixTree { +public: + using node_id_t = uint32_t; + +private: /** * Represents a prefix tree node. A node stores a potential value for a TDFA register. 
* @@ -24,11 +28,11 @@ class PrefixTree { */ class Node { public: - Node(std::optional const predecessor_index, int32_t const position) + Node(std::optional const predecessor_index, int32_t const position) : m_predecessor_index{predecessor_index}, m_position{position} {} - [[nodiscard]] auto get_predecessor_index() const -> std::optional { + [[nodiscard]] auto get_predecessor_index() const -> std::optional { return m_predecessor_index; } @@ -37,7 +41,7 @@ class PrefixTree { [[nodiscard]] auto get_position() const -> int32_t { return m_position; } private: - std::optional m_predecessor_index; + std::optional m_predecessor_index; int32_t m_position; }; @@ -50,7 +54,7 @@ class PrefixTree { * @return The index of the newly inserted node in the tree. * @throw std::out_of_range if the predecessor index is out of range. */ - auto insert(uint32_t const predecessor_index, int32_t const position) -> uint32_t { + auto insert(node_id_t const predecessor_index, int32_t const position) -> uint32_t { if (m_nodes.size() <= predecessor_index) { throw std::out_of_range("Predecessor index out of range."); } @@ -59,10 +63,6 @@ class PrefixTree { return m_nodes.size() - 1; } - /** - * @param index Index of the node to update. - * @param position New position value to set for the node. 
- */ auto set(uint32_t const index, int32_t const position) -> void { m_nodes.at(index).set_position(position); } diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index feb9f83b..8892b481 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -17,7 +17,10 @@ namespace log_surgeon::finite_automata { */ class RegisterHandler { public: - auto add_register(uint32_t const prefix_tree_parent_node_id, int32_t const position) -> void { + auto add_register( + PrefixTree::node_id_t const prefix_tree_parent_node_id, + int32_t const position + ) -> void { auto const prefix_tree_node_id{m_prefix_tree.insert(prefix_tree_parent_node_id, position)}; m_registers.emplace_back(prefix_tree_node_id); } @@ -41,7 +44,7 @@ class RegisterHandler { private: PrefixTree m_prefix_tree; - std::vector m_registers; + std::vector m_registers; }; } // namespace log_surgeon::finite_automata From 6b787d036f2055e1ba57cce3b837f9299952dc05 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 21:56:29 -0500 Subject: [PATCH 286/323] Add position_t. 
--- src/log_surgeon/finite_automata/PrefixTree.cpp | 8 +++----- src/log_surgeon/finite_automata/PrefixTree.hpp | 15 ++++++++------- .../finite_automata/RegisterHandler.hpp | 9 +++++---- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index a57bac04..7f4f17e9 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -1,15 +1,13 @@ #include "PrefixTree.hpp" -#include - namespace log_surgeon::finite_automata { -[[nodiscard]] auto PrefixTree::get_reversed_positions(uint32_t const index -) const -> std::vector { +[[nodiscard]] auto PrefixTree::get_reversed_positions(node_id_t const index +) const -> std::vector { if (m_nodes.size() <= index) { throw std::out_of_range("Prefix tree index out of range."); } - std::vector reversed_positions; + std::vector reversed_positions; auto current_node{m_nodes[index]}; while (current_node.get_predecessor_index().has_value()) { reversed_positions.push_back(current_node.get_position()); diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 39e1c3da..3fdb3e6e 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -14,6 +14,7 @@ namespace log_surgeon::finite_automata { */ class PrefixTree { public: + using position_t = int32_t; using node_id_t = uint32_t; private: @@ -28,7 +29,7 @@ class PrefixTree { */ class Node { public: - Node(std::optional const predecessor_index, int32_t const position) + Node(std::optional const predecessor_index, position_t const position) : m_predecessor_index{predecessor_index}, m_position{position} {} @@ -36,13 +37,13 @@ class PrefixTree { return m_predecessor_index; } - auto set_position(int32_t const position) -> void { m_position = position; } + auto set_position(position_t const position) -> void { m_position = 
position; } - [[nodiscard]] auto get_position() const -> int32_t { return m_position; } + [[nodiscard]] auto get_position() const -> position_t { return m_position; } private: std::optional m_predecessor_index; - int32_t m_position; + position_t m_position; }; public: @@ -54,7 +55,7 @@ class PrefixTree { * @return The index of the newly inserted node in the tree. * @throw std::out_of_range if the predecessor index is out of range. */ - auto insert(node_id_t const predecessor_index, int32_t const position) -> uint32_t { + auto insert(node_id_t const predecessor_index, position_t const position) -> node_id_t { if (m_nodes.size() <= predecessor_index) { throw std::out_of_range("Predecessor index out of range."); } @@ -63,7 +64,7 @@ class PrefixTree { return m_nodes.size() - 1; } - auto set(uint32_t const index, int32_t const position) -> void { + auto set(node_id_t const index, position_t const position) -> void { m_nodes.at(index).set_position(position); } @@ -74,7 +75,7 @@ class PrefixTree { * @return A vector containing positions in reverse order from the given index to root. * @throw std::out_of_range if the index is out of range. 
*/ - [[nodiscard]] auto get_reversed_positions(uint32_t index) const -> std::vector; + [[nodiscard]] auto get_reversed_positions(node_id_t index) const -> std::vector; private: std::vector m_nodes; diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 8892b481..fa820897 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -19,13 +19,13 @@ class RegisterHandler { public: auto add_register( PrefixTree::node_id_t const prefix_tree_parent_node_id, - int32_t const position + PrefixTree::position_t const position ) -> void { auto const prefix_tree_node_id{m_prefix_tree.insert(prefix_tree_parent_node_id, position)}; m_registers.emplace_back(prefix_tree_node_id); } - auto set_register(uint32_t const reg_id, int32_t const position) -> void { + auto set_register(uint32_t const reg_id, PrefixTree::position_t const position) -> void { m_prefix_tree.set(m_registers.at(reg_id), position); } @@ -33,12 +33,13 @@ class RegisterHandler { m_registers.at(dest_reg_id) = m_registers.at(source_reg_id); } - auto append_position(uint32_t const reg_id, int32_t const position) -> void { + auto append_position(uint32_t const reg_id, PrefixTree::position_t const position) -> void { auto& reg{m_registers.at(reg_id)}; reg = m_prefix_tree.insert(reg, position); } - [[nodiscard]] auto get_reversed_positions(uint32_t const reg_id) const -> std::vector { + [[nodiscard]] auto get_reversed_positions(uint32_t const reg_id + ) const -> std::vector { return m_prefix_tree.get_reversed_positions(m_registers.at(reg_id)); } From cd8f4e3d8fc7febe79152661be1b563d1db4c60a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 2 Dec 2024 21:58:24 -0500 Subject: [PATCH 287/323] Change to id_t. 
--- src/log_surgeon/finite_automata/PrefixTree.cpp | 2 +- src/log_surgeon/finite_automata/PrefixTree.hpp | 14 +++++++------- .../finite_automata/RegisterHandler.hpp | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index 7f4f17e9..c51c7938 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -1,7 +1,7 @@ #include "PrefixTree.hpp" namespace log_surgeon::finite_automata { -[[nodiscard]] auto PrefixTree::get_reversed_positions(node_id_t const index +[[nodiscard]] auto PrefixTree::get_reversed_positions(id_t const index ) const -> std::vector { if (m_nodes.size() <= index) { throw std::out_of_range("Prefix tree index out of range."); diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 3fdb3e6e..60a21932 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -14,8 +14,8 @@ namespace log_surgeon::finite_automata { */ class PrefixTree { public: + using id_t = uint32_t; using position_t = int32_t; - using node_id_t = uint32_t; private: /** @@ -29,11 +29,11 @@ class PrefixTree { */ class Node { public: - Node(std::optional const predecessor_index, position_t const position) + Node(std::optional const predecessor_index, position_t const position) : m_predecessor_index{predecessor_index}, m_position{position} {} - [[nodiscard]] auto get_predecessor_index() const -> std::optional { + [[nodiscard]] auto get_predecessor_index() const -> std::optional { return m_predecessor_index; } @@ -42,7 +42,7 @@ class PrefixTree { [[nodiscard]] auto get_position() const -> position_t { return m_position; } private: - std::optional m_predecessor_index; + std::optional m_predecessor_index; position_t m_position; }; @@ -55,7 +55,7 @@ class PrefixTree { * @return The index of the newly inserted 
node in the tree. * @throw std::out_of_range if the predecessor index is out of range. */ - auto insert(node_id_t const predecessor_index, position_t const position) -> node_id_t { + auto insert(id_t const predecessor_index, position_t const position) -> id_t { if (m_nodes.size() <= predecessor_index) { throw std::out_of_range("Predecessor index out of range."); } @@ -64,7 +64,7 @@ class PrefixTree { return m_nodes.size() - 1; } - auto set(node_id_t const index, position_t const position) -> void { + auto set(id_t const index, position_t const position) -> void { m_nodes.at(index).set_position(position); } @@ -75,7 +75,7 @@ class PrefixTree { * @return A vector containing positions in reverse order from the given index to root. * @throw std::out_of_range if the index is out of range. */ - [[nodiscard]] auto get_reversed_positions(node_id_t index) const -> std::vector; + [[nodiscard]] auto get_reversed_positions(id_t index) const -> std::vector; private: std::vector m_nodes; diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index fa820897..3e85cd84 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -18,7 +18,7 @@ namespace log_surgeon::finite_automata { class RegisterHandler { public: auto add_register( - PrefixTree::node_id_t const prefix_tree_parent_node_id, + PrefixTree::id_t const prefix_tree_parent_node_id, PrefixTree::position_t const position ) -> void { auto const prefix_tree_node_id{m_prefix_tree.insert(prefix_tree_parent_node_id, position)}; @@ -45,7 +45,7 @@ class RegisterHandler { private: PrefixTree m_prefix_tree; - std::vector m_registers; + std::vector m_registers; }; } // namespace log_surgeon::finite_automata From 72da50c4b58105fa440f548fdad289fd1d36868e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 06:18:42 -0500 Subject: [PATCH 288/323] Add is_root(). 
--- src/log_surgeon/finite_automata/PrefixTree.cpp | 2 +- src/log_surgeon/finite_automata/PrefixTree.hpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index c51c7938..18edbdac 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -9,7 +9,7 @@ namespace log_surgeon::finite_automata { std::vector reversed_positions; auto current_node{m_nodes[index]}; - while (current_node.get_predecessor_index().has_value()) { + while (false == current_node.is_root()) { reversed_positions.push_back(current_node.get_position()); current_node = m_nodes[current_node.get_predecessor_index().value()]; } diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 60a21932..57350550 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -33,6 +33,10 @@ class PrefixTree { : m_predecessor_index{predecessor_index}, m_position{position} {} + [[nodiscard]] auto is_root() const -> bool { + return false == m_predecessor_index.has_value(); + } + [[nodiscard]] auto get_predecessor_index() const -> std::optional { return m_predecessor_index; } From 3fc7ea776bd9aa4275fa1079f8dc7c12fe4f4b2b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 06:20:23 -0500 Subject: [PATCH 289/323] Add missing header. 
--- src/log_surgeon/finite_automata/PrefixTree.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index 18edbdac..9d3dfb15 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -1,5 +1,7 @@ #include "PrefixTree.hpp" +#include + namespace log_surgeon::finite_automata { [[nodiscard]] auto PrefixTree::get_reversed_positions(id_t const index ) const -> std::vector { From 6443d6619d569488ee32182a0e76bc63b2638339 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 06:41:38 -0500 Subject: [PATCH 290/323] Update PrefixTree docstring. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 57350550..c67dbb58 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -8,9 +8,13 @@ namespace log_surgeon::finite_automata { /** - * Represents a prefix tree that stores all data needed by the TDFA registers. - * - * Each path from the root to an index represents a sequence of matched tag positions. + * Represents a prefix tree to store register data during TDFA simulation. Each path from the root + * to an index corresponds to a sequence of positions for an individual tag: + * - Positive position node: Indicates the tag was matched at the position. + * - Negative position node: Indicates the tag was unmatched. If a negative node is the entire path, + * it indicates the tag was never matched. If the negative tag is along a path containing positive + * nodes, it functions as a placeholder. This can be useful for nested capture groups, to maintain a + * one-to-one mapping between the contained capture group and the enclosing capture group. 
*/ class PrefixTree { public: From 63aec4d3c8307a6de5d08d0167bdd8f310f5d3bc Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 06:45:32 -0500 Subject: [PATCH 291/323] Removing node docstring as its redundant. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index c67dbb58..ed6ebd6b 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -8,8 +8,9 @@ namespace log_surgeon::finite_automata { /** - * Represents a prefix tree to store register data during TDFA simulation. Each path from the root - * to an index corresponds to a sequence of positions for an individual tag: + * Represents a prefix tree to store register data during TDFA simulation. Each node in the tree + * stores a single posiiton in the lexed string. Each path from the root to an index corresponds to + * a sequence of positions for an individual tag: * - Positive position node: Indicates the tag was matched at the position. * - Negative position node: Indicates the tag was unmatched. If a negative node is the entire path, * it indicates the tag was never matched. If the negative tag is along a path containing positive @@ -22,15 +23,6 @@ class PrefixTree { using position_t = int32_t; private: - /** - * Represents a prefix tree node. A node stores a potential value for a TDFA register. - * - * A node stores the current position at which a tag was matched, as well as the index of the - * prefix tree node corresponding to the previous match of the same tag. - * - * Note: A value of m_position < 0 indicates that the tag is currently unmatched in the lexed - * string. 
- */ class Node { public: Node(std::optional const predecessor_index, position_t const position) From 295f3eed024d3c3888504acc3d06c261a77a309b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 06:47:14 -0500 Subject: [PATCH 292/323] Combine private section in PrefixTree. --- .../finite_automata/PrefixTree.hpp | 48 +++++++++---------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index ed6ebd6b..04392238 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -22,31 +22,6 @@ class PrefixTree { using id_t = uint32_t; using position_t = int32_t; -private: - class Node { - public: - Node(std::optional const predecessor_index, position_t const position) - : m_predecessor_index{predecessor_index}, - m_position{position} {} - - [[nodiscard]] auto is_root() const -> bool { - return false == m_predecessor_index.has_value(); - } - - [[nodiscard]] auto get_predecessor_index() const -> std::optional { - return m_predecessor_index; - } - - auto set_position(position_t const position) -> void { m_position = position; } - - [[nodiscard]] auto get_position() const -> position_t { return m_position; } - - private: - std::optional m_predecessor_index; - position_t m_position; - }; - -public: PrefixTree() : m_nodes{{std::nullopt, -1}} {} /** @@ -78,6 +53,29 @@ class PrefixTree { [[nodiscard]] auto get_reversed_positions(id_t index) const -> std::vector; private: + class Node { + public: + Node(std::optional const predecessor_index, position_t const position) + : m_predecessor_index{predecessor_index}, + m_position{position} {} + + [[nodiscard]] auto is_root() const -> bool { + return false == m_predecessor_index.has_value(); + } + + [[nodiscard]] auto get_predecessor_index() const -> std::optional { + return m_predecessor_index; + } + + auto set_position(position_t const position) -> void { 
m_position = position; } + + [[nodiscard]] auto get_position() const -> position_t { return m_position; } + + private: + std::optional m_predecessor_index; + position_t m_position; + }; + std::vector m_nodes; }; From 11866669fcfb1f0d97f1eb3640001a2cff62a9c8 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 06:54:35 -0500 Subject: [PATCH 293/323] Add missing header; Remove copy paste error. --- src/log_surgeon/finite_automata/PrefixTree.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index 9d3dfb15..08b71982 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -1,9 +1,10 @@ #include "PrefixTree.hpp" #include +#include namespace log_surgeon::finite_automata { -[[nodiscard]] auto PrefixTree::get_reversed_positions(id_t const index +auto PrefixTree::get_reversed_positions(id_t const index ) const -> std::vector { if (m_nodes.size() <= index) { throw std::out_of_range("Prefix tree index out of range."); From 06ee38e7ebafa072b0bb9fc94128786ffc8462eb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 07:03:30 -0500 Subject: [PATCH 294/323] Rename to node_id and parent_node_id. 
--- .../finite_automata/PrefixTree.cpp | 9 ++-- .../finite_automata/PrefixTree.hpp | 32 ++++++----- tests/test-prefix-tree.cpp | 54 +++++++++---------- 3 files changed, 46 insertions(+), 49 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index 08b71982..bf0705c8 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -4,17 +4,16 @@ #include namespace log_surgeon::finite_automata { -auto PrefixTree::get_reversed_positions(id_t const index -) const -> std::vector { - if (m_nodes.size() <= index) { +auto PrefixTree::get_reversed_positions(id_t const node_id) const -> std::vector { + if (m_nodes.size() <= node_id) { throw std::out_of_range("Prefix tree index out of range."); } std::vector reversed_positions; - auto current_node{m_nodes[index]}; + auto current_node{m_nodes[node_id]}; while (false == current_node.is_root()) { reversed_positions.push_back(current_node.get_position()); - current_node = m_nodes[current_node.get_predecessor_index().value()]; + current_node = m_nodes[current_node.get_parent_node_id().value()]; } return reversed_positions; } diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 04392238..53e72037 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -25,46 +25,44 @@ class PrefixTree { PrefixTree() : m_nodes{{std::nullopt, -1}} {} /** - * @param predecessor_index Index of the inserted node's predecessor in the prefix tree. + * @param parent_node_id Index of the inserted node's parent in the prefix tree. * @param position The position in the lexed string. * @return The index of the newly inserted node in the tree. - * @throw std::out_of_range if the predecessor index is out of range. + * @throw std::out_of_range if the parent's index is out of range. 
*/ - auto insert(id_t const predecessor_index, position_t const position) -> id_t { - if (m_nodes.size() <= predecessor_index) { + auto insert(id_t const parent_node_id, position_t const position) -> id_t { + if (m_nodes.size() <= parent_node_id) { throw std::out_of_range("Predecessor index out of range."); } - m_nodes.emplace_back(predecessor_index, position); + m_nodes.emplace_back(parent_node_id, position); return m_nodes.size() - 1; } - auto set(id_t const index, position_t const position) -> void { - m_nodes.at(index).set_position(position); + auto set(id_t const node_id, position_t const position) -> void { + m_nodes.at(node_id).set_position(position); } /** * Retrieves a vector of positions in reverse order by traversing from the given index to the * root. - * @param index The index of the node to start the traversal from. + * @param node_id The index of the node to start the traversal from. * @return A vector containing positions in reverse order from the given index to root. * @throw std::out_of_range if the index is out of range. 
*/ - [[nodiscard]] auto get_reversed_positions(id_t index) const -> std::vector; + [[nodiscard]] auto get_reversed_positions(id_t node_id) const -> std::vector; private: class Node { public: - Node(std::optional const predecessor_index, position_t const position) - : m_predecessor_index{predecessor_index}, + Node(std::optional const parent_node_id, position_t const position) + : m_parent_node_id{parent_node_id}, m_position{position} {} - [[nodiscard]] auto is_root() const -> bool { - return false == m_predecessor_index.has_value(); - } + [[nodiscard]] auto is_root() const -> bool { return false == m_parent_node_id.has_value(); } - [[nodiscard]] auto get_predecessor_index() const -> std::optional { - return m_predecessor_index; + [[nodiscard]] auto get_parent_node_id() const -> std::optional { + return m_parent_node_id; } auto set_position(position_t const position) -> void { m_position = position; } @@ -72,7 +70,7 @@ class PrefixTree { [[nodiscard]] auto get_position() const -> position_t { return m_position; } private: - std::optional m_predecessor_index; + std::optional m_parent_node_id; position_t m_position; }; diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 22ad8029..ec19c156 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -18,22 +18,22 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { PrefixTree tree; // Test basic insertions - auto const index_1{tree.insert(0, 4)}; - auto const index_2{tree.insert(index_1, 7)}; - auto const index_3{tree.insert(index_2, 9)}; - REQUIRE(std::vector{4} == tree.get_reversed_positions(index_1)); - REQUIRE(std::vector{7, 4} == tree.get_reversed_positions(index_2)); - REQUIRE(std::vector{9, 7, 4} == tree.get_reversed_positions(index_3)); + auto const node_id_1{tree.insert(0, 4)}; + auto const node_id_2{tree.insert(node_id_1, 7)}; + auto const node_id_3{tree.insert(node_id_2, 9)}; + REQUIRE(std::vector{4} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{7, 
4} == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{9, 7, 4} == tree.get_reversed_positions(node_id_3)); // Test insertion with large position values - auto const index_4{tree.insert(0, std::numeric_limits::max())}; - REQUIRE(std::numeric_limits::max() == tree.get_reversed_positions(index_4)[0]); + auto const node_id_4{tree.insert(0, std::numeric_limits::max())}; + REQUIRE(std::numeric_limits::max() == tree.get_reversed_positions(node_id_4)[0]); // Test insertion with negative position values - auto const index_5{tree.insert(0, -1)}; - auto const index_6{tree.insert(index_5, -100)}; - REQUIRE(std::vector{-1} == tree.get_reversed_positions(index_5)); - REQUIRE(std::vector{-100, -1} == tree.get_reversed_positions(index_6)); + auto const node_id_5{tree.insert(0, -1)}; + auto const node_id_6{tree.insert(node_id_5, -100)}; + REQUIRE(std::vector{-1} == tree.get_reversed_positions(node_id_5)); + REQUIRE(std::vector{-100, -1} == tree.get_reversed_positions(node_id_6)); } SECTION("Invalid index access throws correctly") { @@ -56,23 +56,23 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { tree.set(0, 10); // Test updates to different nodes - auto const index_1{tree.insert(0, 4)}; - auto const index_2{tree.insert(index_1, 7)}; - tree.set(index_1, 10); - tree.set(index_2, 12); - REQUIRE(std::vector{10} == tree.get_reversed_positions(index_1)); - REQUIRE(std::vector{12, 10} == tree.get_reversed_positions(index_2)); + auto const node_id_1{tree.insert(0, 4)}; + auto const node_id_2{tree.insert(node_id_1, 7)}; + tree.set(node_id_1, 10); + tree.set(node_id_2, 12); + REQUIRE(std::vector{10} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{12, 10} == tree.get_reversed_positions(node_id_2)); // Test multiple updates to the same node - tree.set(index_2, 15); - tree.set(index_2, 20); - REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(index_2)); + tree.set(node_id_2, 15); + tree.set(node_id_2, 20); + REQUIRE(std::vector{20, 10} == 
tree.get_reversed_positions(node_id_2)); // Test that updates don't affect unrelated paths - auto const index_3{tree.insert(0, 30)}; - tree.set(index_3, 25); - REQUIRE(std::vector{10} == tree.get_reversed_positions(index_1)); - REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(index_2)); + auto const node_id_3{tree.insert(0, 30)}; + tree.set(node_id_3, 25); + REQUIRE(std::vector{10} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(node_id_2)); } SECTION("Set position for an invalid index throws correctly") { @@ -82,7 +82,7 @@ TEST_CASE("Prefix tree operations", "[PrefixTree]") { REQUIRE_THROWS_AS(tree.set(100, 20), std::out_of_range); // Test setting position just beyond valid range - auto const index_1{tree.insert(0, 4)}; - REQUIRE_THROWS_AS(tree.set(index_1 + 1, 20), std::out_of_range); + auto const node_id_1{tree.insert(0, 4)}; + REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, 20), std::out_of_range); } } From e103011e1cd46382815a756e5fa2da21c98fd5ea Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 07:04:29 -0500 Subject: [PATCH 295/323] Update get_reversed_positions' docstring. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 53e72037..47de73e4 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -44,8 +44,6 @@ class PrefixTree { } /** - * Retrieves a vector of positions in reverse order by traversing from the given index to the - * root. * @param node_id The index of the node to start the traversal from. * @return A vector containing positions in reverse order from the given index to root. * @throw std::out_of_range if the index is out of range. 
From 31b03465af5d4cca352e21344e1fb4669ce2d230 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 07:11:08 -0500 Subject: [PATCH 296/323] Update get_reversed_positions' docstring to clarify exclusivity of the root. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 47de73e4..47c7bdb8 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -44,8 +44,9 @@ class PrefixTree { } /** - * @param node_id The index of the node to start the traversal from. - * @return A vector containing positions in reverse order from the given index to root. + * @param node_id The index of the node. + * @return A vector containing positions in the path defined by `node_id`, in reverse order, + * i.e., [index, root). * @throw std::out_of_range if the index is out of range. */ [[nodiscard]] auto get_reversed_positions(id_t node_id) const -> std::vector; From 4005e41c27074cb233a7ba7a335c9544afbb86e1 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 07:11:57 -0500 Subject: [PATCH 297/323] Grammar fix. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 47c7bdb8..79e8cd17 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -45,7 +45,7 @@ class PrefixTree { /** * @param node_id The index of the node. - * @return A vector containing positions in the path defined by `node_id`, in reverse order, + * @return A vector containing positions along the path defined by `node_id`, in reverse order, * i.e., [index, root). * @throw std::out_of_range if the index is out of range.
*/ From e38940c3b38e3bc4422485adbb8805aad3f2c050 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 07:15:17 -0500 Subject: [PATCH 298/323] Add maybe_unused. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 79e8cd17..d6f74eef 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -30,7 +30,7 @@ class PrefixTree { * @return The index of the newly inserted node in the tree. * @throw std::out_of_range if the parent's index is out of range. */ - auto insert(id_t const parent_node_id, position_t const position) -> id_t { + [[maybe_unused]] auto insert(id_t const parent_node_id, position_t const position) -> id_t { if (m_nodes.size() <= parent_node_id) { throw std::out_of_range("Predecessor index out of range."); } From d71368d36c5b1d7160a1e27ef3a129e2165e4545 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 4 Dec 2024 07:21:37 -0500 Subject: [PATCH 299/323] Update src/log_surgeon/finite_automata/RegisterHandler.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 3e85cd84..3c61bdc5 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -34,8 +34,8 @@ class RegisterHandler { } auto append_position(uint32_t const reg_id, PrefixTree::position_t const position) -> void { - auto& reg{m_registers.at(reg_id)}; - reg = m_prefix_tree.insert(reg, position); + auto const node_id{m_registers.at(reg_id)}; + m_registers.at(reg_id) = m_prefix_tree.insert(node_id, position); } [[nodiscard]] auto
get_reversed_positions(uint32_t const reg_id From dd4b6e1a2a644b624928e4b4358fba05ae31b11f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 07:23:34 -0500 Subject: [PATCH 300/323] Update test case names to document code names better. --- tests/test-prefix-tree.cpp | 2 +- tests/test-register-handler.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index ec19c156..153c077b 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -6,7 +6,7 @@ using log_surgeon::finite_automata::PrefixTree; -TEST_CASE("Prefix tree operations", "[PrefixTree]") { +TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { SECTION("Newly constructed tree works correctly") { PrefixTree const tree; diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 67fa3d95..8c470d98 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -8,7 +8,7 @@ using log_surgeon::finite_automata::RegisterHandler; using std::make_unique; using std::unique_ptr; -TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { +TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { RegisterHandler handler; SECTION("Initial state is empty") { @@ -40,7 +40,7 @@ TEST_CASE("RegisterHandler tests", "[RegisterHandler]") { REQUIRE(std::vector{5} == handler.get_reversed_positions(1)); } - SECTION("append_position appends position correctly") { + SECTION("`append_position` appends position correctly") { handler.set_register(0, 5); handler.append_position(0, 7); REQUIRE(std::vector{7, 5} == handler.get_reversed_positions(0)); From 7322852be0525d9284f2e2d4c41f86b210b1531e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 07:32:58 -0500 Subject: [PATCH 301/323] Implicitly use auto in vectors.
--- tests/test-prefix-tree.cpp | 29 ++++++++++++++++------------- tests/test-register-handler.cpp | 8 ++++---- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 153c077b..3f16b5fd 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -5,6 +5,8 @@ #include using log_surgeon::finite_automata::PrefixTree; +using id_t = PrefixTree::id_t; +using position_t = PrefixTree::position_t; TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { SECTION("Newly constructed tree works correctly") { @@ -21,19 +23,20 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { auto const node_id_1{tree.insert(0, 4)}; auto const node_id_2{tree.insert(node_id_1, 7)}; auto const node_id_3{tree.insert(node_id_2, 9)}; - REQUIRE(std::vector{4} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{7, 4} == tree.get_reversed_positions(node_id_2)); - REQUIRE(std::vector{9, 7, 4} == tree.get_reversed_positions(node_id_3)); + REQUIRE(std::vector{4} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{7, 4} == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{9, 7, 4} == tree.get_reversed_positions(node_id_3)); // Test insertion with large position values - auto const node_id_4{tree.insert(0, std::numeric_limits::max())}; - REQUIRE(std::numeric_limits::max() == tree.get_reversed_positions(node_id_4)[0]); + auto const node_id_4{tree.insert(0, std::numeric_limits::max())}; + REQUIRE(std::numeric_limits::max() == tree.get_reversed_positions(node_id_4)[0] + ); // Test insertion with negative position values auto const node_id_5{tree.insert(0, -1)}; auto const node_id_6{tree.insert(node_id_5, -100)}; - REQUIRE(std::vector{-1} == tree.get_reversed_positions(node_id_5)); - REQUIRE(std::vector{-100, -1} == tree.get_reversed_positions(node_id_6)); + REQUIRE(std::vector{-1} == tree.get_reversed_positions(node_id_5)); + REQUIRE(std::vector{-100, -1} == 
tree.get_reversed_positions(node_id_6)); } SECTION("Invalid index access throws correctly") { @@ -45,7 +48,7 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { REQUIRE_THROWS_AS(tree.get_reversed_positions(3), std::out_of_range); REQUIRE_THROWS_AS( - tree.get_reversed_positions(std::numeric_limits::max()), + tree.get_reversed_positions(std::numeric_limits::max()), std::out_of_range ); } @@ -60,19 +63,19 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { auto const node_id_2{tree.insert(node_id_1, 7)}; tree.set(node_id_1, 10); tree.set(node_id_2, 12); - REQUIRE(std::vector{10} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{12, 10} == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{10} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{12, 10} == tree.get_reversed_positions(node_id_2)); // Test multiple updates to the same node tree.set(node_id_2, 15); tree.set(node_id_2, 20); - REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(node_id_2)); // Test that updates don't affect unrelated paths auto const node_id_3{tree.insert(0, 30)}; tree.set(node_id_3, 25); - REQUIRE(std::vector{10} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{10} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(node_id_2)); } SECTION("Set position for an invalid index throws correctly") { diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 8c470d98..3ba5de4d 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -22,7 +22,7 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { SECTION("Set register position correctly") { handler.set_register(0, 5); - REQUIRE(std::vector{5} == handler.get_reversed_positions(0)); + REQUIRE(std::vector{5} == 
handler.get_reversed_positions(0)); } SECTION("Register relationships are maintained") { @@ -31,19 +31,19 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { handler.set_register(2, 15); auto positions{handler.get_reversed_positions(2)}; - REQUIRE(std::vector{15, 10, 5} == handler.get_reversed_positions(2)); + REQUIRE(std::vector{15, 10, 5} == handler.get_reversed_positions(2)); } SECTION("Copy register index correctly") { handler.set_register(0, 5); handler.copy_register(1, 0); - REQUIRE(std::vector{5} == handler.get_reversed_positions(1)); + REQUIRE(std::vector{5} == handler.get_reversed_positions(1)); } SECTION("`append_position` appends position correctly") { handler.set_register(0, 5); handler.append_position(0, 7); - REQUIRE(std::vector{7, 5} == handler.get_reversed_positions(0)); + REQUIRE(std::vector{7, 5} == handler.get_reversed_positions(0)); } SECTION("Throws out of range correctly") { From dba1a183aae90eaf4e557f37e1efab10fd70ed5a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 07:36:42 -0500 Subject: [PATCH 302/323] Explicitly use position_t for vectors.
--- tests/test-prefix-tree.cpp | 20 ++++++++++---------- tests/test-register-handler.cpp | 10 ++++++---- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 3f16b5fd..e11b749e 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -23,9 +23,9 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { auto const node_id_1{tree.insert(0, 4)}; auto const node_id_2{tree.insert(node_id_1, 7)}; auto const node_id_3{tree.insert(node_id_2, 9)}; - REQUIRE(std::vector{4} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{7, 4} == tree.get_reversed_positions(node_id_2)); - REQUIRE(std::vector{9, 7, 4} == tree.get_reversed_positions(node_id_3)); + REQUIRE(std::vector{4} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{7, 4} == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{9, 7, 4} == tree.get_reversed_positions(node_id_3)); // Test insertion with large position values auto const node_id_4{tree.insert(0, std::numeric_limits::max())}; @@ -35,8 +35,8 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { // Test insertion with negative position values auto const node_id_5{tree.insert(0, -1)}; auto const node_id_6{tree.insert(node_id_5, -100)}; - REQUIRE(std::vector{-1} == tree.get_reversed_positions(node_id_5)); - REQUIRE(std::vector{-100, -1} == tree.get_reversed_positions(node_id_6)); + REQUIRE(std::vector{-1} == tree.get_reversed_positions(node_id_5)); + REQUIRE(std::vector{-100, -1} == tree.get_reversed_positions(node_id_6)); } SECTION("Invalid index access throws correctly") { @@ -63,19 +63,19 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { auto const node_id_2{tree.insert(node_id_1, 7)}; tree.set(node_id_1, 10); tree.set(node_id_2, 12); - REQUIRE(std::vector{10} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{12, 10} == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{10} == 
tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{12, 10} == tree.get_reversed_positions(node_id_2)); // Test multiple updates to the same node tree.set(node_id_2, 15); tree.set(node_id_2, 20); - REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(node_id_2)); // Test that updates don't affect unrelated paths auto const node_id_3{tree.insert(0, 30)}; tree.set(node_id_3, 25); - REQUIRE(std::vector{10} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{10} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(node_id_2)); } SECTION("Set position for an invalid index throws correctly") { diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 3ba5de4d..815e6f2f 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -2,10 +2,12 @@ #include +#include #include using log_surgeon::finite_automata::RegisterHandler; using std::make_unique; +using position_t = log_surgeon::finite_automata::PrefixTree::position_t; using std::unique_ptr; TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { @@ -22,7 +24,7 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { SECTION("Set register position correctly") { handler.set_register(0, 5); - REQUIRE(std::vector{5} == handler.get_reversed_positions(0)); + REQUIRE(std::vector{5} == handler.get_reversed_positions(0)); } SECTION("Register relationships are maintained") { @@ -31,19 +33,19 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { handler.set_register(2, 15); auto positions{handler.get_reversed_positions(2)}; - REQUIRE(std::vector{15, 10, 5} == handler.get_reversed_positions(2)); + REQUIRE(std::vector{15, 10, 5} == handler.get_reversed_positions(2)); } SECTION("Copy register index correctly") { handler.set_register(0, 
5); handler.copy_register(1, 0); - REQUIRE(std::vector{5} == handler.get_reversed_positions(1)); + REQUIRE(std::vector{5} == handler.get_reversed_positions(1)); } SECTION("`append_position` appends position correctly") { handler.set_register(0, 5); handler.append_position(0, 7); - REQUIRE(std::vector{7, 5} == handler.get_reversed_positions(0)); + REQUIRE(std::vector{7, 5} == handler.get_reversed_positions(0)); } SECTION("Throws out of range correctly") { From ee6efab78936cc32722f488724bab93478cd8c03 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Wed, 4 Dec 2024 07:37:12 -0500 Subject: [PATCH 303/323] Update tests/test-register-handler.cpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- tests/test-register-handler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 815e6f2f..8da726ca 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -18,7 +18,7 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { } constexpr uint32_t num_registers{5}; - for (uint32_t i = 0; i < num_registers; i++) { + for (uint32_t i{0}; i < num_registers; ++i) { handler.add_register(i, 0); } From 9ba980cc4492b7f3bba0667caea4667514d538be Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 07:40:49 -0500 Subject: [PATCH 304/323] Switch to size_t. 
--- src/log_surgeon/finite_automata/RegisterHandler.hpp | 8 ++++---- tests/test-register-handler.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 3c61bdc5..6690fef0 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -25,20 +25,20 @@ class RegisterHandler { m_registers.emplace_back(prefix_tree_node_id); } - auto set_register(uint32_t const reg_id, PrefixTree::position_t const position) -> void { + auto set_register(size_t const reg_id, PrefixTree::position_t const position) -> void { m_prefix_tree.set(m_registers.at(reg_id), position); } - auto copy_register(uint32_t const dest_reg_id, uint32_t const source_reg_id) -> void { + auto copy_register(size_t const dest_reg_id, size_t const source_reg_id) -> void { m_registers.at(dest_reg_id) = m_registers.at(source_reg_id); } - auto append_position(uint32_t const reg_id, PrefixTree::position_t const position) -> void { + auto append_position(size_t const reg_id, PrefixTree::position_t const position) -> void { auto const node_id{m_registers.at(reg_id)}; m_registers.at(reg_id) = m_prefix_tree.insert(node_id, position); } - [[nodiscard]] auto get_reversed_positions(uint32_t const reg_id + [[nodiscard]] auto get_reversed_positions(size_t const reg_id ) const -> std::vector { return m_prefix_tree.get_reversed_positions(m_registers.at(reg_id)); } diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 8da726ca..7c413890 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -18,7 +18,7 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { } constexpr uint32_t num_registers{5}; - for (uint32_t i{0}; i < num_registers; ++i) { + for (size_t i{0}; i < num_registers; ++i) { handler.add_register(i, 0); } From 27b324c762a0355bbac94a806f7c4e5147e29b83 Mon Sep 
17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 09:21:06 -0500 Subject: [PATCH 305/323] Clang-tidy: Remove magic numbers + Fix headers. --- .../finite_automata/RegisterHandler.hpp | 2 +- tests/test-prefix-tree.cpp | 81 ++++++++++++------- tests/test-register-handler.cpp | 61 ++++++++------ 3 files changed, 87 insertions(+), 57 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 6690fef0..2c245907 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -1,7 +1,7 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP #define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP -#include +#include #include #include diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index e11b749e..988961df 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -1,4 +1,6 @@ -#include +#include +#include +#include #include @@ -8,6 +10,18 @@ using log_surgeon::finite_automata::PrefixTree; using id_t = PrefixTree::id_t; using position_t = PrefixTree::position_t; +constexpr id_t cInvaidNodeId{100}; +constexpr position_t cInsertPos1{4}; +constexpr position_t cInsertPos2{7}; +constexpr position_t cInsertPos3{9}; +constexpr position_t cMaxPos{std::numeric_limits::max()}; +constexpr position_t cNegativePos1{-1}; +constexpr position_t cNegativePos2{-100}; +constexpr position_t cSetPos1{10}; +constexpr position_t cSetPos2{12}; +constexpr position_t cSetPos3{15}; +constexpr position_t cSetPos4{20}; + TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { SECTION("Newly constructed tree works correctly") { PrefixTree const tree; @@ -20,23 +34,25 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { PrefixTree tree; // Test basic insertions - auto const node_id_1{tree.insert(0, 4)}; - auto const node_id_2{tree.insert(node_id_1, 7)}; - auto const node_id_3{tree.insert(node_id_2, 9)}; - 
REQUIRE(std::vector{4} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{7, 4} == tree.get_reversed_positions(node_id_2)); - REQUIRE(std::vector{9, 7, 4} == tree.get_reversed_positions(node_id_3)); + auto const node_id_1{tree.insert(0, cInsertPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInsertPos2)}; + auto const node_id_3{tree.insert(node_id_2, cInsertPos3)}; + REQUIRE(std::vector{cInsertPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cInsertPos2, cInsertPos1} + == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{cInsertPos3, cInsertPos2, cInsertPos1} + == tree.get_reversed_positions(node_id_3)); // Test insertion with large position values - auto const node_id_4{tree.insert(0, std::numeric_limits::max())}; - REQUIRE(std::numeric_limits::max() == tree.get_reversed_positions(node_id_4)[0] - ); + auto const node_id_4{tree.insert(0, cMaxPos)}; + REQUIRE(cMaxPos == tree.get_reversed_positions(node_id_4)[0]); // Test insertion with negative position values - auto const node_id_5{tree.insert(0, -1)}; - auto const node_id_6{tree.insert(node_id_5, -100)}; - REQUIRE(std::vector{-1} == tree.get_reversed_positions(node_id_5)); - REQUIRE(std::vector{-100, -1} == tree.get_reversed_positions(node_id_6)); + auto const node_id_5{tree.insert(0, cNegativePos1)}; + auto const node_id_6{tree.insert(node_id_5, cNegativePos2)}; + REQUIRE(std::vector{cNegativePos1} == tree.get_reversed_positions(node_id_5)); + REQUIRE(std::vector{cNegativePos2, cNegativePos1} + == tree.get_reversed_positions(node_id_6)); } SECTION("Invalid index access throws correctly") { @@ -56,36 +72,39 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { SECTION("Set position for a valid index works correctly") { PrefixTree tree; // Test that you can set the root node for sanity, although this value is not used - tree.set(0, 10); + tree.set(0, cSetPos1); // Test updates to different nodes - auto const node_id_1{tree.insert(0, 4)}; - auto const 
node_id_2{tree.insert(node_id_1, 7)}; - tree.set(node_id_1, 10); - tree.set(node_id_2, 12); - REQUIRE(std::vector{10} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{12, 10} == tree.get_reversed_positions(node_id_2)); + auto const node_id_1{tree.insert(0, cInsertPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInsertPos1)}; + tree.set(node_id_1, cSetPos1); + tree.set(node_id_2, cSetPos2); + REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cSetPos2, cSetPos1} + == tree.get_reversed_positions(node_id_2)); // Test multiple updates to the same node - tree.set(node_id_2, 15); - tree.set(node_id_2, 20); - REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(node_id_2)); + tree.set(node_id_2, cSetPos3); + tree.set(node_id_2, cSetPos4); + REQUIRE(std::vector{cSetPos4, cSetPos1} + == tree.get_reversed_positions(node_id_2)); // Test that updates don't affect unrelated paths - auto const node_id_3{tree.insert(0, 30)}; - tree.set(node_id_3, 25); - REQUIRE(std::vector{10} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{20, 10} == tree.get_reversed_positions(node_id_2)); + auto const node_id_3{tree.insert(0, cSetPos2)}; + tree.set(node_id_3, cSetPos3); + REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cSetPos4, cSetPos1} + == tree.get_reversed_positions(node_id_2)); } SECTION("Set position for an invalid index throws correctly") { PrefixTree tree; // Test setting position before any insertions - REQUIRE_THROWS_AS(tree.set(100, 20), std::out_of_range); + REQUIRE_THROWS_AS(tree.set(cInvaidNodeId, cSetPos4), std::out_of_range); // Test setting position just beyond valid range - auto const node_id_1{tree.insert(0, 4)}; - REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, 20), std::out_of_range); + auto const node_id_1{tree.insert(0, cInsertPos1)}; + REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos4), std::out_of_range); } } diff --git 
a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 7c413890..4f741961 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -1,4 +1,6 @@ -#include +#include +#include +#include #include @@ -6,53 +8,62 @@ #include using log_surgeon::finite_automata::RegisterHandler; -using std::make_unique; using position_t = log_surgeon::finite_automata::PrefixTree::position_t; -using std::unique_ptr; + +constexpr position_t cInitialPos{0}; +constexpr position_t cSetPos1{5}; +constexpr position_t cSetPos2{10}; +constexpr position_t cSetPos3{15}; +constexpr size_t cNumRegisters{5}; +constexpr size_t cRegId1{0}; +constexpr size_t cRegId2{1}; +constexpr size_t cRegId3{2}; +constexpr size_t cInvalidRegId{10}; TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { RegisterHandler handler; SECTION("Initial state is empty") { - REQUIRE_THROWS_AS(handler.get_reversed_positions(0), std::out_of_range); + REQUIRE_THROWS_AS(handler.get_reversed_positions(cRegId1), std::out_of_range); } - constexpr uint32_t num_registers{5}; - for (size_t i{0}; i < num_registers; ++i) { - handler.add_register(i, 0); + for (size_t i{0}; i < cNumRegisters; ++i) { + handler.add_register(i, cInitialPos); } SECTION("Set register position correctly") { - handler.set_register(0, 5); - REQUIRE(std::vector{5} == handler.get_reversed_positions(0)); + handler.set_register(cRegId1, cSetPos1); + REQUIRE(std::vector{cSetPos1} == handler.get_reversed_positions(cRegId1)); } SECTION("Register relationships are maintained") { - handler.set_register(0, 5); - handler.set_register(1, 10); - handler.set_register(2, 15); + handler.set_register(cRegId1, cSetPos1); + handler.set_register(cRegId2, cSetPos2); + handler.set_register(cRegId3, cSetPos3); - auto positions{handler.get_reversed_positions(2)}; - REQUIRE(std::vector{15, 10, 5} == handler.get_reversed_positions(2)); + auto positions{handler.get_reversed_positions(cRegId3)}; + REQUIRE(std::vector{cSetPos3, cSetPos2, 
cSetPos1} + == handler.get_reversed_positions(cRegId3)); } SECTION("Copy register index correctly") { - handler.set_register(0, 5); - handler.copy_register(1, 0); - REQUIRE(std::vector{5} == handler.get_reversed_positions(1)); + handler.set_register(cRegId1, cSetPos1); + handler.copy_register(cRegId2, cRegId1); + REQUIRE(std::vector{cSetPos1} == handler.get_reversed_positions(cRegId2)); } SECTION("`append_position` appends position correctly") { - handler.set_register(0, 5); - handler.append_position(0, 7); - REQUIRE(std::vector{7, 5} == handler.get_reversed_positions(0)); + handler.set_register(cRegId1, cSetPos1); + handler.append_position(cRegId1, cSetPos2); + REQUIRE(std::vector{cSetPos2, cSetPos1} + == handler.get_reversed_positions(cRegId1)); } SECTION("Throws out of range correctly") { - REQUIRE_THROWS_AS(handler.set_register(10, 5), std::out_of_range); - REQUIRE_THROWS_AS(handler.copy_register(10, 1), std::out_of_range); - REQUIRE_THROWS_AS(handler.copy_register(0, 10), std::out_of_range); - REQUIRE_THROWS_AS(handler.append_position(10, 5), std::out_of_range); - REQUIRE_THROWS_AS(handler.get_reversed_positions(10), std::out_of_range); + REQUIRE_THROWS_AS(handler.set_register(cInvalidRegId, cSetPos1), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(cInvalidRegId, cRegId2), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(cRegId1, cInvalidRegId), std::out_of_range); + REQUIRE_THROWS_AS(handler.append_position(cInvalidRegId, cSetPos1), std::out_of_range); + REQUIRE_THROWS_AS(handler.get_reversed_positions(cInvalidRegId), std::out_of_range); } } From f651a24f677246a56c3262eff5f5b71a5af02604 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 09:32:51 -0500 Subject: [PATCH 306/323] Reduce complexity for clang-tidy. 
--- tests/test-register-handler.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 4f741961..4ec3ccf1 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -20,6 +20,14 @@ constexpr size_t cRegId2{1}; constexpr size_t cRegId3{2}; constexpr size_t cInvalidRegId{10}; +namespace { +auto add_register_to_handler(RegisterHandler& handler) -> void { + for (size_t i{0}; i < cNumRegisters; ++i) { + handler.add_register(i, 0); + } +} +} // namespace + TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { RegisterHandler handler; @@ -27,9 +35,7 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { REQUIRE_THROWS_AS(handler.get_reversed_positions(cRegId1), std::out_of_range); } - for (size_t i{0}; i < cNumRegisters; ++i) { - handler.add_register(i, cInitialPos); - } + add_register_to_handler(handler); SECTION("Set register position correctly") { handler.set_register(cRegId1, cSetPos1); From fc6f4262a9bec09a98da6e89a619c8082aa8164f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 09:37:07 -0500 Subject: [PATCH 307/323] Add negative pos test case in test-register-handler.cpp. 
--- tests/test-register-handler.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 4ec3ccf1..b3a71f37 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -11,6 +11,8 @@ using log_surgeon::finite_automata::RegisterHandler; using position_t = log_surgeon::finite_automata::PrefixTree::position_t; constexpr position_t cInitialPos{0}; +constexpr position_t cNegativePos1{-1}; +constexpr position_t cNegativePos2{-100}; constexpr position_t cSetPos1{5}; constexpr position_t cSetPos2{10}; constexpr position_t cSetPos3{15}; @@ -72,4 +74,12 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { REQUIRE_THROWS_AS(handler.append_position(cInvalidRegId, cSetPos1), std::out_of_range); REQUIRE_THROWS_AS(handler.get_reversed_positions(cInvalidRegId), std::out_of_range); } + + SECTION("Handles negative position values correctly") { + handler.set_register(cRegId1, cNegativePos1); + handler.append_position(cRegId1, cSetPos1); + handler.append_position(cRegId1, cNegativePos2); + REQUIRE(std::vector{cNegativePos2, cSetPos1, cNegativePos1} + == handler.get_reversed_positions(cRegId1)); + } } From c8fb570a95a58e26d422009d86154c4b2f45d80a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 09:39:45 -0500 Subject: [PATCH 308/323] Alternate b/w positive and negative positions in test-prefix-tree negative position test as this is what is seen in practice when using negative positions. 
--- tests/test-prefix-tree.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 988961df..eb748a58 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -49,10 +49,13 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { // Test insertion with negative position values auto const node_id_5{tree.insert(0, cNegativePos1)}; - auto const node_id_6{tree.insert(node_id_5, cNegativePos2)}; + auto const node_id_6{tree.insert(node_id_5, cInsertPos1)}; + auto const node_id_7{tree.insert(node_id_6, cNegativePos2)}; REQUIRE(std::vector{cNegativePos1} == tree.get_reversed_positions(node_id_5)); - REQUIRE(std::vector{cNegativePos2, cNegativePos1} + REQUIRE(std::vector{cInsertPos1, cNegativePos1} == tree.get_reversed_positions(node_id_6)); + REQUIRE(std::vector{cNegativePos2, cInsertPos1, cNegativePos1} + == tree.get_reversed_positions(node_id_7)); } SECTION("Invalid index access throws correctly") { From 1f66918ea8aae2d4b1030d55cfc9ad986f4dd233 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 09:58:53 -0500 Subject: [PATCH 309/323] Add cRootId and size() to PrefixTree. 
--- .../finite_automata/PrefixTree.hpp | 4 +++ tests/test-prefix-tree.cpp | 28 +++++++++++-------- tests/test-register-handler.cpp | 2 +- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index d6f74eef..7a76c4a4 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -22,6 +22,8 @@ class PrefixTree { using id_t = uint32_t; using position_t = int32_t; + static constexpr id_t cRootId{0}; + PrefixTree() : m_nodes{{std::nullopt, -1}} {} /** @@ -43,6 +45,8 @@ class PrefixTree { m_nodes.at(node_id).set_position(position); } + [[nodiscard]] auto size() const -> size_t { return m_nodes.size(); } + /** * @param node_id The index of the node. * @return A vector containing positions along the path defined by `node_id`, in reverse order, diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index eb748a58..47262bdd 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -10,6 +10,7 @@ using log_surgeon::finite_automata::PrefixTree; using id_t = PrefixTree::id_t; using position_t = PrefixTree::position_t; +constexpr auto cRootId{PrefixTree::cRootId}; constexpr id_t cInvaidNodeId{100}; constexpr position_t cInsertPos1{4}; constexpr position_t cInsertPos2{7}; @@ -21,20 +22,22 @@ constexpr position_t cSetPos1{10}; constexpr position_t cSetPos2{12}; constexpr position_t cSetPos3{15}; constexpr position_t cSetPos4{20}; +constexpr position_t cTreeSize1{4}; +constexpr position_t cTreeSize2{8}; TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { SECTION("Newly constructed tree works correctly") { PrefixTree const tree; // A newly constructed tree should return no positions as the root node is ignored - REQUIRE(tree.get_reversed_positions(0).empty()); + REQUIRE(tree.get_reversed_positions(cRootId).empty()); } SECTION("Inserting nodes into the prefix tree works correctly") { 
PrefixTree tree; // Test basic insertions - auto const node_id_1{tree.insert(0, cInsertPos1)}; + auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; auto const node_id_2{tree.insert(node_id_1, cInsertPos2)}; auto const node_id_3{tree.insert(node_id_2, cInsertPos3)}; REQUIRE(std::vector{cInsertPos1} == tree.get_reversed_positions(node_id_1)); @@ -42,13 +45,14 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { == tree.get_reversed_positions(node_id_2)); REQUIRE(std::vector{cInsertPos3, cInsertPos2, cInsertPos1} == tree.get_reversed_positions(node_id_3)); + REQUIRE(cTreeSize1 == tree.size()); // Test insertion with large position values - auto const node_id_4{tree.insert(0, cMaxPos)}; + auto const node_id_4{tree.insert(cRootId, cMaxPos)}; REQUIRE(cMaxPos == tree.get_reversed_positions(node_id_4)[0]); // Test insertion with negative position values - auto const node_id_5{tree.insert(0, cNegativePos1)}; + auto const node_id_5{tree.insert(cRootId, cNegativePos1)}; auto const node_id_6{tree.insert(node_id_5, cInsertPos1)}; auto const node_id_7{tree.insert(node_id_6, cNegativePos2)}; REQUIRE(std::vector{cNegativePos1} == tree.get_reversed_positions(node_id_5)); @@ -56,15 +60,15 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { == tree.get_reversed_positions(node_id_6)); REQUIRE(std::vector{cNegativePos2, cInsertPos1, cNegativePos1} == tree.get_reversed_positions(node_id_7)); + REQUIRE(cTreeSize2 == tree.size()); } SECTION("Invalid index access throws correctly") { PrefixTree tree; - REQUIRE_THROWS_AS(tree.get_reversed_positions(1), std::out_of_range); + REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); - tree.insert(0, 4); - REQUIRE_THROWS_AS(tree.get_reversed_positions(2), std::out_of_range); - REQUIRE_THROWS_AS(tree.get_reversed_positions(3), std::out_of_range); + tree.insert(cRootId, cInsertPos1); + REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); REQUIRE_THROWS_AS( 
tree.get_reversed_positions(std::numeric_limits::max()), @@ -75,10 +79,10 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { SECTION("Set position for a valid index works correctly") { PrefixTree tree; // Test that you can set the root node for sanity, although this value is not used - tree.set(0, cSetPos1); + tree.set(cRootId, cSetPos1); // Test updates to different nodes - auto const node_id_1{tree.insert(0, cInsertPos1)}; + auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; auto const node_id_2{tree.insert(node_id_1, cInsertPos1)}; tree.set(node_id_1, cSetPos1); tree.set(node_id_2, cSetPos2); @@ -93,7 +97,7 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { == tree.get_reversed_positions(node_id_2)); // Test that updates don't affect unrelated paths - auto const node_id_3{tree.insert(0, cSetPos2)}; + auto const node_id_3{tree.insert(cRootId, cSetPos2)}; tree.set(node_id_3, cSetPos3); REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); REQUIRE(std::vector{cSetPos4, cSetPos1} @@ -107,7 +111,7 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { REQUIRE_THROWS_AS(tree.set(cInvaidNodeId, cSetPos4), std::out_of_range); // Test setting position just beyond valid range - auto const node_id_1{tree.insert(0, cInsertPos1)}; + auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos4), std::out_of_range); } } diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index b3a71f37..9cec3ff5 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -25,7 +25,7 @@ constexpr size_t cInvalidRegId{10}; namespace { auto add_register_to_handler(RegisterHandler& handler) -> void { for (size_t i{0}; i < cNumRegisters; ++i) { - handler.add_register(i, 0); + handler.add_register(i, cInitialPos); } } } // namespace From a388c809a60ed9a8072e6301fc2e17354a096972 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 10:06:41 -0500 
Subject: [PATCH 310/323] Update note. --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 2c245907..86a06f21 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -12,8 +12,8 @@ namespace log_surgeon::finite_automata { * The register handler also contains a vector of registers, and performs the set, copy, and append * operations for these registers. * - * Note: for efficiency these registers may be re-used, but are not required to be re-initialized. - * It is the responsibility of the DFA to set the register value when needed. + * NOTE: For efficiency, registers are not initialized when lexing a new string; instead, it is the + * responsibility of the DFA to set the register values when needed. */ class RegisterHandler { public: From 340eaf7ae986bdd09fa692776c7b9eddd3780e27 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 10:10:01 -0500 Subject: [PATCH 311/323] Update docstring. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 7a76c4a4..a111cee7 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -49,8 +49,8 @@ class PrefixTree { /** * @param node_id The index of the node. - * @return A vector containing positions along the path defined by `node_id`, in reverse order, - * i.e., [index, root). + * @return A vector containing positions in order from the given index up to but not including + * the root node. * @throw std::out_of_range if the index is out of range. 
*/ [[nodiscard]] auto get_reversed_positions(id_t node_id) const -> std::vector; From 22cf931a3333a32c6543458cc0be1ca1ff2bccf1 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Dec 2024 10:12:10 -0500 Subject: [PATCH 312/323] Fix typo. --- src/log_surgeon/finite_automata/PrefixTree.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index a111cee7..815c7dda 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -9,7 +9,7 @@ namespace log_surgeon::finite_automata { /** * Represents a prefix tree to store register data during TDFA simulation. Each node in the tree - * stores a single posiiton in the lexed string. Each path from the root to an index corresponds to + * stores a single position in the lexed string. Each path from the root to an index corresponds to * a sequence of positions for an individual tag: * - Positive position node: Indicates the tag was matched at the position. * - Negative position node: Indicates the tag was unmatched. If a negative node is the entire path, From c61f2d96b562c79c4f37e570714d1df1640e99af Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 10:19:02 -0500 Subject: [PATCH 313/323] Update header for size_t. 
--- src/log_surgeon/finite_automata/PrefixTree.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 815c7dda..60ba1ac0 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -1,6 +1,7 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP #define LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP +#include #include #include #include From 417bde8e2ab694e1e892704d4cc58a3cc7012899 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 5 Dec 2024 10:19:36 -0500 Subject: [PATCH 314/323] Update src/log_surgeon/finite_automata/PrefixTree.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/PrefixTree.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 60ba1ac0..37637d87 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -14,9 +14,9 @@ namespace log_surgeon::finite_automata { * a sequence of positions for an individual tag: * - Positive position node: Indicates the tag was matched at the position. * - Negative position node: Indicates the tag was unmatched. If a negative node is the entire path, - * it indicates the tag was never matched. If the negative tag is along a path containing positive - * nodes, it functions as a placeholder. This can be useful for nested capture groups, to maintain a - * one-to-one mapping between the contained capture group and the enclosing capture group. + * it indicates the tag was never matched. If the negative tag is along a path containing positive + * nodes, it functions as a placeholder. 
This can be useful for nested capture groups, to maintain + * a one-to-one mapping between the contained capture group and the enclosing capture group. */ class PrefixTree { public: From 738876de2bd474f96748471ee240c7128e88edf2 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 5 Dec 2024 10:19:43 -0500 Subject: [PATCH 315/323] Update src/log_surgeon/finite_automata/PrefixTree.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/PrefixTree.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index 37637d87..b3ebfa2a 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -80,7 +80,6 @@ class PrefixTree { std::vector m_nodes; }; - } // namespace log_surgeon::finite_automata #endif // LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP From 93c03a0880a4490d9c7a3c00ec8a0b5bd785714d Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 5 Dec 2024 10:20:16 -0500 Subject: [PATCH 316/323] Update src/log_surgeon/finite_automata/RegisterHandler.hpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- src/log_surgeon/finite_automata/RegisterHandler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index 86a06f21..d61240e3 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -13,7 +13,7 @@ namespace log_surgeon::finite_automata { * operations for these registers. * * NOTE: For efficiency, registers are not initialized when lexing a new string; instead, it is the - * responsibility of the DFA to set the register values when needed. + * DFA's responsibility to set the register values when needed. 
*/ class RegisterHandler { public: From 6481e5f3eba7f4c1f6da8c6b63a68ea3ef5385c8 Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Thu, 5 Dec 2024 10:21:08 -0500 Subject: [PATCH 317/323] Update tests/test-prefix-tree.cpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- tests/test-prefix-tree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 47262bdd..629f8233 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -11,7 +11,7 @@ using id_t = PrefixTree::id_t; using position_t = PrefixTree::position_t; constexpr auto cRootId{PrefixTree::cRootId}; -constexpr id_t cInvaidNodeId{100}; +constexpr id_t cInvalidNodeId{100}; constexpr position_t cInsertPos1{4}; constexpr position_t cInsertPos2{7}; constexpr position_t cInsertPos3{9}; From 6a9a4a42991af6da998f33fca06f973bacd0b732 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 10:25:46 -0500 Subject: [PATCH 318/323] Clean up register initialization helper; Fix typo. 
--- tests/test-prefix-tree.cpp | 2 +- tests/test-register-handler.cpp | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 629f8233..27c79882 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -108,7 +108,7 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { PrefixTree tree; // Test setting position before any insertions - REQUIRE_THROWS_AS(tree.set(cInvaidNodeId, cSetPos4), std::out_of_range); + REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos4), std::out_of_range); // Test setting position just beyond valid range auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 9cec3ff5..6e9f0109 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -23,8 +23,14 @@ constexpr size_t cRegId3{2}; constexpr size_t cInvalidRegId{10}; namespace { -auto add_register_to_handler(RegisterHandler& handler) -> void { - for (size_t i{0}; i < cNumRegisters; ++i) { +/** + * @param handler The register handler that will contain the new registers. + * @param num_registers The number of registers to initialize. 
+ */ +auto registers_init(RegisterHandler& handler, size_t num_registers) -> void; + +auto registers_init(RegisterHandler& handler, size_t const num_registers) -> void { + for (size_t i{0}; i < num_registers; ++i) { handler.add_register(i, cInitialPos); } } @@ -37,7 +43,7 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { REQUIRE_THROWS_AS(handler.get_reversed_positions(cRegId1), std::out_of_range); } - add_register_to_handler(handler); + registers_init(handler, cNumRegisters); SECTION("Set register position correctly") { handler.set_register(cRegId1, cSetPos1); From 052d86fa9c4a7763c03f55e00fe0a6aeb2f9abb6 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 10:34:01 -0500 Subject: [PATCH 319/323] Update get_parent_id to clarify its unsafe and suppress warning. --- src/log_surgeon/finite_automata/PrefixTree.cpp | 2 +- src/log_surgeon/finite_automata/PrefixTree.hpp | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp index bf0705c8..4a652346 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.cpp +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -13,7 +13,7 @@ auto PrefixTree::get_reversed_positions(id_t const node_id) const -> std::vector auto current_node{m_nodes[node_id]}; while (false == current_node.is_root()) { reversed_positions.push_back(current_node.get_position()); - current_node = m_nodes[current_node.get_parent_node_id().value()]; + current_node = m_nodes[current_node.get_parent_id_unsafe()]; } return reversed_positions; } diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index b3ebfa2a..ab88d805 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -59,14 +59,15 @@ class PrefixTree { private: class Node { public: - Node(std::optional const parent_node_id, position_t const 
position) - : m_parent_node_id{parent_node_id}, + Node(std::optional const parent_id, position_t const position) + : m_parent_id{parent_id}, m_position{position} {} - [[nodiscard]] auto is_root() const -> bool { return false == m_parent_node_id.has_value(); } + [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); } - [[nodiscard]] auto get_parent_node_id() const -> std::optional { - return m_parent_node_id; + [[nodiscard]] auto get_parent_id_unsafe() const -> id_t { + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + return m_parent_id.value(); } auto set_position(position_t const position) -> void { m_position = position; } @@ -74,7 +75,7 @@ class PrefixTree { [[nodiscard]] auto get_position() const -> position_t { return m_position; } private: - std::optional m_parent_node_id; + std::optional m_parent_id; position_t m_position; }; From ed70bd5250a4919650df109e622248b7bbe37d63 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 11:01:59 -0500 Subject: [PATCH 320/323] Move constants in test-register-handler.hpp to minimize scope. 
--- tests/test-register-handler.cpp | 62 ++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 6e9f0109..2371fc9e 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -10,18 +10,6 @@ using log_surgeon::finite_automata::RegisterHandler; using position_t = log_surgeon::finite_automata::PrefixTree::position_t; -constexpr position_t cInitialPos{0}; -constexpr position_t cNegativePos1{-1}; -constexpr position_t cNegativePos2{-100}; -constexpr position_t cSetPos1{5}; -constexpr position_t cSetPos2{10}; -constexpr position_t cSetPos3{15}; -constexpr size_t cNumRegisters{5}; -constexpr size_t cRegId1{0}; -constexpr size_t cRegId2{1}; -constexpr size_t cRegId3{2}; -constexpr size_t cInvalidRegId{10}; - namespace { /** * @param handler The register handler that will contain the new registers. @@ -30,13 +18,20 @@ namespace { auto registers_init(RegisterHandler& handler, size_t num_registers) -> void; auto registers_init(RegisterHandler& handler, size_t const num_registers) -> void { + constexpr position_t cDefaultPos{0}; + for (size_t i{0}; i < num_registers; ++i) { - handler.add_register(i, cInitialPos); + handler.add_register(i, cDefaultPos); } } } // namespace TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { + constexpr position_t cInitialPos1{5}; + constexpr size_t cNumRegisters{5}; + constexpr size_t cRegId1{0}; + constexpr size_t cRegId2{1}; + RegisterHandler handler; SECTION("Initial state is empty") { @@ -46,46 +41,57 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { registers_init(handler, cNumRegisters); SECTION("Set register position correctly") { - handler.set_register(cRegId1, cSetPos1); - REQUIRE(std::vector{cSetPos1} == handler.get_reversed_positions(cRegId1)); + handler.set_register(cRegId1, cInitialPos1); + REQUIRE(std::vector{cInitialPos1} == handler.get_reversed_positions(cRegId1)); } 
SECTION("Register relationships are maintained") { - handler.set_register(cRegId1, cSetPos1); - handler.set_register(cRegId2, cSetPos2); - handler.set_register(cRegId3, cSetPos3); + constexpr position_t cInitialPos2{10}; + constexpr position_t cInitialPos3{15}; + constexpr size_t cRegId3{2}; + + handler.set_register(cRegId1, cInitialPos1); + handler.set_register(cRegId2, cInitialPos2); + handler.set_register(cRegId3, cInitialPos3); auto positions{handler.get_reversed_positions(cRegId3)}; - REQUIRE(std::vector{cSetPos3, cSetPos2, cSetPos1} + REQUIRE(std::vector{cInitialPos3, cInitialPos2, cInitialPos1} == handler.get_reversed_positions(cRegId3)); } SECTION("Copy register index correctly") { - handler.set_register(cRegId1, cSetPos1); + handler.set_register(cRegId1, cInitialPos1); handler.copy_register(cRegId2, cRegId1); - REQUIRE(std::vector{cSetPos1} == handler.get_reversed_positions(cRegId2)); + REQUIRE(std::vector{cInitialPos1} == handler.get_reversed_positions(cRegId2)); } SECTION("`append_position` appends position correctly") { - handler.set_register(cRegId1, cSetPos1); - handler.append_position(cRegId1, cSetPos2); - REQUIRE(std::vector{cSetPos2, cSetPos1} + constexpr position_t cAppendPos{10}; + + handler.set_register(cRegId1, cInitialPos1); + handler.append_position(cRegId1, cAppendPos); + REQUIRE(std::vector{cAppendPos, cInitialPos1} == handler.get_reversed_positions(cRegId1)); } SECTION("Throws out of range correctly") { - REQUIRE_THROWS_AS(handler.set_register(cInvalidRegId, cSetPos1), std::out_of_range); + constexpr size_t cInvalidRegId{10}; + + REQUIRE_THROWS_AS(handler.set_register(cInvalidRegId, cInitialPos1), std::out_of_range); REQUIRE_THROWS_AS(handler.copy_register(cInvalidRegId, cRegId2), std::out_of_range); REQUIRE_THROWS_AS(handler.copy_register(cRegId1, cInvalidRegId), std::out_of_range); - REQUIRE_THROWS_AS(handler.append_position(cInvalidRegId, cSetPos1), std::out_of_range); + REQUIRE_THROWS_AS(handler.append_position(cInvalidRegId, 
cInitialPos1), std::out_of_range); REQUIRE_THROWS_AS(handler.get_reversed_positions(cInvalidRegId), std::out_of_range); } SECTION("Handles negative position values correctly") { + constexpr position_t cNegativePos1{-1}; + constexpr position_t cNegativePos2{-100}; + handler.set_register(cRegId1, cNegativePos1); - handler.append_position(cRegId1, cSetPos1); + handler.append_position(cRegId1, cInitialPos1); handler.append_position(cRegId1, cNegativePos2); - REQUIRE(std::vector{cNegativePos2, cSetPos1, cNegativePos1} + REQUIRE(std::vector{cNegativePos2, cInitialPos1, cNegativePos1} == handler.get_reversed_positions(cRegId1)); } } From 1671e39943cedd001427076df92adafe8a91d1f8 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 18:15:16 -0500 Subject: [PATCH 321/323] Move constants into scope for test-prefix-tree.cpp. --- tests/test-prefix-tree.cpp | 63 ++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 27c79882..66d8f8a0 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -10,22 +10,11 @@ using log_surgeon::finite_automata::PrefixTree; using id_t = PrefixTree::id_t; using position_t = PrefixTree::position_t; -constexpr auto cRootId{PrefixTree::cRootId}; -constexpr id_t cInvalidNodeId{100}; -constexpr position_t cInsertPos1{4}; -constexpr position_t cInsertPos2{7}; -constexpr position_t cInsertPos3{9}; -constexpr position_t cMaxPos{std::numeric_limits::max()}; -constexpr position_t cNegativePos1{-1}; -constexpr position_t cNegativePos2{-100}; -constexpr position_t cSetPos1{10}; -constexpr position_t cSetPos2{12}; -constexpr position_t cSetPos3{15}; -constexpr position_t cSetPos4{20}; -constexpr position_t cTreeSize1{4}; -constexpr position_t cTreeSize2{8}; - TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { + constexpr auto cRootId{PrefixTree::cRootId}; + constexpr position_t cInitialPos1{4}; + constexpr position_t 
cSetPos1{10}; + SECTION("Newly constructed tree works correctly") { PrefixTree const tree; @@ -34,16 +23,24 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Inserting nodes into the prefix tree works correctly") { + constexpr position_t cInitialPos2{7}; + constexpr position_t cInitialPos3{9}; + constexpr position_t cMaxPos{std::numeric_limits::max()}; + constexpr position_t cNegativePos1{-1}; + constexpr position_t cNegativePos2{-100}; + constexpr position_t cTreeSize1{4}; + constexpr position_t cTreeSize2{8}; + PrefixTree tree; // Test basic insertions - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - auto const node_id_2{tree.insert(node_id_1, cInsertPos2)}; - auto const node_id_3{tree.insert(node_id_2, cInsertPos3)}; - REQUIRE(std::vector{cInsertPos1} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{cInsertPos2, cInsertPos1} + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos2)}; + auto const node_id_3{tree.insert(node_id_2, cInitialPos3)}; + REQUIRE(std::vector{cInitialPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cInitialPos2, cInitialPos1} == tree.get_reversed_positions(node_id_2)); - REQUIRE(std::vector{cInsertPos3, cInsertPos2, cInsertPos1} + REQUIRE(std::vector{cInitialPos3, cInitialPos2, cInitialPos1} == tree.get_reversed_positions(node_id_3)); REQUIRE(cTreeSize1 == tree.size()); @@ -53,12 +50,12 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { // Test insertion with negative position values auto const node_id_5{tree.insert(cRootId, cNegativePos1)}; - auto const node_id_6{tree.insert(node_id_5, cInsertPos1)}; + auto const node_id_6{tree.insert(node_id_5, cInitialPos1)}; auto const node_id_7{tree.insert(node_id_6, cNegativePos2)}; REQUIRE(std::vector{cNegativePos1} == tree.get_reversed_positions(node_id_5)); - REQUIRE(std::vector{cInsertPos1, cNegativePos1} + REQUIRE(std::vector{cInitialPos1, cNegativePos1} == 
tree.get_reversed_positions(node_id_6)); - REQUIRE(std::vector{cNegativePos2, cInsertPos1, cNegativePos1} + REQUIRE(std::vector{cNegativePos2, cInitialPos1, cNegativePos1} == tree.get_reversed_positions(node_id_7)); REQUIRE(cTreeSize2 == tree.size()); } @@ -67,7 +64,7 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { PrefixTree tree; REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); - tree.insert(cRootId, cInsertPos1); + tree.insert(cRootId, cInitialPos1); REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); REQUIRE_THROWS_AS( @@ -77,13 +74,17 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Set position for a valid index works correctly") { + constexpr position_t cSetPos2{12}; + constexpr position_t cSetPos3{15}; + constexpr position_t cSetPos4{20}; + PrefixTree tree; // Test that you can set the root node for sanity, although this value is not used tree.set(cRootId, cSetPos1); // Test updates to different nodes - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - auto const node_id_2{tree.insert(node_id_1, cInsertPos1)}; + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos1)}; tree.set(node_id_1, cSetPos1); tree.set(node_id_2, cSetPos2); REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); @@ -105,13 +106,15 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Set position for an invalid index throws correctly") { + constexpr id_t cInvalidNodeId{100}; + PrefixTree tree; // Test setting position before any insertions - REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos4), std::out_of_range); + REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos1), std::out_of_range); // Test setting position just beyond valid range - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos4), std::out_of_range); + auto const 
node_id_1{tree.insert(cRootId, cInitialPos1)}; + REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos1), std::out_of_range); } } From 748dfc5566f6ab72df78bccbee99b1c9b503e2db Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 18:21:05 -0500 Subject: [PATCH 322/323] Rename to handler_init and return handler. --- tests/test-register-handler.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 2371fc9e..e8102e22 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -12,17 +12,19 @@ using position_t = log_surgeon::finite_automata::PrefixTree::position_t; namespace { /** - * @param handler The register handler that will contain the new registers. - * @param num_registers The number of registers to initialize. + * @param num_registers The number of registers managed by the handler. + * @return The newly initialized register handler. */ -auto registers_init(RegisterHandler& handler, size_t num_registers) -> void; +[[nodiscard]] auto handler_init(size_t num_registers) -> RegisterHandler; -auto registers_init(RegisterHandler& handler, size_t const num_registers) -> void { +auto handler_init(size_t const num_registers) -> RegisterHandler { constexpr position_t cDefaultPos{0}; + RegisterHandler handler; for (size_t i{0}; i < num_registers; ++i) { handler.add_register(i, cDefaultPos); } + return handler; } } // namespace @@ -32,13 +34,12 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { constexpr size_t cRegId1{0}; constexpr size_t cRegId2{1}; - RegisterHandler handler; - SECTION("Initial state is empty") { - REQUIRE_THROWS_AS(handler.get_reversed_positions(cRegId1), std::out_of_range); + RegisterHandler empty_handler{handler_init(0)}; + REQUIRE_THROWS_AS(empty_handler.get_reversed_positions(cRegId1), std::out_of_range); } - registers_init(handler, cNumRegisters); + RegisterHandler 
handler{handler_init(cNumRegisters)}; SECTION("Set register position correctly") { handler.set_register(cRegId1, cInitialPos1); From 8abf35a15dfef1d26adfdfbc4895a20cb535a323 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 18:22:39 -0500 Subject: [PATCH 323/323] Add docstring for get_parent_id_unsafe(). --- src/log_surgeon/finite_automata/PrefixTree.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index ab88d805..e2de78aa 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -65,6 +65,11 @@ class PrefixTree { [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); } + /** + * Gets the parent ID without checking if it's `std::nullopt`. + * NOTE: This method should only be used if the caller has checked the node is not the root. + * @return The ID of the parent node in the prefix tree. + */ [[nodiscard]] auto get_parent_id_unsafe() const -> id_t { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) return m_parent_id.value();