refactor: Extract RegexDFAState class, RegexDFAStatePair class, and `RegexDFAStateType` enum into their own files. (#57)

Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
y-scope · Dec 11, 2024 · 081b20f · 081b20f
1 parent 99b5b08
commit 081b20f
Show file tree

Hide file tree

Showing 8 changed files with 228 additions and 198 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -97,7 +97,9 @@ set(SOURCE_FILES
     src/log_surgeon/finite_automata/PrefixTree.hpp
     src/log_surgeon/finite_automata/RegexAST.hpp
     src/log_surgeon/finite_automata/RegexDFA.hpp
-    src/log_surgeon/finite_automata/RegexDFA.tpp
+    src/log_surgeon/finite_automata/RegexDFAState.hpp
+    src/log_surgeon/finite_automata/RegexDFAStatePair.hpp
+    src/log_surgeon/finite_automata/RegexDFAStateType.hpp
     src/log_surgeon/finite_automata/RegexNFA.hpp
     src/log_surgeon/finite_automata/RegexNFAState.hpp
     src/log_surgeon/finite_automata/RegexNFAStateType.hpp

diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp
@@ -42,7 +42,7 @@ auto get_intersect_for_query(
     }
     RegexNFA<RegexNFAByteState> nfa(std::move(rules));
     auto dfa2 = ByteLexer::nfa_to_dfa(nfa);
-    auto schema_types = dfa1->get_intersect(dfa2);
+    auto schema_types = dfa1->get_intersect(dfa2.get());
     std::cout << search_string << ":";
     for (auto const& schema_type : schema_types) {
         std::cout << m_id_symbol[schema_type] << ",";

diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp
@@ -13,6 +13,7 @@
 #include <log_surgeon/Constants.hpp>
 #include <log_surgeon/finite_automata/RegexAST.hpp>
 #include <log_surgeon/finite_automata/RegexDFA.hpp>
+#include <log_surgeon/finite_automata/RegexDFAState.hpp>
 #include <log_surgeon/finite_automata/RegexNFA.hpp>
 #include <log_surgeon/LexicalRule.hpp>
 #include <log_surgeon/ParserInputBuffer.hpp>

diff --git a/src/log_surgeon/finite_automata/RegexDFA.hpp b/src/log_surgeon/finite_automata/RegexDFA.hpp
@@ -1,149 +1,75 @@
 #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP
 #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP
 
-#include <algorithm>
 #include <cstdint>
 #include <memory>
 #include <set>
-#include <utility>
 #include <vector>
 
-#include <log_surgeon/Constants.hpp>
-#include <log_surgeon/finite_automata/RegexNFA.hpp>
-#include <log_surgeon/finite_automata/UnicodeIntervalTree.hpp>
+#include <log_surgeon/finite_automata/RegexDFAStatePair.hpp>
 
 namespace log_surgeon::finite_automata {
-enum class RegexDFAStateType {
-    Byte,
-    UTF8
-};
-
-template <RegexDFAStateType stateType>
-class RegexDFAState {
-public:
-    using Tree = UnicodeIntervalTree<RegexDFAState<stateType>*>;
-
-    auto add_matching_variable_id(uint32_t const variable_id) -> void {
-        m_matching_variable_ids.push_back(variable_id);
-    }
-
-    [[nodiscard]] auto get_matching_variable_ids() const -> std::vector<uint32_t> const& {
-        return m_matching_variable_ids;
-    }
-
-    [[nodiscard]] auto is_accepting() const -> bool { return !m_matching_variable_ids.empty(); }
-
-    auto add_byte_transition(uint8_t const& byte, RegexDFAState<stateType>* dest_state) -> void {
-        m_bytes_transition[byte] = dest_state;
-    }
-
-    /**
-     * Returns the next state the DFA transitions to on input character (byte or
-     * utf8)
-     * @param character
-     * @return RegexDFAState<stateType>*
-     */
-    [[nodiscard]] auto next(uint32_t character) const -> RegexDFAState<stateType>*;
-
-private:
-    std::vector<uint32_t> m_matching_variable_ids;
-    RegexDFAState<stateType>* m_bytes_transition[cSizeOfByte];
-    // NOTE: We don't need m_tree_transitions for the `stateType ==
-    // RegexDFAStateType::Byte` case, so we use an empty class (`std::tuple<>`)
-    // in that case.
-    std::conditional_t<stateType == RegexDFAStateType::UTF8, Tree, std::tuple<>> m_tree_transitions;
-};
-
-/**
- * Class for a pair of DFA states, where each state in the pair belongs to a different DFA.
- * This class is used to facilitate the construction of an intersection DFA from two separate DFAs.
- * Each instance represents a state in the intersection DFA and follows these rules:
- *
- * - A pair is considered accepting if both states are accepting in their respective DFAs.
- * - A pair is considered reachable if both its states are reachable in their respective DFAs
- *   from this pair's states.
- *
- * NOTE: Only the first state in the pair contains the variable types matched by the pair.
- */
-template <typename DFAState>
-class RegexDFAStatePair {
-public:
-    RegexDFAStatePair(DFAState const* state1, DFAState const* state2)
-            : m_state1(state1),
-              m_state2(state2) {};
-
-    /**
-     * Used for ordering in a set by considering the states' addresses
-     * @param rhs
-     * @return Whether m_state1 in lhs has a lower address than in rhs, or if they're equal,
-     * whether m_state2 in lhs has a lower address than in rhs
-     */
-    auto operator<(RegexDFAStatePair const& rhs) const -> bool {
-        if (m_state1 == rhs.m_state1) {
-            return m_state2 < rhs.m_state2;
-        }
-        return m_state1 < rhs.m_state1;
-    }
-
-    /**
-     * Generates all pairs reachable from the current pair via any string and store any reachable
-     * pair not previously visited in unvisited_pairs
-     * @param visited_pairs Previously visited pairs
-     * @param unvisited_pairs Set to add unvisited reachable pairs
-     */
-    auto get_reachable_pairs(
-            std::set<RegexDFAStatePair<DFAState>>& visited_pairs,
-            std::set<RegexDFAStatePair<DFAState>>& unvisited_pairs
-    ) const -> void;
-
-    [[nodiscard]] auto is_accepting() const -> bool {
-        return m_state1->is_accepting() && m_state2->is_accepting();
-    }
-
-    [[nodiscard]] auto get_matching_variable_ids() const -> std::vector<uint32_t> const& {
-        return m_state1->get_matching_variable_ids();
-    }
-
-private:
-    DFAState const* m_state1;
-    DFAState const* m_state2;
-};
-
-using RegexDFAByteState = RegexDFAState<RegexDFAStateType::Byte>;
-using RegexDFAUTF8State = RegexDFAState<RegexDFAStateType::UTF8>;
-
 // TODO: rename `RegexDFA` to `DFA`
 template <typename DFAStateType>
 class RegexDFA {
 public:
     /**
-     * Creates a new DFA state based on a set of NFA states and adds it to
-     * m_states
-     * @param nfa_state_set
-     * @return DFAStateType*
+     * Creates a new DFA state based on a set of NFA states and adds it to `m_states`. The matching
+     * variable id of every accepting NFA state in the set is recorded on the new DFA state.
+     * @param nfa_state_set The set of NFA states represented by this DFA state.
+     * @return A pointer to the new DFA state, which remains owned by `m_states`.
      */
     template <typename NFAStateType>
     auto new_state(std::set<NFAStateType*> const& nfa_state_set) -> DFAStateType*;
 
+    /**
+     * @return A pointer to the state stored at index 0 of `m_states`.
+     * @throw std::out_of_range if `m_states` is empty.
+     */
     auto get_root() const -> DFAStateType const* { return m_states.at(0).get(); }
 
     /**
-     * Compares this dfa with dfa_in to determine the set of schema types in
-     * this dfa that are reachable by any type in dfa_in. A type is considered
-     * reachable if there is at least one string for which: (1) this dfa returns
-     * a set of types containing the type, and (2) dfa_in returns any non-empty
-     * set of types.
-     * @param dfa_in
-     * @return The set of schema types reachable by dfa_in
+     * Compares this DFA with `dfa_in` to determine the set of schema types in this DFA that are
+     * reachable by any type in `dfa_in`. A type is considered reachable if there is at least one
+     * string for which: (1) this DFA returns a set of types containing the type, and (2) `dfa_in`
+     * returns any non-empty set of types.
+     * NOTE: The definition carries a TODO stating that UTF-8 (multi-byte) transitions are not
+     * handled yet.
+     * @param dfa_in The DFA with which to take the intersection.
+     * @return The set of schema types reachable by `dfa_in`.
      */
-    [[nodiscard]] auto get_intersect(std::unique_ptr<RegexDFA> const& dfa_in
-    ) const -> std::set<uint32_t>;
+    [[nodiscard]] auto get_intersect(RegexDFA const* dfa_in) const -> std::set<uint32_t>;
 
 private:
+    // Owns every state created by `new_state`; index 0 is the state returned by `get_root`.
     std::vector<std::unique_ptr<DFAStateType>> m_states;
 };
-}  // namespace log_surgeon::finite_automata
 
-#include "RegexDFA.tpp"
+/**
+ * Appends a freshly constructed DFA state to `m_states` and tags it with the matching variable id
+ * of each accepting NFA state found in `nfa_state_set`.
+ * @param nfa_state_set The NFA states that the new DFA state stands for.
+ * @return A non-owning pointer to the state that was just appended.
+ */
+template <typename DFAStateType>
+template <typename NFAStateType>
+auto RegexDFA<DFAStateType>::new_state(std::set<NFAStateType*> const& nfa_state_set
+) -> DFAStateType* {
+    auto* const created_state = m_states.emplace_back(std::make_unique<DFAStateType>()).get();
+    for (auto const* member_state : nfa_state_set) {
+        if (false == member_state->is_accepting()) {
+            // Non-accepting NFA states contribute no variable id.
+            continue;
+        }
+        created_state->add_matching_variable_id(member_state->get_matching_variable_id());
+    }
+    return created_state;
+}
+
+/**
+ * Explores pairs of states (one from this DFA, one from `dfa_in`) starting from the pair of root
+ * states, and collects the matching variable ids of every accepting pair encountered.
+ * @param dfa_in The DFA to intersect with. A null pointer is treated as a DFA that matches
+ * nothing.
+ * @return The variable ids collected from the accepting pairs; empty if `dfa_in` is null.
+ */
+template <typename DFAStateType>
+auto RegexDFA<DFAStateType>::get_intersect(RegexDFA const* dfa_in) const -> std::set<uint32_t> {
+    std::set<uint32_t> schema_types;
+    if (nullptr == dfa_in) {
+        // Nothing to intersect with, so no schema type can be reachable.
+        return schema_types;
+    }
+    std::set<RegexDFAStatePair<DFAStateType>> unvisited_pairs;
+    std::set<RegexDFAStatePair<DFAStateType>> visited_pairs;
+    unvisited_pairs.emplace(this->get_root(), dfa_in->get_root());
+    // TODO: Handle UTF-8 (multi-byte transitions) as well
+    while (false == unvisited_pairs.empty()) {
+        // Inserting into a `std::set` doesn't invalidate existing iterators, so this iterator
+        // stays usable even if `get_reachable_pairs` adds pairs to `unvisited_pairs`.
+        auto const current_pair_it = unvisited_pairs.begin();
+        if (current_pair_it->is_accepting()) {
+            auto const& matching_variable_ids = current_pair_it->get_matching_variable_ids();
+            schema_types.insert(matching_variable_ids.cbegin(), matching_variable_ids.cend());
+        }
+        // Mark the pair as visited before expanding it so it isn't queued again.
+        visited_pairs.insert(*current_pair_it);
+        current_pair_it->get_reachable_pairs(visited_pairs, unvisited_pairs);
+        unvisited_pairs.erase(current_pair_it);
+    }
+    return schema_types;
+}
+}  // namespace log_surgeon::finite_automata
 
 #endif  // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP
diff --git a/src/log_surgeon/finite_automata/RegexDFA.tpp b/src/log_surgeon/finite_automata/RegexDFA.tpp
diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp
@@ -0,0 +1,80 @@
+#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE
+#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include <log_surgeon/Constants.hpp>
+#include <log_surgeon/finite_automata/RegexDFAStateType.hpp>
+#include <log_surgeon/finite_automata/UnicodeIntervalTree.hpp>
+
+namespace log_surgeon::finite_automata {
+// Forward declaration so the aliases below can be introduced ahead of the class definition. The
+// template parameter is named `stateType` to match the definition.
+template <RegexDFAStateType stateType>
+class RegexDFAState;
+
+using RegexDFAByteState = RegexDFAState<RegexDFAStateType::Byte>;
+using RegexDFAUTF8State = RegexDFAState<RegexDFAStateType::UTF8>;
+
+/**
+ * A single DFA state. It stores the ids of the variables it matches and its outgoing transitions:
+ * a per-byte table for every state type and, for `RegexDFAStateType::UTF8` only, an interval tree.
+ * @tparam stateType Selects whether the state carries the interval tree in addition to the byte
+ * table.
+ */
+template <RegexDFAStateType stateType>
+class RegexDFAState {
+public:
+    using Tree = UnicodeIntervalTree<RegexDFAState<stateType>*>;
+
+    /**
+     * @param variable_id The id to append to this state's list of matching variable ids.
+     */
+    auto add_matching_variable_id(uint32_t const variable_id) -> void {
+        m_matching_variable_ids.push_back(variable_id);
+    }
+
+    /**
+     * @return The ids added through `add_matching_variable_id`, in insertion order.
+     */
+    [[nodiscard]] auto get_matching_variable_ids() const -> std::vector<uint32_t> const& {
+        return m_matching_variable_ids;
+    }
+
+    /**
+     * @return Whether this state has at least one matching variable id.
+     */
+    [[nodiscard]] auto is_accepting() const -> bool {
+        return false == m_matching_variable_ids.empty();
+    }
+
+    /**
+     * Sets (or overwrites) the transition taken on `byte`.
+     * @param byte The input byte.
+     * @param dest_state The state to transition to on `byte`.
+     */
+    auto add_byte_transition(uint8_t const& byte, RegexDFAState<stateType>* dest_state) -> void {
+        m_bytes_transition[byte] = dest_state;
+    }
+
+    /**
+     * @param character The character (byte or utf8) to transition on.
+     * @return A pointer to the DFA state reached after transitioning on `character`.
+     */
+    [[nodiscard]] auto next(uint32_t character) const -> RegexDFAState<stateType>*;
+
+private:
+    std::vector<uint32_t> m_matching_variable_ids;
+    // Value-initialized so that every byte starts without a transition (`nullptr`). This replaces
+    // a constructor that called `std::fill` without the header including `<algorithm>`.
+    RegexDFAState<stateType>* m_bytes_transition[cSizeOfByte]{};
+    // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case,
+    // so we use an empty class (`std::tuple<>`) in that case.
+    std::conditional_t<stateType == RegexDFAStateType::UTF8, Tree, std::tuple<>> m_tree_transitions;
+};
+
+/**
+ * Looks up the outgoing transition for `character`. Characters below `cSizeOfByte` are resolved
+ * through the byte table; for `RegexDFAStateType::UTF8` states, larger characters are resolved
+ * through the interval tree.
+ * @param character The character (byte or utf8) to transition on. For `RegexDFAStateType::Byte`
+ * states it must be less than `cSizeOfByte`.
+ * @return A pointer to the DFA state reached after transitioning on `character`, or `nullptr` if
+ * the lookup yields no destination state.
+ */
+template <RegexDFAStateType stateType>
+auto RegexDFAState<stateType>::next(uint32_t character) const -> RegexDFAState<stateType>* {
+    if constexpr (RegexDFAStateType::Byte == stateType) {
+        // The byte table is the only transition storage in this case, so a larger value would
+        // index past the end of `m_bytes_transition`.
+        assert(character < cSizeOfByte);
+        return m_bytes_transition[character];
+    } else {
+        if (character < cSizeOfByte) {
+            return m_bytes_transition[character];
+        }
+        std::unique_ptr<std::vector<typename Tree::Data>> result
+                = m_tree_transitions.find(Interval(character, character));
+        // A point lookup is expected to match at most one interval.
+        assert(result->size() <= 1);
+        if (result->empty()) {
+            return nullptr;
+        }
+        return result->front().m_value;
+    }
+}
+}  // namespace log_surgeon::finite_automata
+
+#endif  // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE