Skip to content

Commit

Permalink
refactor: Extract RegexDFAState class, RegexDFAStatePair class, a…
Browse files Browse the repository at this point in the history
…nd `RegexDFAStateType` enum into their own files. (#57)

Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
  • Loading branch information
SharafMohamed and LinZhihao-723 authored Dec 11, 2024
1 parent 99b5b08 commit 081b20f
Show file tree
Hide file tree
Showing 8 changed files with 228 additions and 198 deletions.
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ set(SOURCE_FILES
src/log_surgeon/finite_automata/PrefixTree.hpp
src/log_surgeon/finite_automata/RegexAST.hpp
src/log_surgeon/finite_automata/RegexDFA.hpp
src/log_surgeon/finite_automata/RegexDFA.tpp
src/log_surgeon/finite_automata/RegexDFAState.hpp
src/log_surgeon/finite_automata/RegexDFAStatePair.hpp
src/log_surgeon/finite_automata/RegexDFAStateType.hpp
src/log_surgeon/finite_automata/RegexNFA.hpp
src/log_surgeon/finite_automata/RegexNFAState.hpp
src/log_surgeon/finite_automata/RegexNFAStateType.hpp
Expand Down
2 changes: 1 addition & 1 deletion examples/intersect-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ auto get_intersect_for_query(
}
RegexNFA<RegexNFAByteState> nfa(std::move(rules));
auto dfa2 = ByteLexer::nfa_to_dfa(nfa);
auto schema_types = dfa1->get_intersect(dfa2);
auto schema_types = dfa1->get_intersect(dfa2.get());
std::cout << search_string << ":";
for (auto const& schema_type : schema_types) {
std::cout << m_id_symbol[schema_type] << ",";
Expand Down
1 change: 1 addition & 0 deletions src/log_surgeon/Lexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <log_surgeon/Constants.hpp>
#include <log_surgeon/finite_automata/RegexAST.hpp>
#include <log_surgeon/finite_automata/RegexDFA.hpp>
#include <log_surgeon/finite_automata/RegexDFAState.hpp>
#include <log_surgeon/finite_automata/RegexNFA.hpp>
#include <log_surgeon/LexicalRule.hpp>
#include <log_surgeon/ParserInputBuffer.hpp>
Expand Down
164 changes: 45 additions & 119 deletions src/log_surgeon/finite_automata/RegexDFA.hpp
Original file line number Diff line number Diff line change
@@ -1,149 +1,75 @@
#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP
#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP

#include <algorithm>
#include <cstdint>
#include <functional>
#include <memory>
#include <set>
#include <utility>
#include <vector>

#include <log_surgeon/Constants.hpp>
#include <log_surgeon/finite_automata/RegexDFAStatePair.hpp>
#include <log_surgeon/finite_automata/RegexNFA.hpp>
#include <log_surgeon/finite_automata/UnicodeIntervalTree.hpp>

namespace log_surgeon::finite_automata {
// Selects which transition storage a `RegexDFAState` carries.
enum class RegexDFAStateType {
// Transitions are looked up in the per-byte table only.
Byte,
// Characters below `cSizeOfByte` use the per-byte table; larger ones use the interval tree.
UTF8
};

/**
 * A single DFA state. It records the IDs of the variables matched when the automaton stops in
 * this state and the outgoing transitions to other states.
 * @tparam stateType Whether the state only handles bytes or also handles UTF-8 code points.
 */
template <RegexDFAStateType stateType>
class RegexDFAState {
public:
    using Tree = UnicodeIntervalTree<RegexDFAState<stateType>*>;

    /**
     * Records that reaching this state matches the variable with the given ID.
     * @param variable_id
     */
    auto add_matching_variable_id(uint32_t const variable_id) -> void {
        m_matching_variable_ids.push_back(variable_id);
    }

    /**
     * @return The IDs of all variables matched by this state, in insertion order.
     */
    [[nodiscard]] auto get_matching_variable_ids() const -> std::vector<uint32_t> const& {
        return m_matching_variable_ids;
    }

    /**
     * @return Whether at least one variable ID has been added to this state.
     */
    [[nodiscard]] auto is_accepting() const -> bool { return !m_matching_variable_ids.empty(); }

    /**
     * Sets (or overwrites) the destination state for a single byte.
     * @param byte
     * @param dest_state
     */
    auto add_byte_transition(uint8_t const& byte, RegexDFAState<stateType>* dest_state) -> void {
        m_bytes_transition[byte] = dest_state;
    }

    /**
     * Returns the next state the DFA transitions to on input character (byte or
     * utf8)
     * @param character
     * @return RegexDFAState<stateType>*
     */
    [[nodiscard]] auto next(uint32_t character) const -> RegexDFAState<stateType>*;

private:
    std::vector<uint32_t> m_matching_variable_ids;
    // Value-initialized so that every byte without an explicit transition maps to `nullptr`
    // instead of an indeterminate pointer.
    RegexDFAState<stateType>* m_bytes_transition[cSizeOfByte]{};
    // NOTE: We don't need m_tree_transitions for the `stateType ==
    // RegexDFAStateType::Byte` case, so we use an empty class (`std::tuple<>`)
    // in that case.
    std::conditional_t<stateType == RegexDFAStateType::UTF8, Tree, std::tuple<>> m_tree_transitions;
};

/**
 * Class for a pair of DFA states, where each state in the pair belongs to a different DFA.
 * This class is used to facilitate the construction of an intersection DFA from two separate DFAs.
 * Each instance represents a state in the intersection DFA and follows these rules:
 *
 * - A pair is considered accepting if both states are accepting in their respective DFAs.
 * - A pair is considered reachable if both its states are reachable in their respective DFAs
 *   from this pair's states.
 *
 * NOTE: Only the first state in the pair contains the variable types matched by the pair.
 *
 * The pair does not own either state; it only stores the two pointers it is given.
 */
template <typename DFAState>
class RegexDFAStatePair {
public:
    RegexDFAStatePair(DFAState const* state1, DFAState const* state2)
            : m_state1(state1),
              m_state2(state2) {}

    /**
     * Used for ordering in a set by considering the states' addresses.
     *
     * The addresses are compared through `std::less` rather than the built-in `<`, because the
     * built-in relational operators give unspecified results for pointers to unrelated objects
     * whereas `std::less` guarantees a strict total order.
     * @param rhs
     * @return Whether m_state1 in lhs has a lower address than in rhs, or if they're equal,
     * whether m_state2 in lhs has a lower address than in rhs
     */
    auto operator<(RegexDFAStatePair const& rhs) const -> bool {
        std::less<DFAState const*> const address_less{};
        if (m_state1 == rhs.m_state1) {
            return address_less(m_state2, rhs.m_state2);
        }
        return address_less(m_state1, rhs.m_state1);
    }

    /**
     * Generates all pairs reachable from the current pair via any string and store any reachable
     * pair not previously visited in unvisited_pairs
     * @param visited_pairs Previously visited pairs
     * @param unvisited_pairs Set to add unvisited reachable pairs
     */
    auto get_reachable_pairs(
            std::set<RegexDFAStatePair<DFAState>>& visited_pairs,
            std::set<RegexDFAStatePair<DFAState>>& unvisited_pairs
    ) const -> void;

    /**
     * @return Whether both states of the pair are accepting.
     */
    [[nodiscard]] auto is_accepting() const -> bool {
        return m_state1->is_accepting() && m_state2->is_accepting();
    }

    /**
     * @return The variable IDs matched by the first state of the pair.
     */
    [[nodiscard]] auto get_matching_variable_ids() const -> std::vector<uint32_t> const& {
        return m_state1->get_matching_variable_ids();
    }

private:
    DFAState const* m_state1;
    DFAState const* m_state2;
};

// Convenience aliases for the two `RegexDFAState` specializations.
using RegexDFAByteState = RegexDFAState<RegexDFAStateType::Byte>;
using RegexDFAUTF8State = RegexDFAState<RegexDFAStateType::UTF8>;

// TODO: rename `RegexDFA` to `DFA`
/**
 * A DFA that owns its states through `m_states`.
 * @tparam DFAStateType The type of state stored in the DFA.
 */
template <typename DFAStateType>
class RegexDFA {
public:
/**
 * Creates a new DFA state based on a set of NFA states and adds it to `m_states`.
 * @param nfa_state_set The set of NFA states represented by this DFA state.
 * @return A pointer to the new DFA state.
 */
template <typename NFAStateType>
auto new_state(std::set<NFAStateType*> const& nfa_state_set) -> DFAStateType*;

/**
 * @return A pointer to the first state in `m_states`.
 * @throw std::out_of_range (from `std::vector::at`) if no state has been added yet.
 */
auto get_root() const -> DFAStateType const* { return m_states.at(0).get(); }

/**
 * Compares this dfa with `dfa_in` to determine the set of schema types in this dfa that are
 * reachable by any type in `dfa_in`. A type is considered reachable if there is at least one
 * string for which: (1) this dfa returns a set of types containing the type, and (2) `dfa_in`
 * returns any non-empty set of types.
 *
 * NOTE(review): no definition of this `std::unique_ptr` overload is visible alongside the
 * raw-pointer overload below -- confirm whether it is still meant to exist.
 * @param dfa_in The dfa with which to take the intersect.
 * @return The set of schema types reachable by `dfa_in`.
 */
[[nodiscard]] auto get_intersect(std::unique_ptr<RegexDFA> const& dfa_in
) const -> std::set<uint32_t>;
/**
 * Same contract as the overload above, but takes the other dfa as a raw pointer.
 * @param dfa_in The dfa with which to take the intersect.
 * @return The set of schema types reachable by `dfa_in`.
 */
[[nodiscard]] auto get_intersect(RegexDFA const* dfa_in) const -> std::set<uint32_t>;

private:
// All states of the DFA; `get_root()` returns the element at index 0.
std::vector<std::unique_ptr<DFAStateType>> m_states;
};
} // namespace log_surgeon::finite_automata

#include "RegexDFA.tpp"
/**
 * Appends a freshly constructed DFA state to `m_states` and copies onto it the matching variable
 * ID of every accepting NFA state in `nfa_state_set`.
 * @param nfa_state_set The NFA states that the new DFA state stands for.
 * @return A pointer to the new state, which remains owned by `m_states`.
 */
template <typename DFAStateType>
template <typename NFAStateType>
auto RegexDFA<DFAStateType>::new_state(std::set<NFAStateType*> const& nfa_state_set
) -> DFAStateType* {
    // `emplace_back` returns a reference to the stored `unique_ptr` (C++17).
    auto* const created_state = m_states.emplace_back(std::make_unique<DFAStateType>()).get();
    for (auto const* source_state : nfa_state_set) {
        if (false == source_state->is_accepting()) {
            continue;
        }
        created_state->add_matching_variable_id(source_state->get_matching_variable_id());
    }
    return created_state;
}

/**
 * Walks the pairs of states reachable from the two root states and collects the variable IDs of
 * every accepting pair.
 * @param dfa_in The dfa with which to take the intersect. A null pointer is treated as a dfa
 * that matches nothing.
 * @return The set of schema types of this dfa that are reachable by `dfa_in`; empty if `dfa_in`
 * is null.
 */
template <typename DFAStateType>
auto RegexDFA<DFAStateType>::get_intersect(RegexDFA const* dfa_in) const -> std::set<uint32_t> {
    std::set<uint32_t> schema_types;
    if (nullptr == dfa_in) {
        // Nothing to intersect with; avoid dereferencing a null pointer below.
        return schema_types;
    }
    std::set<RegexDFAStatePair<DFAStateType>> unvisited_pairs;
    std::set<RegexDFAStatePair<DFAStateType>> visited_pairs;
    unvisited_pairs.emplace(this->get_root(), dfa_in->get_root());
    // TODO: Handle UTF-8 (multi-byte transitions) as well
    while (false == unvisited_pairs.empty()) {
        auto current_pair_it = unvisited_pairs.begin();
        if (current_pair_it->is_accepting()) {
            auto const& matching_variable_ids = current_pair_it->get_matching_variable_ids();
            schema_types.insert(matching_variable_ids.cbegin(), matching_variable_ids.cend());
        }
        // Mark the pair as visited before expanding it, and erase it only afterwards so the
        // iterator stays valid during the call.
        visited_pairs.insert(*current_pair_it);
        current_pair_it->get_reachable_pairs(visited_pairs, unvisited_pairs);
        unvisited_pairs.erase(current_pair_it);
    }
    return schema_types;
}
} // namespace log_surgeon::finite_automata

#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP
77 changes: 0 additions & 77 deletions src/log_surgeon/finite_automata/RegexDFA.tpp

This file was deleted.

80 changes: 80 additions & 0 deletions src/log_surgeon/finite_automata/RegexDFAState.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE
#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE

#include <cassert>
#include <cstdint>
#include <memory>
#include <tuple>
#include <type_traits>
#include <vector>

#include <log_surgeon/Constants.hpp>
#include <log_surgeon/finite_automata/RegexDFAStateType.hpp>
#include <log_surgeon/finite_automata/UnicodeIntervalTree.hpp>

namespace log_surgeon::finite_automata {
// Forward declaration so the aliases below can name the specializations before the class
// template itself is defined.
template <RegexDFAStateType state_type>
class RegexDFAState;

// Convenience aliases for the two `RegexDFAState` specializations.
using RegexDFAByteState = RegexDFAState<RegexDFAStateType::Byte>;
using RegexDFAUTF8State = RegexDFAState<RegexDFAStateType::UTF8>;

/**
 * A single DFA state. It records the IDs of the variables matched when the automaton stops in
 * this state and the outgoing transitions to other states.
 * @tparam stateType Whether the state only handles bytes or also handles UTF-8 code points.
 */
template <RegexDFAStateType stateType>
class RegexDFAState {
public:
    using Tree = UnicodeIntervalTree<RegexDFAState<stateType>*>;

    /**
     * Records that reaching this state matches the variable with the given ID.
     * @param variable_id The ID of the matched variable.
     */
    auto add_matching_variable_id(uint32_t const variable_id) -> void {
        m_matching_variable_ids.push_back(variable_id);
    }

    /**
     * @return The IDs of all variables matched by this state, in insertion order.
     */
    [[nodiscard]] auto get_matching_variable_ids() const -> std::vector<uint32_t> const& {
        return m_matching_variable_ids;
    }

    /**
     * @return Whether at least one variable ID has been added to this state.
     */
    [[nodiscard]] auto is_accepting() const -> bool {
        return false == m_matching_variable_ids.empty();
    }

    /**
     * Sets (or overwrites) the destination state for a single byte.
     * @param byte The byte to transition on.
     * @param dest_state The state reached after transitioning on `byte`.
     */
    auto add_byte_transition(uint8_t const& byte, RegexDFAState<stateType>* dest_state) -> void {
        m_bytes_transition[byte] = dest_state;
    }

    /**
     * @param character The character (byte or utf8) to transition on.
     * @return A pointer to the DFA state reached after transitioning on `character`.
     */
    [[nodiscard]] auto next(uint32_t character) const -> RegexDFAState<stateType>*;

private:
    std::vector<uint32_t> m_matching_variable_ids;
    // Value-initialized so that every byte without an explicit transition maps to `nullptr`.
    // A default member initializer keeps the class free of a hand-written constructor.
    RegexDFAState<stateType>* m_bytes_transition[cSizeOfByte]{};
    // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case,
    // so we use an empty class (`std::tuple<>`) in that case.
    std::conditional_t<stateType == RegexDFAStateType::UTF8, Tree, std::tuple<>> m_tree_transitions;
};

/**
 * Looks up the state reached from this state on `character`.
 *
 * Characters below `cSizeOfByte` are resolved through the per-byte table for every state type.
 * Larger characters are resolved through the interval tree for UTF-8 states; byte states have no
 * such transitions and yield `nullptr` rather than indexing past the end of the table.
 * @param character The character (byte or utf8) to transition on.
 * @return A pointer to the DFA state reached after transitioning on `character`, or `nullptr` if
 * there is no such transition.
 */
template <RegexDFAStateType stateType>
auto RegexDFAState<stateType>::next(uint32_t character) const -> RegexDFAState<stateType>* {
    if (character < cSizeOfByte) {
        return m_bytes_transition[character];
    }
    if constexpr (RegexDFAStateType::Byte == stateType) {
        // Out of range for the byte table: report "no transition" instead of reading out of
        // bounds.
        return nullptr;
    } else {
        std::unique_ptr<std::vector<typename Tree::Data>> result
                = m_tree_transitions.find(Interval(character, character));
        // A DFA has at most one destination per character.
        assert(result->size() <= 1);
        if (false == result->empty()) {
            return result->front().m_value;
        }
        return nullptr;
    }
}
} // namespace log_surgeon::finite_automata

#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE
Loading

0 comments on commit 081b20f

Please sign in to comment.