diff --git a/components/core/src/clp/regex_utils/CMakeLists.txt b/components/core/src/clp/regex_utils/CMakeLists.txt index fa611bd02..0c621def5 100644 --- a/components/core/src/clp/regex_utils/CMakeLists.txt +++ b/components/core/src/clp/regex_utils/CMakeLists.txt @@ -1,12 +1,22 @@ set( REGEX_UTILS_HEADER_LIST + "ErrorCode.hpp" + "RegexToWildcardTranslatorConfig.hpp" + "constants.hpp" "regex_utils.hpp" ) add_library( regex_utils regex_utils.cpp + regex_utils_anchors.cpp + ErrorCode.cpp ${REGEX_UTILS_HEADER_LIST} ) add_library(clp::regex_utils ALIAS regex_utils) -target_include_directories(regex_utils PUBLIC ../) +target_include_directories(regex_utils + PUBLIC + ../ + PRIVATE + "${PROJECT_SOURCE_DIR}/submodules" +) target_compile_features(regex_utils PRIVATE cxx_std_20) diff --git a/components/core/src/clp/regex_utils/ErrorCode.cpp b/components/core/src/clp/regex_utils/ErrorCode.cpp new file mode 100644 index 000000000..acc59abeb --- /dev/null +++ b/components/core/src/clp/regex_utils/ErrorCode.cpp @@ -0,0 +1,93 @@ +#include "regex_utils/ErrorCode.hpp" + +#include +#include +#include + +using std::error_category; +using std::error_code; +using std::string; +using std::string_view; + +namespace clp::regex_utils { + +/** + * Class for giving the error codes more detailed string descriptions. + * This class does not need to be seen outside the std error code wrapper implementation. + */ +class ErrorCodeCategory : public error_category { +public: + /** + * @return The class of errors. + */ + [[nodiscard]] char const* name() const noexcept override; + + /** + * @param The error code encoded in int. + * @return The descriptive message for the error. 
+ */ + [[nodiscard]] string message(int ev) const override; +}; + +auto ErrorCodeCategory::name() const noexcept -> char const* { + return "regex utility"; +} + +auto ErrorCodeCategory::message(int ev) const -> string { + switch (static_cast(ev)) { + case ErrorCode::Success: + return "Success."; + + case ErrorCode::IllegalState: + return "Unrecognized state."; + + case ErrorCode::Star: + return "Failed to translate due to metachar `*` (zero or more occurences)."; + + case ErrorCode::Plus: + return "Failed to translate due to metachar `+` (one or more occurences)."; + + case ErrorCode::Question: + return "Currently does not support returning a list of wildcard translations. The " + "metachar `?` (lazy match) may be supported in the future."; + + case ErrorCode::Pipe: + return "Currently does not support returning a list of wildcard translations. The " + "regex OR condition feature may be supported in the future."; + + case ErrorCode::Caret: + return "Failed to translate due to start anchor `^` in the middle of the string."; + + case ErrorCode::Dollar: + return "Failed to translate due to end anchor `$` in the middle of the string."; + + case ErrorCode::DisallowedEscapeSequence: + return "Disallowed escape sequence."; + + case ErrorCode::UnmatchedParenthesis: + return "Unmatched opening `(` or closing `)`."; + + case ErrorCode::UnsupportedCharsets: + return "Currently only supports case-insensitive single-char charset (i.e. 
[aA] [bB])."; + + case ErrorCode::IncompleteCharsetStructure: + return "Unmatched closing `]` at the end of the string."; + + case ErrorCode::UnsupportedQuantifier: + return "Currently only supports exact positive number of repetitions in regex syntax."; + + case ErrorCode::TokenUnquantifiable: + return "The preceding token is not quantifiable."; + + default: + return "(unrecognized error)"; + } +} + +ErrorCodeCategory const cTheErrorCodeCategory{}; + +auto make_error_code(ErrorCode e) -> error_code { + return {static_cast(e), cTheErrorCodeCategory}; +} + +} // namespace clp::regex_utils diff --git a/components/core/src/clp/regex_utils/ErrorCode.hpp b/components/core/src/clp/regex_utils/ErrorCode.hpp new file mode 100644 index 000000000..4fa9204fc --- /dev/null +++ b/components/core/src/clp/regex_utils/ErrorCode.hpp @@ -0,0 +1,46 @@ +#ifndef CLP_REGEX_UTILS_ERRORCODE_HPP +#define CLP_REGEX_UTILS_ERRORCODE_HPP + +#include +#include +#include + +namespace clp::regex_utils { + +/** + * Enum class for propagating and handling various regex utility errors. + * More detailed descriptions can be found in ErrorCode.cpp. + */ +enum class ErrorCode : uint8_t { + Success = 0, + IllegalState, + Star, + Plus, + Question, + Pipe, + Caret, + Dollar, + DisallowedEscapeSequence, + UnmatchedParenthesis, + UnsupportedCharsets, + IncompleteCharsetStructure, + UnsupportedQuantifier, + TokenUnquantifiable, +}; + +/** + * Wrapper function to turn a regular enum class into an std::error_code. + * + * @param An error code enum. + * @return The corresponding std::error_code type variable. 
+ */ +[[nodiscard]] auto make_error_code(ErrorCode ec) -> std::error_code; + +} // namespace clp::regex_utils + +namespace std { +template <> +struct is_error_code_enum : true_type {}; +} // namespace std + +#endif // CLP_REGEX_UTILS_ERRORCODE_HPP diff --git a/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp b/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp new file mode 100644 index 000000000..379b327e5 --- /dev/null +++ b/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp @@ -0,0 +1,42 @@ +#ifndef CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP +#define CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP + +namespace clp::regex_utils { + +class RegexToWildcardTranslatorConfig { +public: + // Constructors + RegexToWildcardTranslatorConfig() = default; + + // Getters + [[nodiscard]] auto case_insensitive_wildcard() const -> bool { + return m_case_insensitive_wildcard; + } + + [[nodiscard]] auto allow_anchors() const -> bool { return m_allow_anchors; } + + [[nodiscard]] auto add_prefix_suffix_wildcards() const -> bool { + return m_add_prefix_suffix_wildcards; + } + + // Setters + void set_case_insensitive_wildcard(bool case_insensitive_wildcard) { + m_case_insensitive_wildcard = case_insensitive_wildcard; + } + + void set_allow_anchors(bool allow_anchors) { m_allow_anchors = allow_anchors; } + + void set_add_prefix_suffix_wildcards(bool add_prefix_suffix_wildcards) { + m_add_prefix_suffix_wildcards = add_prefix_suffix_wildcards; + } + +private: + // Variables + bool m_case_insensitive_wildcard = false; + bool m_allow_anchors = true; + bool m_add_prefix_suffix_wildcards = false; +}; + +} // namespace clp::regex_utils + +#endif // CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP diff --git a/components/core/src/clp/regex_utils/constants.hpp b/components/core/src/clp/regex_utils/constants.hpp new file mode 100644 index 000000000..e05ccfe83 --- /dev/null +++ 
b/components/core/src/clp/regex_utils/constants.hpp @@ -0,0 +1,48 @@ +#ifndef CLP_REGEX_UTILS_CONSTANTS_HPP +#define CLP_REGEX_UTILS_CONSTANTS_HPP + +#include +#include +#include + +namespace clp::regex_utils { + +constexpr size_t cCharBitarraySize = 128; + +/** + * Create an ASCII character lookup table (bit array) at compile time. + * + * @param char_str A string that contains the characters to look up. + * @return The lookup table as bit array + */ +[[nodiscard]] constexpr auto create_char_bit_array(std::string_view char_str +) -> std::array { + std::array bit_array{}; + bit_array.fill(false); + for (char const ch : char_str) { + bit_array.at(ch) = true; + } + return bit_array; +} + +constexpr char cZeroOrMoreCharsWildcard{'*'}; +constexpr char cSingleCharWildcard{'?'}; +constexpr char cRegexZeroOrMore{'*'}; +constexpr char cRegexOneOrMore{'+'}; +constexpr char cRegexZeroOrOne{'?'}; +constexpr char cRegexStartAnchor{'^'}; +constexpr char cRegexEndAnchor{'$'}; +constexpr char cEscapeChar{'\\'}; +constexpr char cCharsetNegate{'^'}; + +// This is a more complete set of meta characters than necessary, as the user might not be fully +// knowledgeable on which meta characters to escape, and may introduce unnecessary escape sequences. +constexpr auto cRegexEscapeSeqAcceptedMetaChars = create_char_bit_array("^$.*{}[]()+|?<>-_/=!\\"); +// This is the set of meta characters that need escaping in the wildcard syntax. +constexpr auto cRegexEscapeSeqWildcardOnlyMetaChars = create_char_bit_array("?*\\"); +// This is the set of meta characters that need escaping in the character set. 
+constexpr auto cRegexCharsetEscapeSeqMetaChars = create_char_bit_array("^-]\\"); + +} // namespace clp::regex_utils + +#endif // CLP_REGEX_UTILS_CONSTANTS_HPP diff --git a/components/core/src/clp/regex_utils/regex_utils.cpp b/components/core/src/clp/regex_utils/regex_utils.cpp index 50ada53fb..b7a6838f4 100644 --- a/components/core/src/clp/regex_utils/regex_utils.cpp +++ b/components/core/src/clp/regex_utils/regex_utils.cpp @@ -1,241 +1,615 @@ #include "regex_utils/regex_utils.hpp" -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "regex_utils/constants.hpp" +#include "regex_utils/ErrorCode.hpp" +#include "regex_utils/RegexToWildcardTranslatorConfig.hpp" + +using clp::string_utils::is_alphabet; +using clp::string_utils::is_decimal_digit; +using std::error_code; +using std::get; +using std::make_pair; +using std::monostate; +using std::pair; +using std::string; +using std::string_view; +using std::variant; -#define CHAR_BITARRAY_SIZE 128 +namespace clp::regex_utils { -using std::array; -using std::invalid_argument; -using std::runtime_error; -using std::string; +/** + * Class for storing regex translation config, states, capture group and quantifier information. + */ +class TranslatorState { +public: + enum class RegexPatternState : uint8_t { + // The initial state, where characters have no special meanings and are treated literally. + NORMAL = 0, + // Encountered a period `.`. Expecting wildcard expression. + DOT, + // Encountered a backslash `\`, used to suppress special meanings of regex meta characters. + ESCAPED, + // Enclosed by parenthesis `()`, used to specify a capture group. + GROUP, + // Encountered a backslash `\` in the capture group. + GROUPESCAPED, + // Enclosed by square brackets `[]`, used to specify a character set. + CHARSET, + // Encountered a backslash `\` in the character set.. 
+ CHARSETESCAPED, + // Enclosed by curly brackets `{}`, used to specify a quantity to repeat. + QUANTIFIER, + // Encountered a dollar sign `$`, meaning the regex string has reached the end anchor. + END, + }; + + // Constructor + TranslatorState(RegexToWildcardTranslatorConfig const& config, string_view regex_str) + : m_config(config), + m_it(regex_str.begin()) {} + + // Getters + [[nodiscard]] auto get_config() const -> RegexToWildcardTranslatorConfig const& { + return m_config; + } -using std::cout; -using std::endl; - -namespace { - -inline constexpr array createCharBitArray( - char const* charStr -) { - array bitArray; - bitArray.fill(false); - int idx = 0; - for (int idx = 0; charStr[idx] != '\0'; ++idx) { - bitArray.at(charStr[idx]) = true; - } - return bitArray; -} - -constexpr char ZeroOrMoreCharsWildcard = '*'; -constexpr char SingleCharWildcard = '?'; -constexpr char RegexStartAnchor = '^'; -constexpr char RegexEndAnchor = '$'; -constexpr char EscapeChar = '\\'; - -enum RegexPatternState { - NORMAL, - DOT, // Preceded by a single period `.`, used to start the wildcard syntax - ESCAPED, // Preceded by an escape backslash `\\`, used to suppress special meanings of meta characters - GROUP, // Enclosed by parenthesis `()`, used to specify a capture group - CHARSET, // Enclosed by square brackets `[]`, used to specify a character set - QUANTIFIER, // Enclosed by curly brackets `{}`, used to specify a quantity to repeat - END, // Regex string has reached the end anchor `$` -}; + [[nodiscard]] auto get_state() const -> RegexPatternState const& { return m_state; } -constexpr auto RegexNormalStateNonTransitionalMetaChars = createCharBitArray("^|?*+"); + [[nodiscard]] auto get_marked_iterator() const -> string_view::const_iterator const& { + return m_it; + } -} // namespace + [[nodiscard]] auto get_preceding_token( + ) const -> BOOST_OUTCOME_V2_NAMESPACE::std_result; + [[nodiscard]] auto get_quantifier() const -> BOOST_OUTCOME_V2_NAMESPACE::std_result; -namespace 
clp::regex_utils { + [[nodiscard]] auto get_quantifier_as_str() const -> string { return m_quantifier_str; } -void regexPatternStateFinalCheck( - string& wildcardStr, - RegexPatternState& state, - string& currQuantifier -); + [[nodiscard]] auto quantifier_number_start() const -> bool { + return m_quantifier_str.empty() || ',' == m_quantifier_str.back(); + } -} // namespace clp::regex_utils + // Setters + void set_next_state(RegexPatternState const& state) { m_state = state; } -namespace clp::regex_utils { + void mark_iterator(string_view::const_iterator const& it) { m_it = it; } + + void invalidate_preceding_token() { m_preceding_token = monostate{}; } + + void set_preceding_token(char ch) { m_preceding_token = ch; } + + void set_preceding_token(string const& s) { m_preceding_token = s; } + + void reset_quantifiers() { + m_quantifier = size_t{0}; + m_quantifier_str.clear(); + } + + void add_to_quantifier(char ch); + + void switch_to_second_quantifier() { + m_quantifier = make_pair(get(m_quantifier), 0); + m_quantifier_str += ','; + } + + void inc_nested_group_count() { ++m_nested_group_count; } + + [[nodiscard]] auto dec_nested_group_count() -> BOOST_OUTCOME_V2_NAMESPACE::std_result; + +private: + // Variables + RegexToWildcardTranslatorConfig m_config; + RegexPatternState m_state = RegexPatternState::NORMAL; + string_view::const_iterator m_it; + variant m_preceding_token; + variant> m_quantifier; + string m_quantifier_str; + size_t m_nested_group_count = 0; +}; + +auto TranslatorState::get_preceding_token( +) const -> BOOST_OUTCOME_V2_NAMESPACE::std_result { + switch (m_preceding_token.index()) { + case 0: + return ErrorCode::TokenUnquantifiable; + case 1: + return string{get(m_preceding_token)}; + case 2: + return get(m_preceding_token); + default: + return ErrorCode::IllegalState; + } +} -string regexToWildcard(string const& regexStr) { - if (regexStr.empty()) { +auto TranslatorState::get_quantifier() const -> BOOST_OUTCOME_V2_NAMESPACE::std_result { + switch 
(m_quantifier.index()) { + case 0: + return get(m_quantifier); + case 1: + // Maybe we can support a ranged pair of quantifiers in the future + return ErrorCode::UnsupportedQuantifier; + default: + return ErrorCode::IllegalState; + } +} + +void TranslatorState::add_to_quantifier(char ch) { + int const num{ch - '0'}; + int const base = 10; + switch (m_quantifier.index()) { + case 0: + m_quantifier = get<0>(m_quantifier) * base + num; + break; + case 1: + get<1>(m_quantifier).second = get<1>(m_quantifier).second * base + num; + break; + default: + break; + } + m_quantifier_str += ch; +} + +auto TranslatorState::dec_nested_group_count() -> BOOST_OUTCOME_V2_NAMESPACE::std_result { + if (0 == m_nested_group_count) { + return ErrorCode::UnmatchedParenthesis; + } + --m_nested_group_count; + return m_nested_group_count; +} + +// State transition functions common signature +// typedef [[nodiscard]] auto +// StateTransitionFunc(TranslatorState&, string_view::const_iterator&, string&) -> error_code; + +using StateTransitionFunc + = auto(TranslatorState&, string_view::const_iterator&, string&) -> error_code; + +// State transition functions +[[nodiscard]] StateTransitionFunc normal_state_transition; +[[nodiscard]] StateTransitionFunc dot_state_transition; +[[nodiscard]] StateTransitionFunc escaped_state_transition; +[[nodiscard]] StateTransitionFunc group_state_transition; +[[nodiscard]] StateTransitionFunc group_escaped_state_transition; +[[nodiscard]] StateTransitionFunc charset_state_transition; +[[nodiscard]] StateTransitionFunc charset_escaped_state_transition; +[[nodiscard]] StateTransitionFunc quantifier_state_transition; +[[nodiscard]] StateTransitionFunc end_state_transition; +[[nodiscard]] StateTransitionFunc final_state_cleanup; + +// Helper function +void append_incomplete_quantifier_structure(TranslatorState& state, string& wildcard_str); +[[nodiscard]] auto matching_upper_lower_case_char_pair(char ch0, char ch1) -> bool; + +// Main API +auto 
regex_to_wildcard(string_view regex_str) -> BOOST_OUTCOME_V2_NAMESPACE::std_result { + RegexToWildcardTranslatorConfig const default_config{}; + return regex_to_wildcard(regex_str, default_config); +} + +auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig const& config) + -> BOOST_OUTCOME_V2_NAMESPACE::std_result { + if (regex_str.empty()) { return string(); } - // Initialize scan position, scan state, and return string - int idx = 0; - RegexPatternState state = NORMAL; - string wildcardStr; + // Initialize translation state, scan position, and return string + TranslatorState state{config, regex_str}; + string_view::const_iterator it = regex_str.cbegin(); + string wildcard_str; + + // If there is no starting anchor character, append multichar wildcard prefix + if (cRegexStartAnchor == *it) { + if (config.allow_anchors()) { + ++it; + } else { + return ErrorCode::Caret; + } + } else if (config.add_prefix_suffix_wildcards()) { + wildcard_str += cZeroOrMoreCharsWildcard; + } - // If there is no starting anchor character, append multichar wildcard prefix - if (RegexStartAnchor == regexStr.at(0)) { - idx++; - } else { - wildcardStr += ZeroOrMoreCharsWildcard; - } - - // Initialize various string buffers - string currGroup; - string currQuantifier; - for (; idx < regexStr.length(); ++idx) { - // Main state transition table - const char ch = regexStr.at(idx); - switch (state) { - case NORMAL: - switch (ch) { - case '.': - state = DOT; - break; - case EscapeChar: - state = ESCAPED; - break; - case '(': - currGroup.clear(); - state = GROUP; - break; - case '[': - state = CHARSET; - break; - case '{': - state = QUANTIFIER; - break; - case RegexEndAnchor: - state = END; - break; - case '|': - throw runtime_error( - "Currently does not support returning a list of wildcard options." 
- ); - default: - if (RegexNormalStateNonTransitionalMetaChars.at(ch)) { - throw invalid_argument( - "Cannot translate due to an unescaped meta character " + ch - ); - } - wildcardStr += ch; - break; - } + error_code ec{}; + while (it != regex_str.end()) { + switch (state.get_state()) { + case TranslatorState::RegexPatternState::NORMAL: + ec = normal_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::DOT: + ec = dot_state_transition(state, it, wildcard_str); break; - case DOT: - if ('*' == ch) { - wildcardStr += ZeroOrMoreCharsWildcard; - } else { - wildcardStr += SingleCharWildcard; - // Backtrack one position and handle the current char in the next iteration - --idx; - } - state = NORMAL; + case TranslatorState::RegexPatternState::ESCAPED: + ec = escaped_state_transition(state, it, wildcard_str); break; - case END: - if (RegexEndAnchor != ch) { - throw invalid_argument("Encountered non-anchor characters past the end of $."); - } + case TranslatorState::RegexPatternState::GROUP: + ec = group_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::GROUPESCAPED: + ec = group_escaped_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::CHARSET: + ec = charset_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::CHARSETESCAPED: + ec = charset_escaped_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::QUANTIFIER: + ec = quantifier_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::END: + ec = end_state_transition(state, it, wildcard_str); break; default: - throw runtime_error("Entered illegal regex pattern state " + state); + ec = ErrorCode::IllegalState; break; } + if (ec) { + return ec; + } + ++it; } // Do the final state check and clean up - // TODO: in the future there may be a need to backtrack to a previous scan 
position and rescan - // from a different state. - regexPatternStateFinalCheck(wildcardStr, state, currQuantifier); + ec = final_state_cleanup(state, it, wildcard_str); + if (ec) { + return ec; + } - return wildcardStr; + return wildcard_str; } -void regexPatternStateFinalCheck( - string& wildcardStr, - RegexPatternState& state, - string& currQuantifier -) { - switch (state) { - case DOT: - // The last character is a single `.`, without the possibility of becoming a - // multichar wildcard - wildcardStr += SingleCharWildcard; +auto normal_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + char const ch = *it; + auto const& config = state.get_config(); + switch (ch) { + case '.': + state.set_next_state(TranslatorState::RegexPatternState::DOT); + break; + case cEscapeChar: + state.set_next_state(TranslatorState::RegexPatternState::ESCAPED); break; - case ESCAPED: - throw invalid_argument("Incomplete escape sequence at the end."); + case '(': + state.inc_nested_group_count(); + state.mark_iterator(it + 1); // Mark the beginning of group expression + state.set_next_state(TranslatorState::RegexPatternState::GROUP); break; - case GROUP: - throw invalid_argument("Unmatched closing `)` at the end."); + case '[': + state.mark_iterator(it + 1); // Mark the beginning of charset expression + state.set_next_state(TranslatorState::RegexPatternState::CHARSET); break; - case CHARSET: - throw invalid_argument("Unmatched closing `]` at the end."); + case '{': + state.reset_quantifiers(); + state.set_next_state(TranslatorState::RegexPatternState::QUANTIFIER); break; - case QUANTIFIER: - // Not a valid quantifier expression due to no closing curly bracket, but - // everything inside the bracket is purely numeric, so append directly. 
- wildcardStr += '{'; - wildcardStr += currQuantifier; + case cRegexEndAnchor: + if (!config.allow_anchors()) { + return ErrorCode::Dollar; + } + state.set_next_state(TranslatorState::RegexPatternState::END); break; + case '*': + return ErrorCode::Star; + case '+': + return ErrorCode::Plus; + case '?': + return ErrorCode::Question; + case '|': + return ErrorCode::Pipe; + case cRegexStartAnchor: + return ErrorCode::Caret; + case ')': + return ErrorCode::UnmatchedParenthesis; default: + wildcard_str += ch; + state.set_preceding_token(ch); break; } - if (END != state) { - wildcardStr += ZeroOrMoreCharsWildcard; + return ErrorCode::Success; +} + +auto dot_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + switch (*it) { + case '*': + // .* gets translated to * + wildcard_str += cZeroOrMoreCharsWildcard; + state.invalidate_preceding_token(); + break; + case '+': + // .+ gets translated to ?* + wildcard_str = wildcard_str + cSingleCharWildcard + cZeroOrMoreCharsWildcard; + state.invalidate_preceding_token(); + break; + default: + // . gets translated to ? + wildcard_str += cSingleCharWildcard; + state.set_preceding_token(cSingleCharWildcard); + // Backtrack the scan by one position to handle the current char in the next iteration. 
+ --it; + break; + } + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + return ErrorCode::Success; +} + +auto escaped_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + char const ch = *it; + if (!cRegexEscapeSeqAcceptedMetaChars.at(ch)) { + return ErrorCode::DisallowedEscapeSequence; + } + if (cRegexEscapeSeqWildcardOnlyMetaChars.at(ch)) { + // Need to keep the backslash for characters that are special in the wildcard syntax too + string const escape_seq = string{cEscapeChar} + ch; + wildcard_str += escape_seq; + state.set_preceding_token(escape_seq); + } else { + wildcard_str += ch; + state.set_preceding_token(ch); } + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + return ErrorCode::Success; } +auto group_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + char const ch = *it; + if (cEscapeChar == ch) { + state.set_next_state(TranslatorState::RegexPatternState::GROUPESCAPED); + return ErrorCode::Success; + } + // TODO: make the group unrolling iterative + if ('(' == ch) { + state.inc_nested_group_count(); + return ErrorCode::Success; + } + if (')' != ch) { + return ErrorCode::Success; + } + auto num_nested_group = state.dec_nested_group_count(); + if (num_nested_group.has_error()) { + return num_nested_group.error(); + } + if (num_nested_group.value() > 0) { + // Still within nested group + return ErrorCode::Success; + } -string regexTrimLineAnchors(string const& regexStr) { - const int lastIdx = regexStr.length() - 1; + // End of group: translate the captured group expression. + // capture group should not enable anchors or prefix/suffix wildcards. 
+ string const captured_group(state.get_marked_iterator(), it); + auto config{state.get_config()}; + config.set_allow_anchors(false); + config.set_add_prefix_suffix_wildcards(false); + + // Perform translation + auto translated_group = regex_to_wildcard(captured_group, config); + if (translated_group.has_error()) { + return translated_group.error(); + } + + wildcard_str += translated_group.value(); + state.set_preceding_token(translated_group.value()); + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + return ErrorCode::Success; +} - int beginPos = 0; - int endPos = lastIdx; +auto group_escaped_state_transition( + TranslatorState& state, + string_view::const_iterator& /*it*/, + string& /*wildcard_str*/ +) -> error_code { + // Defer the handling of escape sequences to entire capture group translation. + state.set_next_state(TranslatorState::RegexPatternState::GROUP); + return ErrorCode::Success; +} - // Find the position of the first non-caret character - while (beginPos <= endPos && RegexStartAnchor == regexStr.at(beginPos)) { - ++beginPos; +auto charset_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + char const ch = *it; + string_view::const_iterator const& charset_start = state.get_marked_iterator(); + size_t const charset_len = it - charset_start; + if (cEscapeChar == ch) { + state.set_next_state(TranslatorState::RegexPatternState::CHARSETESCAPED); + return ErrorCode::Success; + } + if (charset_len > 2) { + // Short circuit: the currently accepted charset is at most 2-char long. + return ErrorCode::UnsupportedCharsets; + } + if (']' != ch) { + return ErrorCode::Success; } - // Backtrack one char to include at least one start anchor, if there was any. 
- if (beginPos > 0) { - --beginPos; + if (0 == charset_len) { + // Empty charset + return ErrorCode::UnsupportedCharsets; } - // Find the position of the last non-dollar-sign character - while (beginPos <= endPos && RegexEndAnchor == regexStr.at(endPos)) { - --endPos; + // End of charset: perform analysis on accepted charset patterns. + char const ch0 = *charset_start; + char const ch1 = *(charset_start + 1); + auto config{state.get_config()}; + char parsed_char{}; + + if (1 == charset_len) { + if (cCharsetNegate == ch0 || cEscapeChar == ch0) { + return ErrorCode::UnsupportedCharsets; + } + parsed_char = ch0; + } else { // 2 == charset_len + if (cEscapeChar == ch0 && static_cast<unsigned char>(ch1) < cCharBitarraySize && cRegexCharsetEscapeSeqMetaChars.at(ch1)) { + // 2-char escape sequence + parsed_char = ch1; + } else if (config.case_insensitive_wildcard() + && matching_upper_lower_case_char_pair(ch0, ch1)) + { + // case-insensitive patterns like [aA] [Bb] etc. + parsed_char = ch0 > ch1 ? ch0 : ch1; // Get the lower case char + } else { + return ErrorCode::UnsupportedCharsets; + } } - if (endPos < lastIdx) { - // There was at least one end anchor so we include it by advancing one char - ++endPos; + + // Add the parsed character to the string; bytes outside the ASCII lookup table never need escaping + if (static_cast<unsigned char>(parsed_char) < cCharBitarraySize && cRegexEscapeSeqWildcardOnlyMetaChars.at(parsed_char)) { + auto escaped_char = string{cEscapeChar} + parsed_char; + wildcard_str += escaped_char; + state.set_preceding_token(escaped_char); + } else { + wildcard_str += parsed_char; + state.set_preceding_token(parsed_char); } + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + return ErrorCode::Success; +} - // If there was more than one end anchor, we need to check if the current end anchor is escaped. - // If so, it's not a real end anchor, and we need to advance the end position once more to - // append a real end anchor. 
- string trimmedRegexStr = regexStr.substr(beginPos, endPos - beginPos + 1); - if (endPos < lastIdx && !regexHasEndAnchor(trimmedRegexStr)) { - trimmedRegexStr += RegexEndAnchor; +auto matching_upper_lower_case_char_pair(char ch0, char ch1) -> bool { + int const upper_lower_case_ascii_offset = 'a' - 'A'; + return (is_alphabet(ch0) && is_alphabet(ch1) + && ((ch0 - ch1 == upper_lower_case_ascii_offset) + || (ch1 - ch0 == upper_lower_case_ascii_offset))); +} + +auto charset_escaped_state_transition( + TranslatorState& state, + string_view::const_iterator& /*it*/, + string& /*wildcard_str*/ +) -> error_code { + // Defer the handling of escape sequences to entire character set analysis. + state.set_next_state(TranslatorState::RegexPatternState::CHARSET); + return ErrorCode::Success; +} + +auto quantifier_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + char const ch = *it; + if ('-' == ch && state.quantifier_number_start()) { + // Disallow negative quantifiers + return ErrorCode::UnsupportedQuantifier; } - return trimmedRegexStr; + if (',' == ch) { + // Expecting a pair of quantifiers + state.switch_to_second_quantifier(); + } else if (is_decimal_digit(ch)) { + // Is a regular decimal digit + state.add_to_quantifier(ch); + } else if ('}' != ch) { + // Invalid quantifier syntax. In such case, the special meaning of `{` is suppressed. + // So far we've only seen opening bracket/digits/comma, so append directly. + append_incomplete_quantifier_structure(state, wildcard_str); + // Backtrack the scan by one position to handle the current char in the next iteration. + --it; + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + } else { + // Quantifier expression complete. 
Perform repetition + auto quantifier = state.get_quantifier(); + if (quantifier.has_error()) { + return quantifier.error(); + } + auto prev_token = state.get_preceding_token(); + if (prev_token.has_error()) { + return prev_token.error(); + } + + size_t const q_val = quantifier.value(); + string const token = prev_token.value(); + if (0 == q_val) { + // Zero repetition removes the token from the string + wildcard_str.erase(wildcard_str.length() - token.length()); + } else { + // Repeat the token for n-1 times + for (size_t i{1}; i < q_val; ++i) { + wildcard_str += token; + } + } + // Compound repetition is not allowed. + state.invalidate_preceding_token(); + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + } + return ErrorCode::Success; } -bool regexHasStartAnchor(string const& regexStr) { - return !regexStr.empty() && RegexStartAnchor == regexStr.at(0); +auto end_state_transition( + TranslatorState& /*state*/, + string_view::const_iterator& it, + string& /*wildcard_str*/ +) -> error_code { + if (cRegexEndAnchor != *it) { + return ErrorCode::Dollar; + } + return ErrorCode::Success; } -bool regexHasEndAnchor(string const& regexStr) { - int len = regexStr.length(); - if (len <= 0 || RegexEndAnchor != regexStr.back()) { - return false; +auto final_state_cleanup( + TranslatorState& state, + string_view::const_iterator& /*it*/, + string& wildcard_str +) -> error_code { + switch (state.get_state()) { + case TranslatorState::RegexPatternState::DOT: + // The last character is a single `.`, without the possibility of becoming a + // multichar wildcard + wildcard_str += cSingleCharWildcard; + break; + case TranslatorState::RegexPatternState::ESCAPED: + return ErrorCode::DisallowedEscapeSequence; + case TranslatorState::RegexPatternState::GROUP: + case TranslatorState::RegexPatternState::GROUPESCAPED: + return ErrorCode::UnmatchedParenthesis; + case TranslatorState::RegexPatternState::CHARSET: + return ErrorCode::IncompleteCharsetStructure; + case 
TranslatorState::RegexPatternState::QUANTIFIER: + append_incomplete_quantifier_structure(state, wildcard_str); + break; + default: + break; } - // Check that ending regex dollar sigh char is unescaped. - // We need to scan the suffix until we encounter a character that is not an - // escape char, since escape chars can escape themselves. - bool escaped = false; - for (int idx = len - 2; idx >= 0 && EscapeChar == regexStr.at(idx); --idx) { - escaped = !escaped; + auto const& config = state.get_config(); + if (TranslatorState::RegexPatternState::END != state.get_state() + && config.add_prefix_suffix_wildcards()) + { + wildcard_str += cZeroOrMoreCharsWildcard; } - return !escaped; + return ErrorCode::Success; +} + +void append_incomplete_quantifier_structure(TranslatorState& state, string& wildcard_str) { + // Invalid quantifier syntax. So far we've only seen digits/comma so append directly. + string const invalid_quantifier_str = string{'{'} + state.get_quantifier_as_str(); + wildcard_str += invalid_quantifier_str; + state.set_preceding_token(invalid_quantifier_str.back()); } } // namespace clp::regex_utils diff --git a/components/core/src/clp/regex_utils/regex_utils.hpp b/components/core/src/clp/regex_utils/regex_utils.hpp index d831250f4..2d1bf43f0 100644 --- a/components/core/src/clp/regex_utils/regex_utils.hpp +++ b/components/core/src/clp/regex_utils/regex_utils.hpp @@ -2,37 +2,48 @@ #define CLP_REGEX_UTILS_REGEX_UTILS_HPP #include +#include + +#include +#include + +#include "regex_utils/RegexToWildcardTranslatorConfig.hpp" namespace clp::regex_utils { -std::string regexToWildcard(std::string const& regexStr); +[[nodiscard]] auto regex_to_wildcard(std::string_view regex_str +) -> BOOST_OUTCOME_V2_NAMESPACE::std_result; + +[[nodiscard]] auto regex_to_wildcard( + std::string_view regex_str, + RegexToWildcardTranslatorConfig const& config +) -> BOOST_OUTCOME_V2_NAMESPACE::std_result; /** * If a regex expression contains multiple starting or ending anchors, remove 
the duplicates. * - * @param regexStr + * @param regex_str * @return Trimmed the regex string, leaving at most one starting or ending anchor. */ -std::string regexTrimLineAnchors(std::string const& regexStr); +[[nodiscard]] auto regex_trim_line_anchors(std::string_view regex_str) -> std::string; /** * Check if a regex string has a starting anchor character `^` (caret). * - * @param regexStr + * @param regex_str * @return True if the regex string begins with `^`, false otherwise. */ -bool regexHasStartAnchor(std::string const& regexStr); - +[[nodiscard]] auto regex_has_start_anchor(std::string_view regex_str) -> bool; /** * Check if a regex string has an ending anchor character `$` (dollar sign). * Note that the regex string may end with an escaped `$`, in which case the `$` character retain * its literal meaning. * - * @param regexStr + * @param regex_str * @return True if the regex string ends with an unescaped `$`, false otherwise. */ -bool regexHasEndAnchor(std::string const& regexStr); +[[nodiscard]] auto regex_has_end_anchor(std::string_view regex_str) -> bool; } // namespace clp::regex_utils #endif // CLP_REGEX_UTILS_REGEX_UTILS_HPP diff --git a/components/core/src/clp/regex_utils/regex_utils_anchors.cpp b/components/core/src/clp/regex_utils/regex_utils_anchors.cpp new file mode 100644 index 000000000..a204a3cfc --- /dev/null +++ b/components/core/src/clp/regex_utils/regex_utils_anchors.cpp @@ -0,0 +1,64 @@ +#include +#include + +#include "regex_utils/constants.hpp" +#include "regex_utils/regex_utils.hpp" + +using std::string; +using std::string_view; + +namespace clp::regex_utils { + +auto regex_trim_line_anchors(string_view regex_str) -> string { + string_view::const_iterator left(regex_str.begin()); + string_view::const_iterator right(regex_str.end()); + + // Find the position of the first non-caret character + while (left != right && cRegexStartAnchor == *left) { + ++left; + } + // Backtrack one char to include at least one start anchor, if there was any. 
+ if (left != regex_str.begin()) { + --left; + } + + // Find the position of the last non-dollar-sign character + while (left != right && cRegexEndAnchor == *(right - 1)) { + --right; + } + if (left != right && right != regex_str.end()) { + // There was at least one end anchor so we include it by advancing one char + ++right; + } + + // If there was more than one end anchor, we need to check if the current end anchor is escaped. + // If so, it's not a real end anchor, and we need to advance the end position once more to + // append a real end anchor. + string trimmed_regex_str(left, right); + if (right != regex_str.end() && !regex_has_end_anchor(trimmed_regex_str)) { + trimmed_regex_str += cRegexEndAnchor; + } + return trimmed_regex_str; +} + +auto regex_has_start_anchor(string_view regex_str) -> bool { + return !regex_str.empty() && cRegexStartAnchor == regex_str.at(0); +} + +auto regex_has_end_anchor(string_view regex_str) -> bool { + auto it{regex_str.rbegin()}; + if (it == regex_str.rend() || cRegexEndAnchor != *it) { + return false; + } + + // Check that ending regex dollar sigh char is unescaped. + // We need to scan the suffix until we encounter a character that is not an + // escape char, since escape chars can escape themselves. 
+ bool escaped{false}; + for (++it; it != regex_str.rend() && cEscapeChar == *it; ++it) { + escaped = !escaped; + } + return !escaped; +} + +} // namespace clp::regex_utils diff --git a/components/core/tests/test-regex_utils.cpp b/components/core/tests/test-regex_utils.cpp index 7eb36308b..2e36e186b 100644 --- a/components/core/tests/test-regex_utils.cpp +++ b/components/core/tests/test-regex_utils.cpp @@ -1,64 +1,229 @@ #include +#include #include +#include +#include #include -using clp::regex_utils::regexToWildcard; -using clp::regex_utils::regexTrimLineAnchors; -using clp::regex_utils::regexHasStartAnchor; -using clp::regex_utils::regexHasEndAnchor; +using clp::regex_utils::regex_has_end_anchor; +using clp::regex_utils::regex_has_start_anchor; +using clp::regex_utils::regex_to_wildcard; +using clp::regex_utils::regex_trim_line_anchors; -TEST_CASE("regexToWildcard", "[regex_utils][regexToWildcard]") { +TEST_CASE("regex_to_wildcard", "[regex_utils][regex_to_wildcard]") { // Test empty string - REQUIRE(regexToWildcard("") == ""); - - // Test anchors - REQUIRE(regexToWildcard("^") == "*"); - REQUIRE(regexToWildcard("$") == "*"); - REQUIRE(regexToWildcard("xyz") == "*xyz*"); + REQUIRE(regex_to_wildcard("").value() == ""); // Test simple wildcard translations - REQUIRE(regexToWildcard("^xyz$") == "xyz"); - REQUIRE(regexToWildcard("^. xyz .* zyx .$") == "? xyz * zyx ?"); - REQUIRE(regexToWildcard("^. xyz .* zyx .*$") == "? xyz * zyx *"); + REQUIRE(regex_to_wildcard("^xyz$").value() == "xyz"); + REQUIRE(regex_to_wildcard("xyz").value() == "xyz"); + REQUIRE(regex_to_wildcard(". xyz .* zyx .").value() == "? xyz * zyx ?"); + REQUIRE(regex_to_wildcard(". xyz .+ zyx .*").value() == "? xyz ?* zyx *"); // Test unescaped meta characters - REQUIRE_THROWS_AS(regexToWildcard("^. xyz ^.* zyx .$"), std::invalid_argument); - REQUIRE_THROWS_AS(regexToWildcard("^. xyz |.* zyx .$"), std::runtime_error); - REQUIRE_THROWS_AS(regexToWildcard("^. 
xyz ?.* zyx .$"), std::invalid_argument); - REQUIRE_THROWS_AS(regexToWildcard("^. xyz .** zyx .$"), std::invalid_argument); - REQUIRE_THROWS_AS(regexToWildcard("^. xyz .*+ zyx .$"), std::invalid_argument); + REQUIRE(regex_to_wildcard(".? xyz .* zyx .").error() == clp::regex_utils::ErrorCode::Question); + REQUIRE(regex_to_wildcard(". xyz .** zyx .").error() == clp::regex_utils::ErrorCode::Star); + REQUIRE(regex_to_wildcard(". xyz .*+ zyx .").error() == clp::regex_utils::ErrorCode::Plus); + REQUIRE(regex_to_wildcard(". xyz |.* zyx .").error() == clp::regex_utils::ErrorCode::Pipe); + REQUIRE(regex_to_wildcard(". xyz ^.* zyx .").error() == clp::regex_utils::ErrorCode::Caret); + + // Test properly escaped meta characters + REQUIRE(regex_to_wildcard("\\^\\$\\.\\*\\{\\}\\[\\]\\(\\)\\+\\|\\?\\<\\>\\-\\_\\/\\=\\!\\\\") + .value() + == "^$.\\*{}[]()+|\\?<>-_/=!\\\\"); + REQUIRE(regex_to_wildcard("abc\\Qdefghi\\Ejkl").error() + == clp::regex_utils::ErrorCode::DisallowedEscapeSequence); + + // Test quantifiers + REQUIRE(regex_to_wildcard("abc{3}").value() == "abccc"); + REQUIRE(regex_to_wildcard("abc{4}").value() == "abcccc"); + REQUIRE(regex_to_wildcard("abc{0}").value() == "ab"); + REQUIRE(regex_to_wildcard("abc.{4}").value() == "abc????"); + REQUIRE(regex_to_wildcard("abc\\[{4}").value() == "abc[[[["); + REQUIRE(regex_to_wildcard("abc\\^{4}").value() == "abc^^^^"); + REQUIRE(regex_to_wildcard("abc\\*{4}").value() == "abc\\*\\*\\*\\*"); + REQUIRE(regex_to_wildcard("abc\\?{4}").value() == "abc\\?\\?\\?\\?"); + REQUIRE(regex_to_wildcard("abc{123").value() == "abc{123"); + REQUIRE(regex_to_wildcard("abc{123,456").value() == "abc{123,456"); + REQUIRE(regex_to_wildcard("abc{00123\\*").value() == "abc{00123\\*"); + REQUIRE(regex_to_wildcard("abc{3,4{{{{3}").value() == "abc{3,4{{{{{"); + REQUIRE(regex_to_wildcard("abc{3,4{3,4{3,{3}").value() == "abc{3,4{3,4{3,,,"); + REQUIRE(regex_to_wildcard("abc{3,4{3,4{3,4{3}").value() == "abc{3,4{3,4{3,444"); + 
REQUIRE(regex_to_wildcard("abc{3,4{3,4{3,4.*").value() == "abc{3,4{3,4{3,4*"); + REQUIRE(regex_to_wildcard("abc{3,4{3,4{3,4\\[a-z]").value() == "abc{3,4{3,4{3,4[a-z]"); + REQUIRE(regex_to_wildcard("abc{3,4{3,4{3,4\\*{4}").value() == "abc{3,4{3,4{3,4\\*\\*\\*\\*"); + + REQUIRE(regex_to_wildcard("abc{-3}").error() + == clp::regex_utils::ErrorCode::UnsupportedQuantifier); + REQUIRE(regex_to_wildcard("abc{3,4}").error() + == clp::regex_utils::ErrorCode::UnsupportedQuantifier); + + REQUIRE(regex_to_wildcard("{3}abc").error() == clp::regex_utils::ErrorCode::TokenUnquantifiable + ); + REQUIRE(regex_to_wildcard("abc{3}{3}").error() + == clp::regex_utils::ErrorCode::TokenUnquantifiable); + REQUIRE(regex_to_wildcard("abc.*{3}").error() + == clp::regex_utils::ErrorCode::TokenUnquantifiable); + REQUIRE(regex_to_wildcard("abc.+{3}").error() + == clp::regex_utils::ErrorCode::TokenUnquantifiable); + + // Test grouping and quantifiers + REQUIRE(regex_to_wildcard("(xyz)").value() == "xyz"); + REQUIRE(regex_to_wildcard("abc (xyz) def").value() == "abc xyz def"); + REQUIRE(regex_to_wildcard("abc () def").value() == "abc def"); + REQUIRE(regex_to_wildcard("abc (. xyz .+ zyx .*){2} def").value() + == "abc ? xyz ?* zyx *? xyz ?* zyx * def"); + REQUIRE(regex_to_wildcard("abc (.{3} xyz .+ zyx .*){2} def").value() + == "abc ??? xyz ?* zyx *??? xyz ?* zyx * def"); + REQUIRE(regex_to_wildcard("abc (\\)){2} def").value() == "abc )) def"); + REQUIRE(regex_to_wildcard("abc (\\)\\*){2} def").value() == "abc )\\*)\\* def"); + REQUIRE(regex_to_wildcard("abc (x(\\*){3}z){2} def").value() == "abc x\\*\\*\\*zx\\*\\*\\*z def" + ); + + REQUIRE(regex_to_wildcard("abc (. 
xyz .+ zyx .*{2} def").error() + == clp::regex_utils::ErrorCode::UnmatchedParenthesis); + REQUIRE(regex_to_wildcard("abc (x(\\*{3}z){2} def").error() + == clp::regex_utils::ErrorCode::UnmatchedParenthesis); + REQUIRE(regex_to_wildcard("abc (x(\\*){3}z{2} def").error() + == clp::regex_utils::ErrorCode::UnmatchedParenthesis); + REQUIRE(regex_to_wildcard("abc x(\\*){3}z){2} def").error() + == clp::regex_utils::ErrorCode::UnmatchedParenthesis); + REQUIRE(regex_to_wildcard("abc (x\\*){3}z){2} def").error() + == clp::regex_utils::ErrorCode::UnmatchedParenthesis); + REQUIRE(regex_to_wildcard("abc (abc | def){2} def").error() == clp::regex_utils::ErrorCode::Pipe + ); + REQUIRE(regex_to_wildcard("abc (* xyz .+ zyx .*){2} def").error() + == clp::regex_utils::ErrorCode::Star); + REQUIRE(regex_to_wildcard("abc (+ xyz .+ zyx .*){2} def").error() + == clp::regex_utils::ErrorCode::Plus); + REQUIRE(regex_to_wildcard("abc (.{3}{3} xyz .+ zyx .*){2} def").error() + == clp::regex_utils::ErrorCode::TokenUnquantifiable); + REQUIRE(regex_to_wildcard("abc (. 
xyz .+{3} zyx .*){2} def").error() + == clp::regex_utils::ErrorCode::TokenUnquantifiable); + + // Test charset and quantifiers + REQUIRE(regex_to_wildcard("x[y]z").value() == "xyz"); + REQUIRE(regex_to_wildcard("x[y]{2}z").value() == "xyyz"); + REQUIRE(regex_to_wildcard("x[+]{2}z").value() == "x++z"); + REQUIRE(regex_to_wildcard("x[-]{2}z").value() == "x--z"); + REQUIRE(regex_to_wildcard("x[|]{2}z").value() == "x||z"); + REQUIRE(regex_to_wildcard("x[\\-]{2}z").value() == "x--z"); + REQUIRE(regex_to_wildcard("x[\\^]{2}z").value() == "x^^z"); + REQUIRE(regex_to_wildcard("x[\\]]{2}z").value() == "x]]z"); + REQUIRE(regex_to_wildcard("x[*]{2}z").value() == "x\\*\\*z"); + REQUIRE(regex_to_wildcard("x[?]{2}z").value() == "x\\?\\?z"); + REQUIRE(regex_to_wildcard("x[\\\\]{2}z").value() == "x\\\\\\\\z"); + + REQUIRE(regex_to_wildcard("abc (x[*]{2}z){2} def").value() == "abc x\\*\\*zx\\*\\*z def"); + REQUIRE(regex_to_wildcard("abc (x[\\]]{2}z){2} def").value() == "abc x]]zx]]z def"); + + REQUIRE(regex_to_wildcard("x[aA").error() + == clp::regex_utils::ErrorCode::IncompleteCharsetStructure); + REQUIRE(regex_to_wildcard("x[]{2}z").error() == clp::regex_utils::ErrorCode::UnsupportedCharsets + ); + REQUIRE(regex_to_wildcard("x[^]{2}z").error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets); + REQUIRE(regex_to_wildcard("x[\\]{2}z").error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets); + + // Need to set case-insensitive wildcard config for the following to work + REQUIRE(regex_to_wildcard("[aA]").error() == clp::regex_utils::ErrorCode::UnsupportedCharsets); + REQUIRE(regex_to_wildcard("[Aa]").error() == clp::regex_utils::ErrorCode::UnsupportedCharsets); + REQUIRE(regex_to_wildcard("[Ee][Xx][Cc][Ee][Pp][Tt][Ii][Oo][Nn]").error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets); + REQUIRE(regex_to_wildcard("[eE][Xx][cC][eE][pP][Tt][iI][Oo]{2}[Nn]").error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets); +} + +TEST_CASE( + 
"regex_to_wildcard_case_insensitive_wildcard", + "[regex_utils][regex_to_wildcard][case_insensitive_wildcard]" +) { + clp::regex_utils::RegexToWildcardTranslatorConfig config; + config.set_case_insensitive_wildcard(true); + + REQUIRE(regex_to_wildcard("[aA]", config).value() == "a"); + REQUIRE(regex_to_wildcard("[Aa]", config).value() == "a"); + REQUIRE(regex_to_wildcard("[Aa][pP]{2}[Ll][eE]", config).value() == "apple"); + REQUIRE(regex_to_wildcard("[Ee][Xx][Cc][Ee][Pp][Tt][Ii][Oo][Nn]", config).value() == "exception" + ); + REQUIRE(regex_to_wildcard("[eE][Xx][cC][eE][pP][Tt][iI][Oo]{2}[Nn]", config).value() + == "exceptioon"); + + REQUIRE(regex_to_wildcard("[eE][Xx][cC][eE][pP][Tk][iI][Oo][Nn]", config).error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets); + + // The other test cases should not be affected + REQUIRE(regex_to_wildcard("x[y]z", config).value() == "xyz"); + REQUIRE(regex_to_wildcard("x[y]{2}z", config).value() == "xyyz"); + REQUIRE(regex_to_wildcard("x[+]{2}z", config).value() == "x++z"); + REQUIRE(regex_to_wildcard("x[-]{2}z", config).value() == "x--z"); + REQUIRE(regex_to_wildcard("x[|]{2}z", config).value() == "x||z"); + REQUIRE(regex_to_wildcard("x[\\-]{2}z", config).value() == "x--z"); + REQUIRE(regex_to_wildcard("x[\\^]{2}z", config).value() == "x^^z"); + REQUIRE(regex_to_wildcard("x[\\]]{2}z", config).value() == "x]]z"); + REQUIRE(regex_to_wildcard("x[*]{2}z", config).value() == "x\\*\\*z"); + REQUIRE(regex_to_wildcard("x[?]{2}z", config).value() == "x\\?\\?z"); + REQUIRE(regex_to_wildcard("x[\\\\]{2}z", config).value() == "x\\\\\\\\z"); + + REQUIRE(regex_to_wildcard("abc (x[*]{2}z){2} def", config).value() == "abc x\\*\\*zx\\*\\*z def" + ); + REQUIRE(regex_to_wildcard("abc (x[\\]]{2}z){2} def", config).value() == "abc x]]zx]]z def"); + + REQUIRE(regex_to_wildcard("x[]{2}z", config).error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets); + REQUIRE(regex_to_wildcard("x[^]{2}z", config).error() + == 
clp::regex_utils::ErrorCode::UnsupportedCharsets); + REQUIRE(regex_to_wildcard("x[\\]{2}z", config).error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets); } -TEST_CASE("regexTrimLineAnchors", "[regex_utils][regexTrimLineAnchors]") { - REQUIRE(regexTrimLineAnchors("") == ""); - REQUIRE(regexTrimLineAnchors("^^^hello$$$") == "^hello$"); - REQUIRE(regexTrimLineAnchors("^^\\^hello$$$") == "^\\^hello$"); - REQUIRE(regexTrimLineAnchors("^^^hello\\$$$") == "^hello\\$$"); - REQUIRE(regexTrimLineAnchors("^^\\^hello\\$$$") == "^\\^hello\\$$"); - REQUIRE(regexTrimLineAnchors("^^^hello\\\\\\\\\\\\\\$$$") == "^hello\\\\\\\\\\\\\\$$"); - REQUIRE(regexTrimLineAnchors("^^^\\\\goodbye\\\\\\\\\\\\$$$") == "^\\\\goodbye\\\\\\\\\\\\$"); +TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][regex_to_wildcard][anchor_config]") { + // Test anchors and prefix/suffix wildcards + clp::regex_utils::RegexToWildcardTranslatorConfig config; + config.set_add_prefix_suffix_wildcards(true); + REQUIRE(regex_to_wildcard("^", config).value() == "*"); + REQUIRE(regex_to_wildcard("$", config).value() == "*"); + REQUIRE(regex_to_wildcard("^xyz$", config).value() == "xyz"); + REQUIRE(regex_to_wildcard("xyz", config).value() == "*xyz*"); + + // Test in groups + REQUIRE(regex_to_wildcard("xyz(. xyz .* zyx .)zyx", config).value() == "*xyz? xyz * zyx ?zyx*"); + REQUIRE(regex_to_wildcard("xyz(^. xyz .* zyx .)zyx", config).error() + == clp::regex_utils::ErrorCode::Caret); + REQUIRE(regex_to_wildcard("xyz(. 
xyz .* zyx .$)zyx", config).error() + == clp::regex_utils::ErrorCode::Dollar); } -TEST_CASE("regexHasStartAnchor", "[regex_utils][regexHasStartAnchor]") { - REQUIRE_FALSE(regexHasStartAnchor("")); - REQUIRE(regexHasStartAnchor("^hello$")); - REQUIRE_FALSE(regexHasStartAnchor("\\^hello$")); - REQUIRE(regexHasStartAnchor("^hello\\$")); - REQUIRE_FALSE(regexHasStartAnchor("\\^hello\\$")); - REQUIRE(regexHasStartAnchor("^hello\\\\\\\\\\\\\\$")); - REQUIRE(regexHasStartAnchor("^\\\\goodbye\\\\\\\\\\\\$")); +TEST_CASE("regex_trim_line_anchors", "[regex_utils][regex_trim_line_anchors]") { + REQUIRE(regex_trim_line_anchors("") == ""); + REQUIRE(regex_trim_line_anchors("^^^hello$$$") == "^hello$"); + REQUIRE(regex_trim_line_anchors("^^\\^hello$$$") == "^\\^hello$"); + REQUIRE(regex_trim_line_anchors("^^^hello\\$$$") == "^hello\\$$"); + REQUIRE(regex_trim_line_anchors("^^\\^hello\\$$$") == "^\\^hello\\$$"); + REQUIRE(regex_trim_line_anchors("^^^hello\\\\\\\\\\\\\\$$$") == "^hello\\\\\\\\\\\\\\$$"); + REQUIRE(regex_trim_line_anchors("^^^\\\\goodbye\\\\\\\\$$$") == "^\\\\goodbye\\\\\\\\$"); } -TEST_CASE("regexHasEndAnchor", "[regex_utils][regexHasEndAnchor]") { - REQUIRE_FALSE(regexHasEndAnchor("")); - REQUIRE(regexHasEndAnchor("^hello$")); - REQUIRE(regexHasEndAnchor("\\^hello$")); - REQUIRE_FALSE(regexHasEndAnchor("^hello\\$")); - REQUIRE_FALSE(regexHasEndAnchor("\\^hello\\$")); - REQUIRE_FALSE(regexHasEndAnchor("^hello\\\\\\\\\\\\\\$")); - REQUIRE(regexHasEndAnchor("^\\\\goodbye\\\\\\\\\\\\$")); - REQUIRE(regexHasEndAnchor("\\\\\\\\\\\\$")); - REQUIRE_FALSE(regexHasEndAnchor("\\\\\\\\\\\\\\$")); +TEST_CASE("regex_has_start_anchor", "[regex_utils][regex_has_start_anchor]") { + REQUIRE_FALSE(regex_has_start_anchor("")); + REQUIRE(regex_has_start_anchor("^hello$")); + REQUIRE_FALSE(regex_has_start_anchor("\\^hello$")); + REQUIRE(regex_has_start_anchor("^hello\\$")); + REQUIRE_FALSE(regex_has_start_anchor("\\^hello\\$")); + 
REQUIRE(regex_has_start_anchor("^hello\\\\\\\\\\\\\\$")); + REQUIRE(regex_has_start_anchor("^\\\\goodbye\\\\\\\\\\\\$")); } +TEST_CASE("regex_has_end_anchor", "[regex_utils][regex_has_end_anchor]") { + REQUIRE_FALSE(regex_has_end_anchor("")); + REQUIRE(regex_has_end_anchor("^hello$")); + REQUIRE(regex_has_end_anchor("\\^hello$")); + REQUIRE_FALSE(regex_has_end_anchor("^hello\\$")); + REQUIRE_FALSE(regex_has_end_anchor("\\^hello\\$")); + REQUIRE_FALSE(regex_has_end_anchor("^hello\\\\\\\\\\\\\\$")); + REQUIRE(regex_has_end_anchor("^\\\\goodbye\\\\\\\\\\\\$")); + REQUIRE(regex_has_end_anchor("\\\\\\\\\\\\$")); + REQUIRE_FALSE(regex_has_end_anchor("\\\\\\\\\\\\\\$")); +}