diff --git a/CHANGELOG.md b/CHANGELOG.md index b2c69b7a7..6695e9fc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,18 @@ This is a list of notable changes to Hyperscan, in reverse chronological order. +## [5.0.0] 2018-07-09 +- Introduce chimera hybrid engine of Hyperscan and PCRE, to fully support + PCRE syntax as well as to take advantage of the high performance nature of + Hyperscan. +- New API feature: logical combinations (AND, OR and NOT) of patterns in a + given pattern set. +- Windows porting: hsbench, hscheck, hscollider and hsdump tools now available + on Windows 8 or newer. +- Improve undirected graph implementation to avoid graph copy and reduce + compile time. +- Bugfix for issue #86: enable hscollider for installed PCRE package. + ## [4.7.0] 2018-01-24 - Introduced hscollider pattern testing tool, for validating Hyperscan match behaviour against PCRE. diff --git a/CMakeLists.txt b/CMakeLists.txt index 56f17c5b2..070002708 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,8 @@ cmake_minimum_required (VERSION 2.8.11) project (hyperscan C CXX) -set (HS_MAJOR_VERSION 4) -set (HS_MINOR_VERSION 7) +set (HS_MAJOR_VERSION 5) +set (HS_MINOR_VERSION 0) set (HS_PATCH_VERSION 0) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) @@ -154,7 +154,7 @@ if(MSVC OR MSVC_IDE) # todo: change these as required set(ARCH_C_FLAGS "/arch:AVX2") set(ARCH_CXX_FLAGS "/arch:AVX2") - set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 -D_CRT_SECURE_NO_WARNINGS") + set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 /wd4996 -D_CRT_SECURE_NO_WARNINGS") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 ${MSVC_WARNS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 ${MSVC_WARNS} /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD") endif() @@ -446,11 +446,32 @@ else() endif() add_subdirectory(util) -add_subdirectory(unit) add_subdirectory(doc/dev-reference) + +if (NOT WIN32) +# PCRE check, we 
have a fixed requirement for PCRE to use Chimera +# and hscollider +set(PCRE_REQUIRED_MAJOR_VERSION 8) +set(PCRE_REQUIRED_MINOR_VERSION 41) +set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION}) +include (${CMAKE_MODULE_PATH}/pcre.cmake) +if (NOT CORRECT_PCRE_VERSION) + message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} not found") +endif() + +# we need static libs for Chimera - too much deep magic for shared libs +if (CORRECT_PCRE_VERSION AND PCRE_BUILD_SOURCE AND BUILD_STATIC_LIBS) + set(BUILD_CHIMERA TRUE) +endif() + +add_subdirectory(unit) if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt) add_subdirectory(tools) endif() +if (EXISTS ${CMAKE_SOURCE_DIR}/chimera/CMakeLists.txt AND BUILD_CHIMERA) + add_subdirectory(chimera) +endif() +endif() # do substitutions configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h) @@ -479,6 +500,31 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") endif() +if (WIN32) +# PCRE check, we have a fixed requirement for PCRE to use Chimera +# and hscollider +set(PCRE_REQUIRED_MAJOR_VERSION 8) +set(PCRE_REQUIRED_MINOR_VERSION 41) +set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION}) +include (${CMAKE_MODULE_PATH}/pcre.cmake) +if (NOT CORRECT_PCRE_VERSION) + message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} not found") +endif() + +# we need static libs for Chimera - too much deep magic for shared libs +if (CORRECT_PCRE_VERSION AND PCRE_BUILD_SOURCE AND BUILD_STATIC_LIBS) + set(BUILD_CHIMERA TRUE) +endif() + +add_subdirectory(unit) +if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt) + add_subdirectory(tools) +endif() +if (EXISTS ${CMAKE_SOURCE_DIR}/chimera/CMakeLists.txt AND BUILD_CHIMERA) + add_subdirectory(chimera) +endif() +endif() + if(NOT WIN32) set(RAGEL_C_FLAGS "-Wno-unused") endif() @@ -860,7 +906,6 @@ SET (hs_compile_SRCS src/nfagraph/ng_stop.h 
src/nfagraph/ng_uncalc_components.cpp
     src/nfagraph/ng_uncalc_components.h
-    src/nfagraph/ng_undirected.h
     src/nfagraph/ng_utf8.cpp
     src/nfagraph/ng_utf8.h
     src/nfagraph/ng_util.cpp
@@ -915,6 +960,8 @@ SET (hs_compile_SRCS
     src/parser/check_refs.h
     src/parser/control_verbs.cpp
     src/parser/control_verbs.h
+    src/parser/logical_combination.cpp
+    src/parser/logical_combination.h
     src/parser/parse_error.cpp
     src/parser/parse_error.h
     src/parser/parser_util.cpp
@@ -1014,6 +1061,7 @@ SET (hs_compile_SRCS
     src/util/graph.h
     src/util/graph_range.h
     src/util/graph_small_color_map.h
+    src/util/graph_undirected.h
     src/util/hash.h
     src/util/hash_dynamic_bitset.h
     src/util/insertion_ordered.h
diff --git a/chimera/CMakeLists.txt b/chimera/CMakeLists.txt
new file mode 100644
index 000000000..1cd66a3f5
--- /dev/null
+++ b/chimera/CMakeLists.txt
@@ -0,0 +1,53 @@
+# Chimera lib: the static hybrid Hyperscan/PCRE matching library.
+
+include_directories(${PCRE_INCLUDE_DIRS})
+
+# only set these after all tests are done
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
+
+# Public API headers; installed into the "hs" include subdirectory.
+set(chimera_HEADERS
+    ch.h
+    ch_common.h
+    ch_compile.h
+    ch_runtime.h
+)
+install(FILES ${chimera_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs")
+
+set(chimera_SRCS
+    ${chimera_HEADERS}
+    ch_alloc.c
+    ch_alloc.h
+    ch_compile.cpp
+    ch_database.c
+    ch_database.h
+    ch_internal.h
+    ch_runtime.c
+    ch_scratch.h
+    ch_scratch.c
+)
+
+add_library(chimera STATIC ${chimera_SRCS})
+add_dependencies(chimera hs pcre)
+target_link_libraries(chimera hs pcre)
+
+install(TARGETS chimera DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+if (NOT WIN32)
+    # expand out library names for pkgconfig static link info
+    foreach (LIB ${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES})
+        # this is fragile, but protects us from toolchain specific files
+        if (NOT EXISTS ${LIB})
+            set(PRIVATE_LIBS "${PRIVATE_LIBS} -l${LIB}")
+        endif()
+    endforeach()
+    set(PRIVATE_LIBS "${PRIVATE_LIBS} -L${LIBDIR} -lpcre")
+
+    # configure_file() writes a relative output path under
+    # CMAKE_CURRENT_BINARY_DIR, so install the generated file from there
+    # instead of assuming this directory is <top-level build dir>/chimera.
+    configure_file(libch.pc.in libch.pc @ONLY) # only replace @ quoted vars
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libch.pc
+            DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+endif()
diff --git a/chimera/ch.h b/chimera/ch.h
new file mode 100644
index 000000000..9838f0da2
--- /dev/null
+++ b/chimera/ch.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CH_H_
+#define CH_H_
+
+/**
+ * @file
+ * @brief The complete Chimera API definition.
+ *
+ * Chimera is a hybrid solution of Hyperscan and PCRE.
+ * + * This header includes both the Chimera compiler and runtime components. See + * the individual component headers for documentation. + */ + +#include "ch_compile.h" +#include "ch_runtime.h" + +#endif /* CH_H_ */ diff --git a/chimera/ch_alloc.c b/chimera/ch_alloc.c new file mode 100644 index 000000000..047f12381 --- /dev/null +++ b/chimera/ch_alloc.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Runtime functions for setting custom allocators. + */ + +#include "ch.h" +#include "ch_common.h" +#include "ch_internal.h" +#include "hs.h" +#include "ue2common.h" + +#define default_malloc malloc +#define default_free free + +ch_alloc_t ch_database_alloc = default_malloc; +ch_alloc_t ch_misc_alloc = default_malloc; +ch_alloc_t ch_scratch_alloc = default_malloc; + +ch_free_t ch_database_free = default_free; +ch_free_t ch_misc_free = default_free; +ch_free_t ch_scratch_free = default_free; + +static +ch_alloc_t normalise_alloc(ch_alloc_t a) { + if (!a) { + return default_malloc; + } else { + return a; + } +} + +static +ch_free_t normalise_free(ch_free_t f) { + if (!f) { + return default_free; + } else { + return f; + } +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_set_allocator(ch_alloc_t allocfunc, + ch_free_t freefunc) { + ch_set_database_allocator(allocfunc, freefunc); + ch_set_misc_allocator(allocfunc, freefunc); + ch_set_scratch_allocator(allocfunc, freefunc); + + // Set core Hyperscan alloc/free. + hs_error_t ret = hs_set_allocator(allocfunc, freefunc); + + return ret; +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_set_database_allocator(ch_alloc_t allocfunc, + ch_free_t freefunc) { + ch_database_alloc = normalise_alloc(allocfunc); + ch_database_free = normalise_free(freefunc); + + // Set Hyperscan database alloc/free. + return hs_set_database_allocator(allocfunc, freefunc); +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_set_misc_allocator(ch_alloc_t allocfunc, + ch_free_t freefunc) { + ch_misc_alloc = normalise_alloc(allocfunc); + ch_misc_free = normalise_free(freefunc); + + // Set Hyperscan misc alloc/free. + return hs_set_misc_allocator(allocfunc, freefunc); +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t allocfunc, + ch_free_t freefunc) { + ch_scratch_alloc = normalise_alloc(allocfunc); + ch_scratch_free = normalise_free(freefunc); + + // Set Hyperscan scratch alloc/free. 
+ return hs_set_scratch_allocator(allocfunc, freefunc); +} diff --git a/chimera/ch_alloc.h b/chimera/ch_alloc.h new file mode 100644 index 000000000..243df00be --- /dev/null +++ b/chimera/ch_alloc.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef CH_ALLOC_H +#define CH_ALLOC_H + +#include "hs_common.h" +#include "ue2common.h" +#include "ch_common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif +extern hs_alloc_t ch_database_alloc; +extern hs_alloc_t ch_misc_alloc; +extern hs_alloc_t ch_scratch_alloc; + +extern hs_free_t ch_database_free; +extern hs_free_t ch_misc_free; +extern hs_free_t ch_scratch_free; +#ifdef __cplusplus +} /* extern C */ +#endif +/** \brief Check the results of an alloc done with hs_alloc for alignment. + * + * If we have incorrect alignment, return an error. Caller should free the + * offending block. */ +static really_inline +ch_error_t ch_check_alloc(const void *mem) { + ch_error_t ret = CH_SUCCESS; + if (!mem) { + ret = CH_NOMEM; + } else if (!ISALIGNED_N(mem, alignof(unsigned long long))) { + ret = CH_BAD_ALLOC; + } + return ret; +} + +#endif diff --git a/chimera/ch_common.h b/chimera/ch_common.h new file mode 100644 index 000000000..8caa44407 --- /dev/null +++ b/chimera/ch_common.h @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CH_COMMON_H_ +#define CH_COMMON_H_ + +#include "hs_common.h" + +#include + +/** + * @file + * @brief The Chimera common API definition. + * + * Chimera is a hybrid of Hyperscan and PCRE. + * + * This header contains functions available to both the Chimera compiler and + * runtime. + */ + +#ifdef __cplusplus +extern "C" +{ +#endif + +struct ch_database; + +/** + * A Chimera pattern database. + * + * Generated by one of the Chimera compiler functions: + * - @ref ch_compile() + * - @ref ch_compile_multi() + * - @ref ch_compile_ext_multi() + */ +typedef struct ch_database ch_database_t; + +/** + * A type for errors returned by Chimera functions. + */ +typedef int ch_error_t; + +/** + * Free a compiled pattern database. + * + * The free callback set by @ref ch_set_allocator()) will be used by this + * function. + * + * @param db + * A compiled pattern database. NULL may also be safely provided, in which + * case the function does nothing. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_free_database(ch_database_t *db); + +/** + * Utility function for identifying this release version. 
+ * + * @return + * A string containing the version number of this release build and the + * date of the build. It is allocated statically, so it does not need to + * be freed by the caller. + */ +const char * HS_CDECL ch_version(void); + +/** + * Returns the size of the given database. + * + * @param database + * Pointer to compiled expression database. + * + * @param database_size + * On success, the size of the compiled database in bytes is placed in this + * parameter. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_database_size(const ch_database_t *database, + size_t *database_size); + +/** + * Utility function providing information about a database. + * + * @param database + * Pointer to a compiled database. + * + * @param info + * On success, a string containing the version and platform information for + * the supplied database is placed in the parameter. The string is + * allocated using the allocator supplied in @ref hs_set_allocator() + * (or malloc() if no allocator was set) and should be freed by the caller. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_database_info(const ch_database_t *database, + char **info); + +/** + * The type of the callback function that will be used by Chimera to allocate + * more memory at runtime as required. + * + * If Chimera is to be used in a multi-threaded, or similarly concurrent + * environment, the allocation function will need to be re-entrant, or + * similarly safe for concurrent use. + * + * @param size + * The number of bytes to allocate. + * @return + * A pointer to the region of memory allocated, or NULL on error. + */ +typedef void *(HS_CDECL *ch_alloc_t)(size_t size); + +/** + * The type of the callback function that will be used by Chimera to free + * memory regions previously allocated using the @ref ch_alloc_t function. + * + * @param ptr + * The region of memory to be freed. 
+ */
+typedef void (HS_CDECL *ch_free_t)(void *ptr);
+
+/**
+ * Set the allocate and free functions used by Chimera for allocating
+ * memory at runtime for stream state, scratch space, database bytecode,
+ * and various other data structures returned by the Chimera API.
+ *
+ * The function is equivalent to calling @ref ch_set_scratch_allocator(),
+ * @ref ch_set_database_allocator() and
+ * @ref ch_set_misc_allocator() with the provided parameters.
+ *
+ * This call will override any previous allocators that have been set.
+ *
+ * Note: there is no way to change the allocator used for temporary objects
+ * created during the various compile calls (@ref ch_compile() and @ref
+ * ch_compile_multi()).
+ *
+ * @param alloc_func
+ *      A callback function pointer that allocates memory. This function must
+ *      return memory suitably aligned for the largest representable data type
+ *      on this platform.
+ *
+ * @param free_func
+ *      A callback function pointer that frees allocated memory.
+ *
+ * @return
+ *      @ref CH_SUCCESS on success, other values on failure.
+ */
+ch_error_t HS_CDECL ch_set_allocator(ch_alloc_t alloc_func,
+                                     ch_free_t free_func);
+
+/**
+ * Set the allocate and free functions used by Chimera for allocating memory
+ * for database bytecode produced by the compile calls (@ref ch_compile() and @ref
+ * ch_compile_multi()).
+ *
+ * If no database allocation functions are set, or if NULL is used in place of
+ * both parameters, then memory allocation will default to standard methods
+ * (such as the system malloc() and free() calls).
+ *
+ * This call will override any previous database allocators that have been set.
+ *
+ * Note: the database allocator may also be set by calling @ref
+ * ch_set_allocator().
+ *
+ * Note: there is no way to change how temporary objects created during the
+ * various compile calls (@ref ch_compile() and @ref ch_compile_multi()) are
+ * allocated.
+ *
+ * @param alloc_func
+ *      A callback function pointer that allocates memory.
This function must
+ *      return memory suitably aligned for the largest representable data type
+ *      on this platform.
+ *
+ * @param free_func
+ *      A callback function pointer that frees allocated memory.
+ *
+ * @return
+ *      @ref CH_SUCCESS on success, other values on failure.
+ */
+ch_error_t HS_CDECL ch_set_database_allocator(ch_alloc_t alloc_func,
+                                              ch_free_t free_func);
+
+/**
+ * Set the allocate and free functions used by Chimera for allocating memory
+ * for items returned by the Chimera API such as @ref ch_compile_error_t.
+ *
+ * If no misc allocation functions are set, or if NULL is used in place of both
+ * parameters, then memory allocation will default to standard methods (such as
+ * the system malloc() and free() calls).
+ *
+ * This call will override any previous misc allocators that have been set.
+ *
+ * Note: the misc allocator may also be set by calling @ref ch_set_allocator().
+ *
+ * @param alloc_func
+ *      A callback function pointer that allocates memory. This function must
+ *      return memory suitably aligned for the largest representable data type
+ *      on this platform.
+ *
+ * @param free_func
+ *      A callback function pointer that frees allocated memory.
+ *
+ * @return
+ *      @ref CH_SUCCESS on success, other values on failure.
+ */
+ch_error_t HS_CDECL ch_set_misc_allocator(ch_alloc_t alloc_func,
+                                          ch_free_t free_func);
+
+/**
+ * Set the allocate and free functions used by Chimera for allocating memory
+ * for scratch space by @ref ch_alloc_scratch() and @ref ch_clone_scratch().
+ *
+ * If no scratch allocation functions are set, or if NULL is used in place of
+ * both parameters, then memory allocation will default to standard methods
+ * (such as the system malloc() and free() calls).
+ *
+ * This call will override any previous scratch allocators that have been set.
+ *
+ * Note: the scratch allocator may also be set by calling @ref
+ * ch_set_allocator().
+ *
+ * @param alloc_func
+ *      A callback function pointer that allocates memory.
This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t alloc_func, + ch_free_t free_func); + +/** + * @defgroup CH_ERROR ch_error_t values + * + * @{ + */ + +/** + * The engine completed normally. + */ +#define CH_SUCCESS 0 + +/** + * A parameter passed to this function was invalid. + */ +#define CH_INVALID (-1) + +/** + * A memory allocation failed. + */ +#define CH_NOMEM (-2) + +/** + * The engine was terminated by callback. + * + * This return value indicates that the target buffer was partially scanned, + * but that the callback function requested that scanning cease after a match + * was located. + */ +#define CH_SCAN_TERMINATED (-3) + +/** + * The pattern compiler failed, and the @ref ch_compile_error_t should be + * inspected for more detail. + */ +#define CH_COMPILER_ERROR (-4) + +/** + * The given database was built for a different version of the Chimera matcher. + */ +#define CH_DB_VERSION_ERROR (-5) + +/** + * The given database was built for a different platform (i.e., CPU type). + */ +#define CH_DB_PLATFORM_ERROR (-6) + +/** + * The given database was built for a different mode of operation. This error + * is returned when streaming calls are used with a non-streaming database and + * vice versa. + */ +#define CH_DB_MODE_ERROR (-7) + +/** + * A parameter passed to this function was not correctly aligned. + */ +#define CH_BAD_ALIGN (-8) + +/** + * The memory allocator did not correctly return memory suitably aligned for + * the largest representable data type on this platform. + */ +#define CH_BAD_ALLOC (-9) + +/** + * The scratch region was already in use. 
+ * + * This error is returned when Chimera is able to detect that the scratch + * region given is already in use by another Chimera API call. + * + * A separate scratch region, allocated with @ref ch_alloc_scratch() or @ref + * ch_clone_scratch(), is required for every concurrent caller of the Chimera + * API. + * + * For example, this error might be returned when @ref ch_scan() has been + * called inside a callback delivered by a currently-executing @ref ch_scan() + * call using the same scratch region. + * + * Note: Not all concurrent uses of scratch regions may be detected. This error + * is intended as a best-effort debugging tool, not a guarantee. + */ +#define CH_SCRATCH_IN_USE (-10) + +/** + * Returned when pcre_exec (called for some expressions internally from @ref + * ch_scan) failed due to a fatal error. + */ +#define CH_FAIL_INTERNAL (-32) + +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CH_COMMON_H_ */ diff --git a/chimera/ch_compile.cpp b/chimera/ch_compile.cpp new file mode 100644 index 000000000..c71e26e0f --- /dev/null +++ b/chimera/ch_compile.cpp @@ -0,0 +1,878 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Compiler front-end, including public API calls for compilation. + */ + +#include "ch_compile.h" +#include "ch_alloc.h" +#include "ch_internal.h" +#include "ch_database.h" +#include "grey.h" +#include "hs_common.h" +#include "hs_internal.h" +#include "ue2common.h" +#include "util/compile_error.h" +#include "util/make_unique.h" +#include "util/multibit_build.h" +#include "util/target_info.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define PCRE_ERROR_MSG "Internal error building PCRE pattern." 
+ +using namespace std; +using namespace ue2; + +static const char failureNoMemory[] = "Unable to allocate memory."; +static const char failureInternal[] = "Internal error."; +static const char failureBadAlloc[] = "Allocator returned misaligned memory."; + +static const ch_compile_error_t ch_enomem + = { const_cast(failureNoMemory), 0 }; +static const ch_compile_error_t ch_einternal + = { const_cast(failureInternal), 0 }; +static const ch_compile_error_t ch_badalloc + = { const_cast(failureBadAlloc), 0 }; + +static +ch_compile_error_t *generateChimeraCompileError(const string &err, + int expression) { + ch_compile_error_t *ret = + (struct ch_compile_error *)ch_misc_alloc(sizeof(ch_compile_error_t)); + if (ret) { + ch_error_t e = ch_check_alloc(ret); + if (e != CH_SUCCESS) { + ch_misc_free(ret); + return const_cast(&ch_badalloc); + } + char *msg = (char *)ch_misc_alloc(err.size() + 1); + if (msg) { + e = ch_check_alloc(msg); + if (e != HS_SUCCESS) { + ch_misc_free(msg); + return const_cast(&ch_badalloc); + } + memcpy(msg, err.c_str(), err.size() + 1); + ret->message = msg; + } else { + ch_misc_free(ret); + ret = nullptr; + } + } + + if (!ret || !ret->message) { + return const_cast(&ch_enomem); + } + + ret->expression = expression; + + return ret; +} + +static +void freeChimeraCompileError(ch_compile_error_t *error) { + if (!error) { + return; + } + if (error == &ch_enomem || error == &ch_einternal || + error == &ch_badalloc) { + // These are not allocated. + return; + } + + ch_misc_free(error->message); + ch_misc_free(error); +} + +static +bool checkMode(unsigned int mode, ch_compile_error_t **comp_error) { + static const unsigned int supported = CH_MODE_GROUPS; + + if (mode & ~supported) { + *comp_error = + generateChimeraCompileError("Invalid mode flag supplied.", -1); + return false; + } + return true; +} + +/** \brief Throw a compile error if we're passed some unsupported flags. 
*/
+static
+void checkFlags(const unsigned int flags) {
+    // The only per-pattern flags accepted here; anything else is rejected.
+    static const unsigned int supported = HS_FLAG_DOTALL
+                                        | HS_FLAG_MULTILINE
+                                        | HS_FLAG_CASELESS
+                                        | HS_FLAG_SINGLEMATCH
+                                        | HS_FLAG_UCP
+                                        | HS_FLAG_UTF8;
+
+    if (flags & ~supported) {
+        throw CompileError("Unrecognized flag used.");
+    }
+}
+
+/**
+ * \brief Test whether Hyperscan can compile \p expression with \p flags.
+ *
+ * Performs a throw-away single-pattern hs_compile_multi() in block mode for
+ * \p platform. The resulting database, or the compile error on failure, is
+ * freed before returning.
+ *
+ * \return true if the trial compile succeeded, false otherwise.
+ */
+static
+bool isHyperscanSupported(const char *expression, unsigned int flags,
+                          const hs_platform_info *platform) {
+    hs_database_t *db = nullptr;
+    hs_compile_error *comp_error = nullptr;
+
+    unsigned int id = 0;
+    hs_error_t err = hs_compile_multi(&expression, &flags, &id,
+                                      1, HS_MODE_BLOCK, platform, &db,
+                                      &comp_error);
+    if (err != HS_SUCCESS) {
+        assert(!db);
+        assert(comp_error);
+        DEBUG_PRINTF("unsupported: %s\n", comp_error->message);
+        hs_free_compile_error(comp_error);
+
+        return false;
+    }
+
+    assert(db);
+    assert(!comp_error);
+    hs_free_database(db);
+    return true;
+}
+
+/**
+ * \brief Write a copy of the Hyperscan database \p db at address \p ptr.
+ *
+ * The database is serialized and then deserialized in place at \p ptr.
+ *
+ * \return true on success; false (after assert(0)) if either step fails.
+ */
+static
+bool writeHyperscanDatabase(char *ptr, hs_database_t *db) {
+    // Note: we must use our serialization calls to re-home the database.
+    char *serialized = nullptr;
+    size_t slen = 0;
+    hs_error_t err = hs_serialize_database(db, &serialized, &slen);
+    if (err != HS_SUCCESS) {
+        DEBUG_PRINTF("hs_serialize_database returned %d\n", err);
+        assert(0);
+        return false;
+    }
+
+    DEBUG_PRINTF("writing database to ptr %p\n", ptr);
+
+    // deserialize_at without the platform tests.
+    err = hs_deserialize_database_at(serialized, slen, (hs_database_t *)ptr);
+    // NOTE(review): the buffer is released with ch_misc_free on both paths
+    // below; presumably this matches the allocator hs_serialize_database
+    // used for it -- confirm.
+    if (err != HS_SUCCESS) {
+        DEBUG_PRINTF("hs_deserialize_database_at returned %d\n", err);
+        assert(0);
+        ch_misc_free(serialized);
+        return false;
+    }
+
+    ch_misc_free(serialized);
+    return true;
+}
+
+/**
+ * \brief Write \p hs_db into the Chimera bytecode \p db.
+ *
+ * Records the database offset as sizeof(*db) rounded up with ROUNDUP_CL and
+ * writes the Hyperscan database at that offset from the start of \p db.
+ */
+static
+bool writeHyperscanDatabase(ch_bytecode *db, hs_database_t *hs_db) {
+    db->databaseOffset = ROUNDUP_CL(sizeof(*db));
+    char *ptr = (char *)db + db->databaseOffset;
+    return writeHyperscanDatabase(ptr, hs_db);
+}
+
+/**
+ * \brief Translate Hyperscan pattern flags into pcre_compile() options.
+ *
+ * Only CASELESS, DOTALL, MULTILINE, UTF8 and UCP are mapped; every other bit
+ * in \p flags is ignored.
+ */
+static
+int convertFlagsToPcreOptions(unsigned int flags) {
+    int options = 0;
+    if (flags & HS_FLAG_CASELESS) {
+        options |= PCRE_CASELESS;
+    }
+    if (flags & HS_FLAG_DOTALL) {
+        options |= PCRE_DOTALL;
+    }
+    if (flags & HS_FLAG_MULTILINE) {
+        options |= PCRE_MULTILINE;
+    }
+    if (flags & HS_FLAG_UTF8) {
+        options |= PCRE_UTF8;
+    }
+    if (flags & HS_FLAG_UCP) {
+        options |= PCRE_UCP;
+    }
+
+    // All other flags are meaningless to PCRE.
+
+    return options;
+}
+
+namespace {
+
+/** \brief Data about a single pattern. */
+struct PatternData : boost::noncopyable {
+    PatternData(const char *pattern, u32 flags, u32 idx, u32 id_in,
+                unsigned mode, unsigned long int match_limit,
+                unsigned long int match_limit_recursion,
+                const hs_platform_info *platform);
+    ~PatternData() {
+        pcre_free(compiled);
+        // The study data is requested with PCRE_STUDY_JIT_COMPILE, so it may
+        // own JIT-compiled code; pcre_free_study() releases that as well as
+        // the pcre_extra block, whereas plain pcre_free() would leak it.
+        pcre_free_study(extra);
+    }
+
+    void buildPcre(const char *pattern, u32 flags);
+
+    size_t patternSize() const;
+
+    void writePattern(ch_pattern *pattern) const;
+
+    pcre *compiled; //!< pcre_compile output
+    pcre_extra *extra; //!< pcre_study output
+    size_t compiled_size;
+    int study_size;
+    int capture_cnt;
+    bool utf8;
+    u32 id; //!< ID from the user
+    u32 expr_index; //!< index in the expression array
+    bool singlematch; //!< pattern is in highlander mode
+    bool guard; //!< this pattern should be guarded by the multimatcher
+    u32 minWidth; //!< min match width
+    u32 maxWidth; //!< max match width
+    u32 fixedWidth; //!< fixed pattern width
+    unsigned long int matchLimit; //!
pcre match limit + unsigned long int matchLimitRecursion; //! pcre match_limit_recursion +}; + +PatternData::PatternData(const char *pattern, u32 flags, u32 idx, u32 id_in, + unsigned mode, unsigned long int match_limit, + unsigned long int match_limit_recursion, + const hs_platform_info *platform) + : compiled(nullptr), extra(nullptr), id(id_in), expr_index(idx), + singlematch(flags & HS_FLAG_SINGLEMATCH), + guard(false), minWidth(0), maxWidth(UINT_MAX), + fixedWidth(UINT_MAX), matchLimit(match_limit), + matchLimitRecursion(match_limit_recursion) { + assert(pattern); + + flags |= HS_FLAG_ALLOWEMPTY; /* don't hand things off to pcre for no + reason */ + + buildPcre(pattern, flags); + + // Fetch the expression info for a prefiltering, non-singlematch version of + // this pattern, if possible. + hs_expr_info *info = nullptr; + hs_compile_error_t *error = nullptr; + u32 infoflags = (flags | HS_FLAG_PREFILTER) & ~HS_FLAG_SINGLEMATCH; + u32 rawflags = (flags | HS_FLAG_SOM_LEFTMOST) & ~HS_FLAG_SINGLEMATCH; + hs_error_t err = hs_expression_info(pattern, infoflags, &info, &error); + if (err == HS_SUCCESS) { + assert(info); + hs_expr_info *i = (hs_expr_info *)info; + minWidth = i->min_width; + maxWidth = i->max_width; + bool ordered = i->unordered_matches ? false : true; + + // Only enable capturing if required + u32 captureCnt = 0; + if (mode & CH_MODE_GROUPS) { + captureCnt = capture_cnt; + } + + // No need to confirm with PCRE if: + // 1) pattern is fixed width + // 2) pattern isn't vacuous as it can't combine with start of match + // 3) no capturing in this pattern + // 4) no offset adjust in this pattern as hyperscan match callback + // will arrive without order, i.e. 
[^a]\z has offset adjust + // 5) hyperscan compile succeeds without prefiltering + if (minWidth == maxWidth && minWidth && maxWidth != UINT_MAX && + !captureCnt && ordered && + isHyperscanSupported(pattern, rawflags, platform)) { + fixedWidth = maxWidth; + } + + DEBUG_PRINTF("gathered info: widths=[%u,%u]\n", minWidth, maxWidth); + + ch_misc_free(info); + + u32 guardflags; + guardflags = (flags | HS_FLAG_PREFILTER) & ~HS_FLAG_SINGLEMATCH; + guard = isHyperscanSupported(pattern, guardflags, platform); + } else { + // We can't even prefilter this pattern, so we're dependent on Big Dumb + // Pcre Scans. + DEBUG_PRINTF("hs_expression_info failed, falling back to pcre\n"); + hs_free_compile_error(error); + } +} + +void PatternData::buildPcre(const char *pattern, u32 flags) { + int options = convertFlagsToPcreOptions(flags); + const char *errptr = nullptr; + int erroffset = 0; + + compiled = pcre_compile(pattern, options, &errptr, &erroffset, nullptr); + if (!compiled) { + DEBUG_PRINTF("PCRE failed to compile: %s\n", pattern); + string err("PCRE compilation failed: "); + err += string(errptr); + err += "."; + throw CompileError(expr_index, err); + } + + extra = pcre_study(compiled, PCRE_STUDY_JIT_COMPILE, &errptr); + // Note that it's OK for pcre_study to return NULL if there's nothing + // to be found, but a non-NULL error is always bad. 
+ if (errptr) { + DEBUG_PRINTF("PCRE could not be studied: %s\n", errptr); + string err("PCRE compilation failed: "); + err += string(errptr); + err += "."; + throw CompileError(expr_index, err); + } + + if (pcre_fullinfo(compiled, extra, PCRE_INFO_SIZE, &compiled_size)) { + throw CompileError(PCRE_ERROR_MSG); + } + + if (!extra) { + study_size = 0; + } else { + if (pcre_fullinfo(compiled, extra, PCRE_INFO_STUDYSIZE, &study_size)) { + throw CompileError(PCRE_ERROR_MSG); + } + } + + if (pcre_fullinfo(compiled, extra, PCRE_INFO_CAPTURECOUNT, &capture_cnt)) { + throw CompileError(PCRE_ERROR_MSG); + } + + /* We use the pcre rather than hs to get this information as we may need it + * even in the pure unguarded pcre mode where there is no hs available. We + * can not use the compile flags due to (*UTF8) verb */ + unsigned long int opts = 0; // PCRE_INFO_OPTIONS demands an unsigned long + if (pcre_fullinfo(compiled, extra, PCRE_INFO_OPTIONS, &opts)) { + throw CompileError(PCRE_ERROR_MSG); + } + utf8 = opts & PCRE_UTF8; +} + +size_t PatternData::patternSize() const { + size_t len = 0; + + // ch_pattern header. + len += sizeof(ch_pattern); + + len = ROUNDUP_N(len, 8); + DEBUG_PRINTF("compiled pcre at %zu\n", len); + len += compiled_size; + + // PCRE study data, which may be zero. + if (study_size) { + len = ROUNDUP_N(len, 8); + DEBUG_PRINTF("study at %zu\n", len); + len += (size_t)study_size; + } + + DEBUG_PRINTF("pattern size %zu\n", len); + return len; +} + +/** \brief Write out an ch_pattern structure, which should already be sized + * correctly according to PatternData::patternSize. */ +void PatternData::writePattern(ch_pattern *pattern) const { + assert(pattern); + assert(ISALIGNED_CL(pattern)); + + pattern->id = id; + + u32 flags = 0; + if (singlematch) { + flags |= CHIMERA_PATTERN_FLAG_SINGLEMATCH; + } + if (utf8) { + flags |= CHIMERA_PATTERN_FLAG_UTF8; + } + + pattern->flags = flags; + pattern->maxWidth = maxWidth; + pattern->minWidth = minWidth == UINT_MAX ? 
0 : minWidth; + pattern->fixedWidth = fixedWidth; + + // Compiled PCRE pattern. + char *ptr = (char *)pattern; + ptr += ROUNDUP_N(sizeof(*pattern), 8); + DEBUG_PRINTF("compiled pcre at %zu\n", (size_t)(ptr - (char *)pattern)); + memcpy(ptr, compiled, compiled_size); + ptr += compiled_size; + + // PCRE match limits + pattern->extra.flags = PCRE_EXTRA_MATCH_LIMIT | + PCRE_EXTRA_MATCH_LIMIT_RECURSION; + pattern->extra.match_limit = matchLimit ? matchLimit : 10000000; + // Set to avoid segment fault + pattern->extra.match_limit_recursion = + matchLimitRecursion ? matchLimitRecursion : 1500; + + // PCRE study_data. + u32 studyOffset = 0; + if (extra) { + assert(extra->study_data); + ptr = ROUNDUP_PTR(ptr, 8); + DEBUG_PRINTF("study at %zu\n", (size_t)(ptr - (char *)pattern)); + memcpy(ptr, extra->study_data, study_size); + studyOffset = (size_t)(ptr - (char *)pattern); + + pattern->extra.flags |= PCRE_EXTRA_STUDY_DATA; + pattern->extra.study_data = ptr; + + ptr += study_size; + } else { + pattern->extra.flags &= ~PCRE_EXTRA_STUDY_DATA; + } + pattern->studyOffset = studyOffset; + + size_t pcreLen = (ptr - (char *)pattern); + assert(pcreLen <= patternSize()); + pattern->length = (u32)pcreLen; + + // We shouldn't overrun the space we've allocated for this pattern. + assert(patternSize() >= (size_t)(ptr - (char *)pattern)); +} + +} // namespace + +namespace ch { + +static +void ch_compile_multi_int(const char *const *expressions, const unsigned *flags, + const unsigned *ids, unsigned elements, + unsigned mode, unsigned long int match_limit, + unsigned long int match_limit_recursion, + const hs_platform_info_t *platform, + ch_database_t **out) { + vector> pcres; + pcres.reserve(elements); + vector unguarded; // indices of unguarded PCREs. + vector multiExpr; + vector multiFlags; + vector multiIds; + bool allConfirm = true; + bool allSingleMatch = true; + for (unsigned int i = 0; i < elements; i++) { + const char *myExpr = expressions[i]; + unsigned int myFlags = flags ? 
flags[i] : 0; + unsigned int myId = ids ? ids[i] : 0; + + checkFlags(myFlags); + + // First, build with libpcre. A build failure from libpcre will throw + // an exception up to the caller. + auto patternData = + ue2::make_unique(myExpr, myFlags, i, myId, mode, match_limit, + match_limit_recursion, platform); + pcres.push_back(move(patternData)); + PatternData &curr = *pcres.back(); + + if (!(myFlags & HS_FLAG_SINGLEMATCH)) { + allSingleMatch = false; + } + + // in the multimatch, we always run in prefilter mode and accept vacuous + // patterns. + myFlags |= + HS_FLAG_ALLOWEMPTY | HS_FLAG_PREFILTER; + + if (curr.fixedWidth != UINT_MAX) { + myFlags |= HS_FLAG_SOM_LEFTMOST; + DEBUG_PRINTF("fixed width, turn off prefiltering\n"); + myFlags &= ~HS_FLAG_PREFILTER; + allConfirm = false; + + // Single match can't coexist with SOM. + myFlags &= ~HS_FLAG_SINGLEMATCH; + } + + if (curr.guard) { + // We use the index into the PCREs array as the Hyperscan idx. + multiExpr.push_back(myExpr); + multiFlags.push_back(myFlags); + multiIds.push_back(i); + } else { + // No Hyperscan support, PCRE is unguarded. + unguarded.push_back(i); + } + } + + DEBUG_PRINTF("built %zu PCREs, %zu of which are unguarded\n", + pcres.size(), unguarded.size()); + + // Work out our sizing for the output database. 
+ size_t patternSize = 0; + for (unsigned int i = 0; i < elements; i++) { + size_t len = pcres[i]->patternSize(); + patternSize += ROUNDUP_CL(len); + } + DEBUG_PRINTF("pcre bytecode takes %zu bytes\n", patternSize); + + bool noMulti = multiExpr.empty(); + size_t multiSize = 0; + hs_database *multidb = nullptr; + if (!noMulti) { + hs_compile_error_t *hs_comp_error = nullptr; + hs_error_t err = hs_compile_multi(&multiExpr[0], &multiFlags[0], + &multiIds[0], multiExpr.size(), + HS_MODE_BLOCK, platform, &multidb, + &hs_comp_error); + + if (err != HS_SUCCESS) { + assert(hs_comp_error); + DEBUG_PRINTF("hs_compile_multi returned error: %s\n", + hs_comp_error->message); + assert(0); + hs_free_compile_error(hs_comp_error); + throw CompileError("Internal error."); + } + + assert(multidb); + err = hs_database_size(multidb, &multiSize); + if (err != HS_SUCCESS) { + assert(0); + throw CompileError("Internal error."); + } + DEBUG_PRINTF("built hyperscan database with len %zu bytes\n", multiSize); + } + + size_t bytecodeLen = sizeof(ch_bytecode) + + multiSize + alignof(u32) + + (sizeof(u32) * unguarded.size()) + + (sizeof(u32) * elements) + + patternSize + + 128; // padding for alignment + size_t totalSize = sizeof(ch_database) + bytecodeLen; + + DEBUG_PRINTF("allocating %zu bytes for database\n", totalSize); + char *ptr = (char *)ch_database_alloc(totalSize); + if (ch_check_alloc(ptr) != CH_SUCCESS) { + ch_database_free(ptr); + throw std::bad_alloc(); + } + + memset(ptr, 0, totalSize); + + // First, the header. + ch_database *hydb = (ch_database *)ptr; + hydb->magic = CH_DB_MAGIC; + hydb->version = HS_VERSION_32BIT; + hydb->length = bytecodeLen; + + // Then, the bytecode. 
+ size_t shift = (size_t)hydb->bytes & 0x3f; + hydb->bytecode = offsetof(struct ch_database, bytes) - shift; + ch_bytecode *db = (ch_bytecode *)((char *)hydb + hydb->bytecode); + db->patternCount = elements; + db->activeSize = mmbit_size(elements); + db->flags = 0; + db->length = bytecodeLen; + + if (noMulti) { + db->flags |= CHIMERA_FLAG_NO_MULTIMATCH; + } + if (mode & CH_MODE_GROUPS) { + db->flags |= CHIMERA_FLAG_GROUPS; + } + if (allConfirm) { + db->flags |= CHIMERA_FLAG_ALL_CONFIRM; + } + if (allSingleMatch) { + db->flags |= CHIMERA_FLAG_ALL_SINGLE; + } + + + // Find and set the max ovector size by looking at the capture count for + // each pcre. + u32 maxCaptureGroups = 0; + for (unsigned int i = 0; i < elements; i++) { + maxCaptureGroups = max(maxCaptureGroups, (u32)pcres[i]->capture_cnt); + } + db->maxCaptureGroups = maxCaptureGroups; + DEBUG_PRINTF("max capture groups is %u\n", maxCaptureGroups); + + if (!noMulti) { + DEBUG_PRINTF("write hyperscan database\n"); + // Write Hyperscan database directly after the header struct, then free it. + if (!writeHyperscanDatabase(db, multidb)) { + ch_database_free(hydb); + hs_free_database(multidb); + throw CompileError("Internal error."); + } + hs_free_database(multidb); + } else { + db->databaseOffset = ROUNDUP_CL(sizeof(*db)); + } + + // Then, write our unguarded PCRE list. + db->unguardedCount = unguarded.size(); + db->unguardedOffset = ROUNDUP_N(db->databaseOffset + multiSize, 4); + ptr = (char *)db + db->unguardedOffset; + copy(unguarded.begin(), unguarded.end(), (u32 *)ptr); + + // Then, write all our compiled PCRE patterns and the lookup table for + // them. 
+ db->patternOffset = db->unguardedOffset + unguarded.size() * sizeof(u32); + u32 *patternOffset = (u32 *)((char *)db + db->patternOffset); + u32 offset = ROUNDUP_CL(db->patternOffset + elements * sizeof(u32)); + for (unsigned int i = 0; i < elements; i++) { + *patternOffset = offset; + size_t len = pcres[i]->patternSize(); + ptr = (char *)db + offset; + struct ch_pattern *pattern = (struct ch_pattern *)ptr; + pcres[i]->writePattern(pattern); + DEBUG_PRINTF("wrote pcre %u into offset %u, len %zu\n", i, offset, len); + offset += ROUNDUP_CL(len); + patternOffset++; + } + + assert(offset <= totalSize); + assert(hydb->magic == CH_DB_MAGIC); + DEBUG_PRINTF("built hybrid database, size %zu bytes\n", totalSize); + DEBUG_PRINTF("offset=%u\n", offset); + *out = hydb; +} + +} // namespace ch + +extern "C" HS_PUBLIC_API +ch_error_t HS_CDECL ch_compile(const char *expression, unsigned flags, + unsigned mode, + const hs_platform_info_t *platform, + ch_database_t **db, + ch_compile_error_t **comp_error) { + if (!comp_error) { + if (db) { + db = nullptr; + } + // nowhere to write the string, but we can still report an error code + return CH_COMPILER_ERROR; + } + if (!db) { + *comp_error = + generateChimeraCompileError("Invalid parameter: db is NULL", -1); + return CH_COMPILER_ERROR; + } + if (!expression) { + *db = nullptr; + *comp_error = + generateChimeraCompileError("Invalid parameter: expressions is\ + NULL", -1); + return CH_COMPILER_ERROR; + } + + if (!checkMode(mode, comp_error)) { + *db = nullptr; + assert(*comp_error); // set by checkMode + return CH_COMPILER_ERROR; + } + + try { + unsigned id = 0; // single expressions get zero as an ID + // Internal function to do all the work, now that we've handled all the + // argument checking. + ch::ch_compile_multi_int(&expression, &flags, &id, 1, mode, 0, 0, + platform, db); + } + catch (const CompileError &e) { + // Compiler error occurred + *db = nullptr; + *comp_error = generateChimeraCompileError(e.reason, e.hasIndex ? 
+ (int)e.index : -1); + return CH_COMPILER_ERROR; + } + catch (std::bad_alloc) { + *db = nullptr; + *comp_error = const_cast(&ch_enomem); + return CH_COMPILER_ERROR; + } + catch (...) { + assert(!"Internal error, unexpected exception"); + *db = nullptr; + *comp_error = const_cast(&ch_einternal); + return CH_COMPILER_ERROR; + } + + DEBUG_PRINTF("success!\n"); + return CH_SUCCESS; +} + +extern "C" HS_PUBLIC_API +ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions, + const unsigned *flags, const unsigned *ids, + unsigned elements, unsigned mode, + const hs_platform_info_t *platform, + ch_database_t **db, + ch_compile_error_t **comp_error) { + if (!comp_error) { + if (db) { + db = nullptr; + } + // nowhere to write the string, but we can still report an error code + return CH_COMPILER_ERROR; + } + if (!db) { + *comp_error = + generateChimeraCompileError("Invalid parameter: db is NULL", -1); + return CH_COMPILER_ERROR; + } + if (!expressions) { + *db = nullptr; + *comp_error = + generateChimeraCompileError("Invalid parameter: expressions is\ + NULL", -1); + return CH_COMPILER_ERROR; + } + if (!elements) { + *db = nullptr; + *comp_error = generateChimeraCompileError("Invalid parameter:\ + elements is zero", -1); + return CH_COMPILER_ERROR; + } + + if (!checkMode(mode, comp_error)) { + *db = nullptr; + assert(*comp_error); // set by checkMode + return CH_COMPILER_ERROR; + } + + try { + // Internal function to do all the work, now that we've handled all the + // argument checking. + ch::ch_compile_multi_int(expressions, flags, ids, elements, mode, 0, 0, + platform, db); + } + catch (const CompileError &e) { + // Compiler error occurred + *db = nullptr; + *comp_error = generateChimeraCompileError(e.reason, e.hasIndex ? + (int)e.index : -1); + return CH_COMPILER_ERROR; + } + catch (std::bad_alloc) { + *db = nullptr; + *comp_error = const_cast(&ch_enomem); + return CH_COMPILER_ERROR; + } + catch (...) 
{ + assert(!"Internal error, unexpected exception"); + *db = nullptr; + *comp_error = const_cast(&ch_einternal); + return CH_COMPILER_ERROR; + } + + DEBUG_PRINTF("success!\n"); + return CH_SUCCESS; +} + +extern "C" HS_PUBLIC_API +ch_error_t HS_CDECL ch_compile_ext_multi( + const char *const *expressions, + const unsigned *flags, + const unsigned *ids, + unsigned elements, unsigned mode, + unsigned long int match_limit, + unsigned long int match_limit_recursion, + const hs_platform_info_t *platform, + ch_database_t **db, + ch_compile_error_t **comp_error) { + if (!comp_error) { + if (db) { + db = nullptr; + } + // nowhere to write the string, but we can still report an error code + return CH_COMPILER_ERROR; + } + if (!db) { + *comp_error = + generateChimeraCompileError("Invalid parameter: db is NULL", -1); + return CH_COMPILER_ERROR; + } + if (!expressions) { + *db = nullptr; + *comp_error = + generateChimeraCompileError("Invalid parameter: expressions is\ + NULL", -1); + return CH_COMPILER_ERROR; + } + if (!elements) { + *db = nullptr; + *comp_error = generateChimeraCompileError("Invalid parameter:\ + elements is zero", -1); + return CH_COMPILER_ERROR; + } + + if (!checkMode(mode, comp_error)) { + *db = nullptr; + assert(*comp_error); // set by checkMode + return CH_COMPILER_ERROR; + } + + try { + // Internal function to do all the work, now that we've handled all the + // argument checking. + ch::ch_compile_multi_int(expressions, flags, ids, elements, mode, + match_limit, match_limit_recursion, platform, + db); + } + catch (const CompileError &e) { + // Compiler error occurred + *db = nullptr; + *comp_error = generateChimeraCompileError(e.reason, e.hasIndex ? + (int)e.index : -1); + return CH_COMPILER_ERROR; + } + catch (std::bad_alloc) { + *db = nullptr; + *comp_error = const_cast(&ch_enomem); + return CH_COMPILER_ERROR; + } + catch (...) 
{ + assert(!"Internal error, unexpected exception"); + *db = nullptr; + *comp_error = const_cast(&ch_einternal); + return CH_COMPILER_ERROR; + } + + DEBUG_PRINTF("success!\n"); + return CH_SUCCESS; +} + +extern "C" HS_PUBLIC_API +ch_error_t HS_CDECL ch_free_compile_error(ch_compile_error_t *error) { + freeChimeraCompileError(error); + return CH_SUCCESS; +} diff --git a/chimera/ch_compile.h b/chimera/ch_compile.h new file mode 100644 index 000000000..03c750eba --- /dev/null +++ b/chimera/ch_compile.h @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CH_COMPILE_H_ +#define CH_COMPILE_H_ + +/** + * @file + * @brief The Chimera compiler API definition. + * + * Chimera is a hybrid solution of Hyperscan and PCRE. + * + * This header contains functions for compiling regular expressions into + * Chimera databases that can be used by the Chimera runtime. + */ + +#include "ch_common.h" +#include "hs_compile.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * A type containing error details that is returned by the compile calls (@ref + * ch_compile() and @ref ch_compile_multi() on failure. The caller may inspect + * the values returned in this type to determine the cause of failure. + */ +typedef struct ch_compile_error { + /** + * A human-readable error message describing the error. + */ + char *message; + + /** + * The zero-based number of the expression that caused the error (if this + * can be determined). If the error is not specific to an expression, then + * this value will be less than zero. + */ + int expression; +} ch_compile_error_t; + +/** + * The basic regular expression compiler. + * + * This is the function call with which an expression is compiled into a + * Chimera database which can be passed to the runtime function ( + * @ref ch_scan()) + * + * @param expression + * The NULL-terminated expression to parse. 
Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @a flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @a expression, and @ref CH_FLAG_CASELESS as the @a + * flags. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Valid values are: + * - CH_FLAG_CASELESS - Matching will be performed case-insensitively. + * - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - CH_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. + * - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - CH_FLAG_UCP - Use Unicode properties for character classes. + * + * @param mode + * Compiler mode flag that affect the database as a whole for capturing + * groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied. + * See @ref CH_MODE_FLAG for more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref ch_free_database() function. + * + * @param compile_error + * If the compile fails, a pointer to a @ref ch_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * ch_free_compile_error() function. 
+ * + * @return + * @ref CH_SUCCESS is returned on successful compilation; @ref + * CH_COMPILER_ERROR on failure, with details provided in the error + * parameter. + */ +ch_error_t HS_CDECL ch_compile(const char *expression, unsigned int flags, + unsigned int mode, + const hs_platform_info_t *platform, + ch_database_t **db, + ch_compile_error_t **compile_error); + +/** + * The multiple regular expression compiler. + * + * This is the function call with which a set of expressions is compiled into a + * database which can be passed to the runtime function (@ref ch_scan()). + * Each expression can be labelled with a unique integer which is passed into + * the match callback to identify the pattern that has matched. + * + * @param expressions + * Array of NULL-terminated expressions to compile. Note that (as for @ref + * ch_compile()) these strings must contain only the pattern to be + * matched, with no delimiters or flags. For example, the expression + * `/abc?def/i` should be compiled by providing `abc?def` as the first + * string in the @a expressions array, and @ref CH_FLAG_CASELESS as the + * first value in the @a flags array. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Valid values are: + * - CH_FLAG_CASELESS - Matching will be performed case-insensitively. + * - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns + * with this match id per stream. + * - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - CH_FLAG_UCP - Use Unicode properties for character classes. 
+ * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flag that affect the database as a whole for capturing + * groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied. + * See @ref CH_MODE_FLAG for more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref ch_free_database() function. + * + * @param compile_error + * If the compile fails, a pointer to a @ref ch_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * ch_free_compile_error() function. + * + * @return + * @ref CH_SUCCESS is returned on successful compilation; @ref + * CH_COMPILER_ERROR on failure, with details provided in the @a error + * parameter. + * + */ +ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions, + const unsigned int *flags, + const unsigned int *ids, + unsigned int elements, unsigned int mode, + const hs_platform_info_t *platform, + ch_database_t **db, + ch_compile_error_t **compile_error); + +/** + * The multiple regular expression compiler with extended match limits support. 
+ * + * This is the function call with which a set of expressions is compiled into a + * database in the same way as @ref ch_compile_multi(), but allows additional + * parameters to be specified via match_limit and match_limit_recursion to + * define match limits for PCRE runtime. + * + * @param expressions + * Array of NULL-terminated expressions to compile. Note that (as for @ref + * ch_compile()) these strings must contain only the pattern to be + * matched, with no delimiters or flags. For example, the expression + * `/abc?def/i` should be compiled by providing `abc?def` as the first + * string in the @a expressions array, and @ref CH_FLAG_CASELESS as the + * first value in the @a flags array. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Valid values are: + * - CH_FLAG_CASELESS - Matching will be performed case-insensitively. + * - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns + * with this match id per stream. + * - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - CH_FLAG_UCP - Use Unicode properties for character classes. + * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flag that affect the database as a whole for capturing + * groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied. + * See @ref CH_MODE_FLAG for more details. 
+ * + * @param match_limit + * A limit from pcre_extra on the amount of match function called in PCRE + * to limit backtracking that can take place. + * + * @param match_limit_recursion + * A limit from pcre_extra on the recursion depth of match function + * in PCRE. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref ch_free_database() function. + * + * @param compile_error + * If the compile fails, a pointer to a @ref ch_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * ch_free_compile_error() function. + * + * @return + * @ref CH_SUCCESS is returned on successful compilation; @ref + * CH_COMPILER_ERROR on failure, with details provided in the @a error + * parameter. + * + */ +ch_error_t HS_CDECL ch_compile_ext_multi(const char *const *expressions, + const unsigned int *flags, + const unsigned int *ids, + unsigned int elements, + unsigned int mode, + unsigned long int match_limit, + unsigned long int match_limit_recursion, + const hs_platform_info_t *platform, + ch_database_t **db, + ch_compile_error_t **compile_error); + +/** + * Free an error structure generated by @ref ch_compile(), @ref + * ch_compile_multi(). + * + * @param error + * The @ref ch_compile_error_t to be freed. NULL may also be safely + * provided. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_free_compile_error(ch_compile_error_t *error); + +/** + * @defgroup CH_PATTERN_FLAG Pattern flags + * + * @{ + */ + +/** + * Compile flag: Set case-insensitive matching. 
+ * + * This flag sets the expression to be matched case-insensitively by default. + * The expression may still use PCRE tokens (notably `(?i)` and + * `(?-i)`) to switch case-insensitive matching on and off. + */ +#define CH_FLAG_CASELESS 1 + +/** + * Compile flag: Matching a `.` will not exclude newlines. + * + * This flag sets any instances of the `.` token to match newline characters as + * well as all other characters. The PCRE specification states that the `.` + * token does not match newline characters by default, so without this flag the + * `.` token will not cross line boundaries. + */ +#define CH_FLAG_DOTALL 2 + +/** + * Compile flag: Set multi-line anchoring. + * + * This flag instructs the expression to make the `^` and `$` tokens match + * newline characters as well as the start and end of the stream. If this flag + * is not specified, the `^` token will only ever match at the start of a + * stream, and the `$` token will only ever match at the end of a stream within + * the guidelines of the PCRE specification. + */ +#define CH_FLAG_MULTILINE 4 + +/** + * Compile flag: Set single-match only mode. + * + * This flag sets the expression's match ID to match at most once, only the + * first match for each invocation of @ref ch_scan() will be returned. + * + */ +#define CH_FLAG_SINGLEMATCH 8 + +/** + * Compile flag: Enable UTF-8 mode for this expression. + * + * This flag instructs Chimera to treat the pattern as a sequence of UTF-8 + * characters. The results of scanning invalid UTF-8 sequences with a Chimera + * library that has been compiled with one or more patterns using this flag are + * undefined. + */ +#define CH_FLAG_UTF8 32 + +/** + * Compile flag: Enable Unicode property support for this expression. + * + * This flag instructs Chimera to use Unicode properties, rather than the + * default ASCII interpretations, for character mnemonics like `\w` and `\s` as + * well as the POSIX character classes. 
It is only meaningful in conjunction + * with @ref CH_FLAG_UTF8. + */ +#define CH_FLAG_UCP 64 + +/** @} */ + +/** + * @defgroup CH_MODE_FLAG Compile mode flags + * + * The mode flags are used as values for the mode parameter of the various + * compile calls (@ref ch_compile(), @ref ch_compile_multi()). + * + * By default, the matcher will only supply the start and end offsets of the + * match when the match callback is called. Using mode flag @ref CH_MODE_GROUPS + * will also fill the `captured` array with the start and end offsets of all + * the capturing groups specified by the pattern that has matched. + * + * @{ + */ + +/** + * Compiler mode flag: Disable capturing groups. + */ +#define CH_MODE_NOGROUPS 0 + +/** + * Compiler mode flag: Enable capturing groups. + */ +#define CH_MODE_GROUPS 1048576 + +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CH_COMPILE_H_ */ diff --git a/chimera/ch_database.c b/chimera/ch_database.c new file mode 100644 index 000000000..387d076eb --- /dev/null +++ b/chimera/ch_database.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Chimera: database construction, etc. + */ + +#include +#include +#include +#include + +#include "allocator.h" +#include "database.h" +#include "hs.h" +#include "ch.h" +#include "hs_internal.h" +#include "ch_common.h" +#include "ch_alloc.h" +#include "ch_database.h" +#include "ch_internal.h" + +static really_inline +int db_correctly_aligned(const void *db) { + return ISALIGNED_N(db, alignof(unsigned long long)); +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_free_database(ch_database_t *hydb) { + if (hydb && hydb->magic != CH_DB_MAGIC) { + return CH_INVALID; + } + ch_database_free(hydb); + + return CH_SUCCESS; +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_database_size(const ch_database_t *hydb, size_t *size) { + if (!size) { + return CH_INVALID; + } + + ch_error_t ret = hydbIsValid(hydb); + if (unlikely(ret != CH_SUCCESS)) { + return ret; + } + + *size = sizeof(struct ch_database) + hydb->length; + return CH_SUCCESS; +} + +/** \brief Identifier prepended to database info. 
*/ +static const char CHIMERA_IDENT[] = "Chimera "; + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_database_info(const ch_database_t *hydb, char **info) { + if (!info) { + return CH_INVALID; + } + *info = NULL; + + if (!hydb || !db_correctly_aligned(hydb) || hydb->magic != CH_DB_MAGIC) { + return HS_INVALID; + } + + const struct ch_bytecode *bytecode = ch_get_bytecode(hydb); + char noMulti = (bytecode->flags & CHIMERA_FLAG_NO_MULTIMATCH); + if (noMulti) { + size_t len = strlen(CHIMERA_IDENT); + *info = ch_misc_alloc(len + 1); + if (!(*info)) { + return CH_INVALID; + } + memcpy((*info), CHIMERA_IDENT, len); + (*info)[len] = '\0'; + return CH_SUCCESS; + } + + char *hsinfo = NULL; + hs_error_t ret = hs_database_info(getHyperscanDatabase(bytecode), &hsinfo); + if (ret != HS_SUCCESS) { + assert(!hsinfo); + return ret; + } + + size_t hybridlen = strlen(CHIMERA_IDENT); + size_t hslen = strlen(hsinfo); + *info = ch_misc_alloc(hybridlen + hslen + 1); + if (!(*info)) { + ch_misc_free(hsinfo); + return CH_INVALID; + } + + memcpy((*info), CHIMERA_IDENT, hybridlen); + memcpy((*info) + hybridlen, hsinfo, hslen); + (*info)[hybridlen + hslen] = '\0'; + ch_misc_free(hsinfo); + + return CH_SUCCESS; +} diff --git a/chimera/ch_database.h b/chimera/ch_database.h new file mode 100644 index 000000000..28bde86ea --- /dev/null +++ b/chimera/ch_database.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime code for ch_database manipulation. + */ + +#ifndef CH_DATABASE_H_ +#define CH_DATABASE_H_ + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define PCRE_STATIC +#include + +#include "ch_compile.h" // for CH_MODE_ flags +#include "ue2common.h" +#include "hs_version.h" +#include "hs.h" + +#define CH_DB_MAGIC 0xdedededeU //!< Magic number stored in \ref ch_database + +/** \brief Main Chimera database header. */ +struct ch_database { + u32 magic; //!< must be \ref CH_DB_MAGIC + u32 version; //!< release version + u32 length; //!< total allocated length in bytes + u32 reserved0; //!< unused + u32 reserved1; //!< unused + u32 bytecode; //!< offset relative to db start + u32 padding[16]; //!< padding for alignment of rest of bytecode + char bytes[]; +}; + +/** \brief Chimera bytecode header, which follows the \ref ch_database and is + * always 64-byte aligned. 
*/ +struct ch_bytecode { + u32 length; //!< length of bytecode including this header struct + u32 flags; //!< whole-database flags (CHIMERA_FLAG_NO_MULTIMATCH, + // CHIMERA_FLAG_GROUPS) + u32 patternCount; //!< total number of patterns + u32 activeSize; //!< size of mmbit to store active pattern ids + u32 databaseOffset; //!< offset for database following \ref ch_bytecode + // header + u32 patternOffset; //!< points to an array of u32 offsets, each pointing to + // a \ref ch_pattern + u32 unguardedOffset; //!< pointer to a list of unguarded pattern indices + u32 unguardedCount; //!< number of unguarded patterns + u32 maxCaptureGroups; //!< max number of capture groups used by any pattern +}; + +/** \brief Per-pattern header. + * + * struct is followed in bytecode by: + * 1. pcre bytecode (always present) + * 2. pcre study data (sometimes) + */ +struct ch_pattern { + u32 id; //!< pattern ID to report to the user + u32 flags; //!< per-pattern flags (e.g. \ref CHIMERA_PATTERN_FLAG_UTF8) + u32 maxWidth; //!< maximum width of a match, or UINT_MAX for inf. + u32 minWidth; //!< minimum width of a match. + u32 fixedWidth;//!< pattern has fixed width. + u32 studyOffset; //!< offset relative to struct start of study data, + // or zero if there is none + u32 length; //!< length of struct plus pcre bytecode and study data + pcre_extra extra; //!< pcre_extra struct, used to store study data ptr for + // the currently-running pcre at runtime. 
+}; + +static really_inline +const void *ch_get_bytecode(const struct ch_database *db) { + assert(db); + const void *bytecode = (const char *)db + db->bytecode; + assert(ISALIGNED_16(bytecode)); + return bytecode; +} + +struct hs_database; + +static really_inline +const struct hs_database *getHyperscanDatabase(const struct ch_bytecode *db) { + assert(db); + const char *ptr = (const char *)db; + const struct hs_database *hs_db; + hs_db = (const struct hs_database *)(ptr + db->databaseOffset); + assert(ISALIGNED_CL(hs_db)); + return hs_db; +} + +static really_inline +const u32 *getUnguarded(const struct ch_bytecode *db) { + assert(db); + const char *ptr = (const char *)db; + const u32 *unguarded = (const u32 *)(ptr + db->unguardedOffset); + assert(ISALIGNED_N(unguarded, sizeof(u32))); + return unguarded; +} + +static really_inline +const struct ch_pattern *getPattern(const struct ch_bytecode *db, u32 i) { + assert(db); + assert(i < db->patternCount); + const char *ptr = (const char *)db; + const u32 *patternOffset = (const u32 *)(ptr + db->patternOffset); + assert(patternOffset[i] < db->length); + return (const struct ch_pattern *)(ptr + patternOffset[i]); +} + +static really_inline +ch_error_t hydbIsValid(const struct ch_database *hydb) { + if (!hydb || hydb->magic != CH_DB_MAGIC) { + DEBUG_PRINTF("bad magic (%u != %u)\n", hydb ? hydb->magic : 0U, CH_DB_MAGIC); // hydb may be NULL on this path + return CH_INVALID; + } + + if (hydb->version != HS_VERSION_32BIT) { + DEBUG_PRINTF("bad version\n"); + return CH_DB_VERSION_ERROR; + } + + return CH_SUCCESS; +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CH_DATABASE_H_ */ + diff --git a/chimera/ch_internal.h b/chimera/ch_internal.h new file mode 100644 index 000000000..a54d13922 --- /dev/null +++ b/chimera/ch_internal.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Chimera: data structures and internals. 
+ */ + +#ifndef CH_INTERNAL_H +#define CH_INTERNAL_H + +#define CHIMERA_FLAG_NO_MULTIMATCH 1 //!< Don't run a multimatch scan +#define CHIMERA_FLAG_GROUPS 2 //!< Return capturing groups +#define CHIMERA_FLAG_ALL_CONFIRM 4 //!< All patterns need confirm +#define CHIMERA_FLAG_ALL_SINGLE 8 //!< All patterns need only one match + +#define CHIMERA_PATTERN_FLAG_SINGLEMATCH 1 //!< only report the first match +#define CHIMERA_PATTERN_FLAG_UTF8 2 //!< pattern is in UTF-8 mode + +#endif diff --git a/chimera/ch_runtime.c b/chimera/ch_runtime.c new file mode 100644 index 000000000..212bbc7be --- /dev/null +++ b/chimera/ch_runtime.c @@ -0,0 +1,629 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Chimera: main runtime. + */ + +#include +#include +#include +#include + +#include "ch.h" +#include "hs.h" +#include "hs_internal.h" +#include "ue2common.h" +#include "ch_database.h" +#include "ch_internal.h" +#include "ch_scratch.h" +#include "util/multibit.h" +#include "util/unicode_def.h" + +typedef struct queue_item PQ_T; + +static +char PQ_COMP(PQ_T *pqc_items, int a, int b) { + if ((pqc_items)[a].to != (pqc_items)[b].to) { + return (pqc_items)[a].to < (pqc_items)[b].to; + } else if ((pqc_items)[a].from != (pqc_items)[b].from) { + return (pqc_items)[a].from < (pqc_items)[b].from; + } else { + return (pqc_items)[a].id < (pqc_items)[b].id; + } +} + +static +char PQ_COMP_B(PQ_T *pqc_items, int a, PQ_T b_fixed) { + if ((pqc_items)[a].to != (b_fixed).to) { + return (pqc_items)[a].to < (b_fixed).to; + } else if ((pqc_items)[a].from != (b_fixed).from) { + return (pqc_items)[a].from < (b_fixed).from; + } else { + return (pqc_items)[a].id < b_fixed.id; + } +} + +#include "util/pqueue.h" + +static really_inline +void pq_insert_with(struct match_pq *pq, int from, int to, u32 id) { + DEBUG_PRINTF("inserting pattern%u in pq at %u\n", id, to); + struct queue_item temp = { + .from = from, + .to = to, + .id = id, + }; + + pq_insert(pq->item, pq->size, temp); + ++pq->size; +} + +static really_inline +void pq_pop_nice(struct match_pq *pq) { + pq_pop(pq->item, pq->size); + pq->size--; +} + +/** dummy event handler for 
use when user does not provide one */ +static +int HS_CDECL null_onEvent(UNUSED unsigned id, UNUSED unsigned long long from, + UNUSED unsigned long long to, UNUSED unsigned flags, + UNUSED unsigned size, UNUSED const ch_capture_t *captured, + UNUSED void *ctxt) { + return 0; +} + +/** \brief Chimera runtime context. */ +struct HybridContext { + const char *data; //!< buffer being scanned + u32 length; //!< length of data buffer + u32 valid_utf8_highwater; //!< UTF-8 has been validated up to here. + const struct ch_bytecode *db; + struct ch_scratch *scratch; + struct match_pq *pq; + /** \brief user-supplied match callback */ + int (HS_CDECL *match_callback)(unsigned int id, unsigned long long from, + unsigned long long to, unsigned int flags, + unsigned int size, const ch_capture_t *capture, + void *ctx); + /** \brief user-supplied error callback */ + int (HS_CDECL *error_callback)(ch_error_event_t error_type, unsigned int id, + void *info, void *ctx); + /** \brief user-supplied context */ + void *context; +}; + +// Internal PCRE func. +extern int _pcre_valid_utf(const unsigned char *, int, int *); + +/** UTF-8 validity check. Returns >0 if the given region of the data is valid + * UTF-8, 0 otherwise. */ +static +char isValidUTF8(struct HybridContext *hyctx, u32 end) { + assert(hyctx); + + if (hyctx->valid_utf8_highwater >= end) { + return 1; // Already validated. 
+ } + + const unsigned char *data = + (const unsigned char *)hyctx->data + hyctx->valid_utf8_highwater; + int validate_len = end - hyctx->valid_utf8_highwater; + + DEBUG_PRINTF("validating %d bytes\n", validate_len); + + int erroroffset = 0; + if (_pcre_valid_utf(data, validate_len, &erroroffset)) { + DEBUG_PRINTF("UTF8 invalid at offset %d\n", erroroffset); + return 0; + } + + hyctx->valid_utf8_highwater = end; + return 1; +} + +static +const pcre *getPcre(const struct ch_pattern *pattern) { + const char *ptr = (const char *)pattern; + const pcre *p = (const pcre *)(ptr + ROUNDUP_N(sizeof(*pattern), 8)); + assert(ISALIGNED_N(p, 8)); + return p; +} + +/** \brief Fill the Chimera groups array from a pcre_exec ovector. */ +static +void fillGroupsFromOvector(ch_capture_t *groups, int numPairs, int *ovector) { + assert(groups); + assert(ISALIGNED_N(groups, alignof(ch_capture_t))); + + DEBUG_PRINTF("filling %d groups (@ %p) from pcre ovector\n", + numPairs, groups); + + for (int i = 0; i < numPairs * 2; i += 2) { + if (ovector[i] == -1) { + groups->flags = CH_CAPTURE_FLAG_INACTIVE; + } else { + groups->flags = CH_CAPTURE_FLAG_ACTIVE; + assert(ovector[i] <= ovector[i + 1]); + groups->from = ovector[i]; + groups->to = ovector[i + 1]; + } + ++groups; + } +} + +static +ch_error_t handlePcreNonMatch(const struct ch_pattern *pattern, int rv, + ch_error_event_handler onError, + void *userContext) { + assert(rv < 0); + + if (rv == PCRE_ERROR_NOMATCH) { + DEBUG_PRINTF("no match found by libpcre\n"); + return CH_SUCCESS; + } else if (rv == PCRE_ERROR_MATCHLIMIT) { + DEBUG_PRINTF("pcre hit match limit\n"); + if (onError) { + return onError(CH_ERROR_MATCHLIMIT, pattern->id, NULL, + userContext); + } + return CH_SUCCESS; + } else if (rv == PCRE_ERROR_RECURSIONLIMIT) { + DEBUG_PRINTF("pcre hit recursion limit\n"); + if (onError) { + return onError(CH_ERROR_RECURSIONLIMIT, pattern->id, NULL, + userContext); + } + return CH_SUCCESS; + } + + // All other errors not handled above are 
fatal. + return CH_FAIL_INTERNAL; +} + +static +ch_error_t scanPcre(struct HybridContext *hyctx, UNUSED unsigned int length, + unsigned int offset, u32 id) { + const char *data = hyctx->data; + unsigned int full_length = hyctx->length; + ch_error_event_handler onError = hyctx->error_callback; + void *userContext = hyctx->context; + + const struct ch_pattern *pattern = getPattern(hyctx->db, id); + const pcre *p = getPcre(pattern); + + // Set up the PCRE extra block. + const pcre_extra *extra = &pattern->extra; + + int startoffset = offset; + + int *ovector = hyctx->scratch->ovector; + int ovectorSize = (hyctx->scratch->maxCaptureGroups + 1) * 3; + assert(ovectorSize >= 2); + + DEBUG_PRINTF("scanning %u bytes, pattern %u, startoffset %d\n", + length, id, startoffset); + + int options = 0; + if (pattern->flags & CHIMERA_PATTERN_FLAG_UTF8) { + // We do our own UTF-8 validation. + options |= PCRE_NO_UTF8_CHECK; + if (!isValidUTF8(hyctx, full_length)) { + return handlePcreNonMatch(pattern, PCRE_ERROR_BADUTF8, onError, + userContext); + } + } + + int rv = pcre_exec(p, extra, data, full_length, startoffset, options, + ovector, ovectorSize); + + DEBUG_PRINTF("pcre return code is %d\n", rv); + + // Handle all non-match or error cases, all of which involve us + // terminating the loop. + if (rv < 0) { + return handlePcreNonMatch(pattern, rv, onError, userContext); + } + + // We've found a match, and we should always have room for at least the + // start and end offsets in our ovector. Pass this info to the user. 
+ assert(rv >= 1); + assert(rv < ovectorSize); + int from = ovector[0]; + int to = ovector[1]; + DEBUG_PRINTF("match %d -> %d\n", from, to); + + struct ch_patterndata *pd = hyctx->scratch->patternData + id; + + if (hyctx->db->flags & CHIMERA_FLAG_GROUPS) { + fillGroupsFromOvector(pd->match, rv, ovector); + } else { + rv = 0; + } + pd->groupCount = (u32)rv; + + // Insert new matched item to the queue + pq_insert_with(hyctx->pq, from, to, id); + + // Next scan starts at the first codepoint after the match. It's + // possible that we have a vacuous match, in which case we must step + // past it to ensure that we always progress. + if (from != to) { + startoffset = to; + } else if (pattern->flags & CHIMERA_PATTERN_FLAG_UTF8) { + startoffset = to + 1; + while (startoffset < (int)full_length && + ((data[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) { + ++startoffset; + } + } else { + startoffset = to + 1; + } + + pd->scanStart = startoffset; + DEBUG_PRINTF("new offset %u\n", pd->scanStart); + + return CH_SUCCESS; +} + +static +ch_error_t catchupPcre(struct HybridContext *hyctx, unsigned int id, + unsigned long long from, unsigned long long to) { + ch_match_event_handler onEvent = hyctx->match_callback; + void *userContext = hyctx->context; + DEBUG_PRINTF("priority queue size %u\n", hyctx->pq->size); + while (hyctx->pq->size) { + u32 num_item = hyctx->pq->size; + struct queue_item *item = pq_top(hyctx->pq->item); + size_t top_from = item->from; + size_t top_to = item->to; + u32 top_id = item->id; + + if (top_to > to) { + pq_insert_with(hyctx->pq, from, to, id); + break; + } + pq_pop_nice(hyctx->pq); + + const struct ch_pattern *pattern = getPattern(hyctx->db, top_id); + struct ch_patterndata *pd = hyctx->scratch->patternData + top_id; + + // Report match for pattern + DEBUG_PRINTF("trigger match@%zu\n", top_to); + ch_callback_t cbrv = + onEvent(pattern->id, top_from, top_to, 0 /* flags */, + pd->groupCount, pd->match, userContext); + + if (cbrv == CH_CALLBACK_TERMINATE) 
{ + DEBUG_PRINTF("user callback told us to terminate scanning\n"); + return CH_SCAN_TERMINATED; + } else if (cbrv == CH_CALLBACK_SKIP_PATTERN) { + DEBUG_PRINTF("user callback told us to skip this pattern\n"); + pd->scanStart = hyctx->length; + } + + if (top_id == id) { + break; + } + + // Push a new match to replace the old one + unsigned int start = pd->scanStart; + unsigned int len = hyctx->length - pd->scanStart; + if (hyctx->length >= pd->scanStart && + !(pattern->flags & CHIMERA_PATTERN_FLAG_SINGLEMATCH)) { + DEBUG_PRINTF("get a new match item\n"); + int ret = scanPcre(hyctx, len, start, top_id); + + if (ret == CH_CALLBACK_TERMINATE) { + DEBUG_PRINTF("user callback told us to terminate scanning\n"); + return CH_SCAN_TERMINATED; + } else if (ret == CH_CALLBACK_SKIP_PATTERN) { + DEBUG_PRINTF("user callback told us to skip this pattern\n"); + pd->scanStart = hyctx->length; + ret = CH_SUCCESS; + } else if (ret == CH_FAIL_INTERNAL) { + return ret; + } + + // No further match is found + if (hyctx->pq->size == num_item - 1) { + pd->scanStart = hyctx->length; + } + } + } + + return CH_SUCCESS; +} + +/** \brief Callback used for internal Hyperscan multi-matcher. */ +static +int HS_CDECL multiCallback(unsigned int id, unsigned long long from, + unsigned long long to, UNUSED unsigned int flags, + void *ctx) { + assert(ctx); + struct HybridContext *hyctx = ctx; + + DEBUG_PRINTF("match for ID %u at offset %llu\n", id, to); + assert(id < hyctx->db->patternCount); + + const struct ch_pattern *pattern = getPattern(hyctx->db, id); + struct ch_patterndata *pd = hyctx->scratch->patternData + id; + char needConfirm = pattern->fixedWidth == ~0U; + + if (needConfirm && + mmbit_isset(hyctx->scratch->active, hyctx->db->patternCount, id)) { + if ((hyctx->db->flags & CHIMERA_FLAG_ALL_CONFIRM) && + mmbit_all(hyctx->scratch->active, hyctx->db->patternCount)) { + return 1; + } + return 0; + } + // Store the fact that we've seen this bit. 
+ char already = mmbit_set(hyctx->scratch->active, + hyctx->db->patternCount, id); + DEBUG_PRINTF("match from %u to %llu\n", pd->scanStart, to); + + if (!already) { + pd->scanStart = 0; + } else if (to < pd->scanStart + pattern->minWidth) { + return 0; + } else if (pattern->flags & CHIMERA_PATTERN_FLAG_SINGLEMATCH) { + if ((hyctx->db->flags & CHIMERA_FLAG_ALL_SINGLE) && + mmbit_all(hyctx->scratch->active, hyctx->db->patternCount)) { + return 1; + } + // Note: we may have unordered match from Hyperscan, + // thus possibly get to < pd->scanStart. + return 0; + } + + int ret = HS_SUCCESS; + unsigned int start = pd->scanStart; + unsigned int len = hyctx->length - pd->scanStart; + assert(hyctx->length >= pd->scanStart); + const char *data = hyctx->data; + if (needConfirm) { + DEBUG_PRINTF("run confirm for the first time\n"); + ret = scanPcre(hyctx, len, start, id); + hyctx->scratch->ret = ret; + if (ret == CH_CALLBACK_TERMINATE) { + DEBUG_PRINTF("user callback told us to terminate scanning\n"); + return HS_SCAN_TERMINATED; + } else if (ret == CH_CALLBACK_SKIP_PATTERN) { + DEBUG_PRINTF("user callback told us to skip this pattern\n"); + pd->scanStart = hyctx->length; + ret = HS_SUCCESS; + } else if (ret == CH_FAIL_INTERNAL) { + return ret; + } + } else { + if (already) { + DEBUG_PRINTF("catch up with new matches\n"); + ret = catchupPcre(hyctx, id, from, to); + + hyctx->scratch->ret = ret; + if (pd->scanStart >= hyctx->length) { + return ret; + } + } + int startoffset = 0; + // Next scan starts at the first codepoint after the match. It's + // possible that we have a vacuous match, in which case we must step + // past it to ensure that we always progress. 
+ if (from != to) { + startoffset = to; + } else if (pattern->flags & CHIMERA_PATTERN_FLAG_UTF8) { + startoffset = to + 1; + while (startoffset < (int)hyctx->length && + ((data[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) { + ++startoffset; + } + } else { + startoffset = to + 1; + } + pd->scanStart = startoffset; + int rv = 0; + if (hyctx->db->flags & CHIMERA_FLAG_GROUPS) { + ch_capture_t *groups = pd->match; + groups->flags = CH_CAPTURE_FLAG_ACTIVE; + groups->from = from; + groups->to = to; + rv = 1; + } + pd->groupCount = (u32)rv; + pq_insert_with(hyctx->pq, from, to, id); + } + + return ret; +} + +static +hs_error_t scanHyperscan(struct HybridContext *hyctx, const char *data, + unsigned int length) { + DEBUG_PRINTF("scanning %u bytes with Hyperscan\n", length); + const struct ch_bytecode *hydb = hyctx->db; + const hs_database_t *db = getHyperscanDatabase(hydb); + hs_scratch_t *scratch = hyctx->scratch->multi_scratch; + + hs_error_t err = hs_scan(db, data, length, 0, scratch, multiCallback, + hyctx); + + return err; +} + +/** \brief Init match priority queue. + * + * Add a first match offset for each pattern that is not supported by Hyperscan + * with prefiltering. 
+ */ +static really_inline +ch_error_t initQueue(struct HybridContext *hyctx, struct match_pq *pq) { + const struct ch_bytecode *db = hyctx->db; + + u8 *active = hyctx->scratch->active; + mmbit_clear(active, db->patternCount); + + // Init match queue size + pq->size = 0; + + unsigned int length = hyctx->length; + const u32 *unguarded = getUnguarded(db); + for (u32 i = 0; i < db->unguardedCount; i++) { + u32 patternId = unguarded[i]; + DEBUG_PRINTF("switch on unguarded pcre %u\n", patternId); + mmbit_set(active, db->patternCount, patternId); + + DEBUG_PRINTF("get a new match item\n"); + int ret = scanPcre(hyctx, length, 0, patternId); + + struct ch_patterndata *pd = hyctx->scratch->patternData + patternId; + if (ret == CH_CALLBACK_TERMINATE) { + DEBUG_PRINTF("user callback told us to terminate scanning\n"); + return CH_SCAN_TERMINATED; + } else if (ret == CH_CALLBACK_SKIP_PATTERN) { + DEBUG_PRINTF("user callback told us to skip this pattern\n"); + pd->scanStart = length; + ret = CH_SUCCESS; + } else if (ret == CH_FAIL_INTERNAL) { + return ret; + } + } + + return CH_SUCCESS; +} + +static really_inline +ch_error_t ch_scan_i(const ch_database_t *hydb, + const char *data, unsigned int length, + UNUSED unsigned int flags, + ch_scratch_t *scratch, + ch_match_event_handler onEvent, + ch_error_event_handler onError, + void *userContext) { + if (unlikely(!hydb || !scratch || !data)) { + DEBUG_PRINTF("args invalid\n"); + return CH_INVALID; + } + ch_error_t ret = hydbIsValid(hydb); + if (ret != CH_SUCCESS) { + DEBUG_PRINTF("database invalid\n"); + return ret; + } + + if (!ISALIGNED_CL(scratch)) { + DEBUG_PRINTF("bad alignment %p\n", scratch); + return CH_INVALID; + } + + if (scratch->magic != CH_SCRATCH_MAGIC) { + DEBUG_PRINTF("scratch invalid\n"); + return CH_INVALID; + } + + if (unlikely(markScratchInUse(scratch))) { + return CH_SCRATCH_IN_USE; + } + + // Hyperscan underlying scratch and database validity will be checked by + // the hs_scan() call, so no need to do it here. 
+ + // PCRE takes the data region length in as an int, so this limits our block + // size to INT_MAX. + if (length > INT_MAX) { + DEBUG_PRINTF("length invalid\n"); + unmarkScratchInUse(scratch); + return CH_INVALID; + } + + const struct ch_bytecode *db = ch_get_bytecode(hydb); + + scratch->pq.size = 0; + scratch->ret = CH_SUCCESS; + + // Firstly, we run Hyperscan in block mode and add its matches into the + // active list for subsequent confirmation with pcre. + struct HybridContext hyctx = { + .data = data, + .length = length, + .valid_utf8_highwater = 0, + .db = db, + .scratch = scratch, + .pq = &scratch->pq, + .match_callback = onEvent ? onEvent : null_onEvent, + .error_callback = onError, + .context = userContext + }; + + // Init priority queue. + ret = initQueue(&hyctx, &scratch->pq); + if (ret != CH_SUCCESS) { + DEBUG_PRINTF("Chimera returned error %d\n", ret); + unmarkScratchInUse(scratch); + return ret; + } + + if (!(db->flags & CHIMERA_FLAG_NO_MULTIMATCH)) { + ret = scanHyperscan(&hyctx, data, length); + if (ret != HS_SUCCESS && scratch->ret != CH_SUCCESS) { + DEBUG_PRINTF("Hyperscan returned error %d\n", scratch->ret); + unmarkScratchInUse(scratch); + return scratch->ret; + } + } + + DEBUG_PRINTF("Flush priority queue\n"); + // Catch up with PCRE and make up id and offsets as we don't really care + // about their values + ret = catchupPcre(&hyctx, ~0U, length, length); + if (ret != CH_SUCCESS) { + DEBUG_PRINTF("PCRE catch up returned error %d\n", ret); + unmarkScratchInUse(scratch); + return ret; + } + + unmarkScratchInUse(scratch); + return CH_SUCCESS; +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_scan(const ch_database_t *hydb, const char *data, + unsigned int length, unsigned int flags, + ch_scratch_t *scratch, + ch_match_event_handler onEvent, + ch_error_event_handler onError, void *userContext) { + ch_error_t ret = ch_scan_i(hydb, data, length, flags, scratch, onEvent, + onError, userContext); + + return ret; +} + +HS_PUBLIC_API +const char * HS_CDECL 
ch_version(void) { + return HS_VERSION_STRING; +} diff --git a/chimera/ch_runtime.h b/chimera/ch_runtime.h new file mode 100644 index 000000000..6aefcad1b --- /dev/null +++ b/chimera/ch_runtime.h @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CH_RUNTIME_H_ +#define CH_RUNTIME_H_ + +#include + +/** + * @file + * @brief The Chimera runtime API definition. + * + * Chimera is a hybrid of Hyperscan and PCRE regular expression engine. 
+ * + * This header contains functions for using compiled Chimera databases for + * scanning data at runtime. + */ + +#include "hs_common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +struct ch_scratch; + +/** + * A Chimera scratch space. + */ +typedef struct ch_scratch ch_scratch_t; + +/** + * Callback return value used to tell the Chimera matcher what to do after + * processing this match. + */ +typedef int ch_callback_t; + +/** + * @defgroup CH_CALLBACK ch_callback_t values + * + * @{ + */ + +/** + * Continue matching. + */ +#define CH_CALLBACK_CONTINUE 0 + +/** + * Terminate matching. + */ +#define CH_CALLBACK_TERMINATE 1 + +/** + * Skip remaining matches for this ID and continue. + */ +#define CH_CALLBACK_SKIP_PATTERN 2 + + +/** @} */ + + +/** + * Type used to differentiate the errors raised with the @ref + * ch_error_event_handler callback. + */ +typedef int ch_error_event_t; + +/** + * @defgroup CH_ERROR_EVENT ch_error_event_t values + * + * @{ + */ + +/** + * PCRE hits its match limit and reports PCRE_ERROR_MATCHLIMIT. + */ +#define CH_ERROR_MATCHLIMIT 1 + +/** + * PCRE hits its recursion limit and reports PCRE_ERROR_RECURSIONLIMIT. + */ +#define CH_ERROR_RECURSIONLIMIT 2 + +/** @} */ + +/** + * Structure representing a captured subexpression within a match. An array of + * these structures corresponding to capture groups in order is passed to the + * callback on match, with active structures identified by the + * CH_CAPTURE_FLAG_ACTIVE flag. + */ +typedef struct ch_capture { + /** + * The flags indicating if this structure is active. + */ + unsigned int flags; + + /** + * offset at which this capture group begins. + */ + unsigned long long from; /*< offset at which this capture group begins. */ + + /** + * offset at which this capture group ends. + */ + unsigned long long to; +} ch_capture_t; + +/** + * @defgroup CH_CAPTURE ch_capture_t flags + * + * These flags are used in @ref ch_capture_t::flags to indicate if this + * structure is active. 
+ * + * @{ + */ + +/** + * Flag indicating that a particular capture group is inactive, used in @ref + * ch_capture_t::flags. + */ +#define CH_CAPTURE_FLAG_INACTIVE 0 + +/** + * Flag indicating that a particular capture group is active, used in @ref + * ch_capture_t::flags. + */ +#define CH_CAPTURE_FLAG_ACTIVE 1 + +/** @} */ + +/** + * Definition of the match event callback function type. + * + * A callback function matching the defined type must be provided by the + * application calling the @ref ch_scan() + * + * This callback function will be invoked whenever a match is located in the + * target data during the execution of a scan. The details of the match are + * passed in as parameters to the callback function, and the callback function + * should return a value indicating whether or not matching should continue on + * the target data. If no callbacks are desired from a scan call, NULL may be + * provided in order to suppress match production. + * + * @param id + * The ID number of the expression that matched. If the expression was a + * single expression compiled with @ref ch_compile(), this value will be + * zero. + * + * @param from + * The offset of the first byte that matches the expression. + * + * @param to + * The offset after the last byte that matches the expression. + * + * @param flags + * This is provided for future use and is unused at present. + * + * @param size + * The number of valid entries pointed to by the captured parameter. + * + * @param captured + * A pointer to an array of @ref ch_capture_t structures that + * contain the start and end offsets of entire pattern match and + * each captured subexpression. + * + * @param ctx + * The pointer supplied by the user to the @ref ch_scan() function. + * + * @return + * The callback can return @ref CH_CALLBACK_TERMINATE to stop matching. 
+ * Otherwise, a return value of @ref CH_CALLBACK_CONTINUE will continue, + * with the current pattern if configured to produce multiple matches per + * pattern, while a return value of @ref CH_CALLBACK_SKIP_PATTERN will + * cease matching this pattern but continue matching the next pattern. + */ +typedef ch_callback_t (HS_CDECL *ch_match_event_handler)(unsigned int id, + unsigned long long from, + unsigned long long to, + unsigned int flags, + unsigned int size, + const ch_capture_t *captured, + void *ctx); + +/** + * Definition of the Chimera error event callback function type. + * + * A callback function matching the defined type may be provided by the + * application calling the @ref ch_scan function. This callback function + * will be invoked when an error event occurs during matching; this indicates + * that some matches for a given expression may not be reported. + * + * @param error_type + * The type of error event that occurred. Currently these errors + * correspond to resource limits on PCRE backtracking + * @ref CH_ERROR_MATCHLIMIT and @ref CH_ERROR_RECURSIONLIMIT. + * + * @param id + * The ID number of the expression that matched. + * + * @param info + * Event-specific data, for future use. Currently unused. + * + * @param ctx + * The context pointer supplied by the user to the @ref ch_scan + * function. + * + * @return + * The callback can return @ref CH_CALLBACK_SKIP_PATTERN to cease matching + * this pattern but continue matching the next pattern. Otherwise, we stop + * matching for all patterns with @ref CH_CALLBACK_TERMINATE. + */ + typedef ch_callback_t (HS_CDECL *ch_error_event_handler)( + ch_error_event_t error_type, + unsigned int id, void *info, + void *ctx); + +/** + * The block regular expression scanner. + * + * This is the function call in which the actual pattern matching takes place + * for block-mode pattern databases. + * + * @param db + * A compiled pattern database. + * + * @param data + * Pointer to the data to be scanned. 
+ * + * @param length + * The number of bytes to scan. + * + * @param flags + * Flags modifying the behaviour of this function. This parameter is + * provided for future use and is unused at present. + * + * @param scratch + * A per-thread scratch space allocated by @ref ch_alloc_scratch() for this + * database. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param onError + * Pointer to a error event callback function. If a NULL pointer is given, + * @ref CH_ERROR_MATCHLIMIT and @ref CH_ERROR_RECURSIONLIMIT errors will + * be ignored and match will continue. + * + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref CH_SUCCESS on success; @ref CH_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +ch_error_t HS_CDECL ch_scan(const ch_database_t *db, const char *data, + unsigned int length, unsigned int flags, + ch_scratch_t *scratch, + ch_match_event_handler onEvent, + ch_error_event_handler onError, + void *context); + +/** + * Allocate a "scratch" space for use by Chimera. + * + * This is required for runtime use, and one scratch space per thread, or + * concurrent caller, is required. Any allocator callback set by @ref + * ch_set_scratch_allocator() or @ref ch_set_allocator() will be used by this + * function. + * + * @param db + * The database, as produced by @ref ch_compile(). + * + * @param scratch + * On first allocation, a pointer to NULL should be provided so a new + * scratch can be allocated. If a scratch block has been previously + * allocated, then a pointer to it should be passed back in to see if it + * is valid for this database block. If a new scratch block is required, + * the original will be freed and the new one returned, otherwise the + * previous scratch block will be returned. 
On success, the scratch block + * will be suitable for use with the provided database in addition to any + * databases that original scratch space was suitable for. + * + * @return + * @ref CH_SUCCESS on successful allocation; @ref CH_NOMEM if the + * allocation fails. Other errors may be returned if invalid parameters + * are specified. + */ +ch_error_t HS_CDECL ch_alloc_scratch(const ch_database_t *db, + ch_scratch_t **scratch); + +/** + * Allocate a scratch space that is a clone of an existing scratch space. + * + * This is useful when multiple concurrent threads will be using the same set + * of compiled databases, and another scratch space is required. Any allocator + * callback set by @ref ch_set_scratch_allocator() or @ref ch_set_allocator() + * will be used by this function. + * + * @param src + * The existing @ref ch_scratch_t to be cloned. + * + * @param dest + * A pointer to the new scratch space will be returned here. + * + * @return + * @ref CH_SUCCESS on success; @ref CH_NOMEM if the allocation fails. + * Other errors may be returned if invalid parameters are specified. + */ +ch_error_t HS_CDECL ch_clone_scratch(const ch_scratch_t *src, + ch_scratch_t **dest); + +/** + * Provides the size of the given scratch space. + * + * @param scratch + * A per-thread scratch space allocated by @ref ch_alloc_scratch() or @ref + * ch_clone_scratch(). + * + * @param scratch_size + * On success, the size of the scratch space in bytes is placed in this + * parameter. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_scratch_size(const ch_scratch_t *scratch, + size_t *scratch_size); + +/** + * Free a scratch block previously allocated by @ref ch_alloc_scratch() or @ref + * ch_clone_scratch(). + * + * The free callback set by @ref ch_set_scratch_allocator() or @ref + * ch_set_allocator() will be used by this function. + * + * @param scratch + * The scratch block to be freed. NULL may also be safely provided. 
+ * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_free_scratch(ch_scratch_t *scratch); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CH_RUNTIME_H_ */ diff --git a/chimera/ch_scratch.c b/chimera/ch_scratch.c new file mode 100644 index 000000000..af49c34d8 --- /dev/null +++ b/chimera/ch_scratch.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/** \file
+ * \brief Chimera: scratch space alloc.
+ */
+
+/* NOTE(review): the system header name on the next line is missing (it looks
+ * like it was stripped during extraction). Presumably <string.h>, since this
+ * file calls memset() -- confirm against the original patch. */
+#include 
+
+#include "allocator.h"
+#include "ch.h"
+#include "hs.h"
+#include "hs_internal.h"
+#include "ue2common.h"
+#include "ch_alloc.h"
+#include "ch_internal.h"
+#include "ch_scratch.h"
+#include "ch_database.h"
+
+/** \brief Number of bytes needed for the per-pattern data region.
+ *
+ * Covers \c patternCount ch_patterndata entries followed by
+ * \c patternCount * (\c maxCaptureGroups + 1) ch_capture entries, plus
+ * alignof(struct ch_capture) bytes of slack so the capture array can be
+ * aligned after the patterndata array (see initPatternData()).
+ */
+static
+size_t getPatternDataSize(const ch_scratch_t *s) {
+    size_t numCapturingStructs =
+        s->patternCount * (s->maxCaptureGroups + 1);
+    return (sizeof(struct ch_patterndata) * s->patternCount) +
+        alignof(struct ch_capture) + // padding
+        (sizeof(struct ch_capture) * numCapturingStructs);
+}
+
+/** \brief Point every pattern's \c match pointer at its own capture slots.
+ *
+ * Expects \c s->patternData to be set already. Each pattern receives a
+ * contiguous run of (\c maxCaptureGroups + 1) ch_capture entries carved out
+ * of the aligned array that immediately follows the patterndata array.
+ */
+static
+void initPatternData(const ch_scratch_t *s) {
+    // ch_capture array is aligned, directly after the patterndata array.
+    char *ptr = (char *)s->patternData
+        + (sizeof(struct ch_patterndata) * s->patternCount);
+    struct ch_capture *cap = (struct ch_capture *)
+        (ROUNDUP_PTR(ptr, alignof(struct ch_capture)));
+
+    for (u32 i = 0; i < s->patternCount; i++) {
+        struct ch_patterndata *pd = &s->patternData[i];
+        pd->match = cap;
+        DEBUG_PRINTF("pattern %u: pd=%p, match=%p\n", i, pd, pd->match);
+        cap += (s->maxCaptureGroups + 1);
+    }
+}
+
+/** \brief Allocate and lay out a new scratch sized from \a proto.
+ *
+ * A single allocation holds, in order: the ch_scratch header (rounded up to a
+ * 64-byte boundary), the ovector, the captured array, the pattern data
+ * region, the priority queue items and the active region. The header is
+ * copied wholesale from \a proto (including its \c multi_scratch pointer) and
+ * then \c magic, \c in_use, \c scratch_alloc and the internal pointers are
+ * overwritten.
+ *
+ * \param proto   Header supplying maxCaptureGroups, patternCount and
+ *                activeSize for sizing.
+ * \param scratch Receives the new, aligned scratch; set to NULL if the
+ *                allocation check fails.
+ * \return CH_SUCCESS, or the error reported by ch_check_alloc().
+ */
+static
+ch_error_t alloc_scratch(const ch_scratch_t *proto, ch_scratch_t **scratch) {
+    // Three ints per capture slot (presumably PCRE's ovector layout --
+    // TODO confirm).
+    size_t ovectorSize = (proto->maxCaptureGroups + 1) * sizeof(int) * 3;
+    size_t capturedSize =
+        sizeof(struct ch_capture) * (proto->maxCaptureGroups + 1);
+    size_t patternDataSize = getPatternDataSize(proto);
+    size_t activeSize = proto->activeSize;
+    size_t queueSize = proto->patternCount * sizeof(struct queue_item);
+
+    // max padding for alignment below.
+    size_t padding = alignof(int) + alignof(struct ch_capture)
+                     + alignof(struct ch_patterndata)
+                     + alignof(struct queue_item);
+
+    size_t allocSize = sizeof(ch_scratch_t) + ovectorSize + capturedSize
+                       + patternDataSize + activeSize + queueSize + padding
+                       + 256; /* padding for cacheline alignment */
+    ch_scratch_t *s;
+    ch_scratch_t *s_tmp = ch_scratch_alloc(allocSize);
+    ch_error_t err = ch_check_alloc(s_tmp);
+    if (err != CH_SUCCESS) {
+        ch_scratch_free(s_tmp);
+        *scratch = NULL;
+        return err;
+    }
+
+    memset(s_tmp, 0, allocSize);
+    // s is the aligned header; s_tmp stays the pointer that must be freed.
+    s = ROUNDUP_PTR(s_tmp, 64);
+    // Set ordinary members.
+    *s = *proto;
+
+    s->magic = CH_SCRATCH_MAGIC;
+    s->in_use = 0;
+    s->scratch_alloc = (char *)s_tmp;
+
+    // Set pointers internal to allocation.
+
+    char *ptr = (char *)s + sizeof(*s);
+    ptr = ROUNDUP_PTR(ptr, alignof(int));
+    s->ovector = (int *)ptr;
+    ptr += ovectorSize;
+
+    ptr = ROUNDUP_PTR(ptr, alignof(struct ch_capture));
+    s->captured = (struct ch_capture *)ptr;
+    ptr += capturedSize;
+
+    ptr = ROUNDUP_PTR(ptr, alignof(struct ch_patterndata));
+    s->patternData = (struct ch_patterndata *)ptr;
+    ptr += patternDataSize;
+
+    // Pre-fill pattern data, setting captureOffsets
+    initPatternData(s);
+
+    ptr = ROUNDUP_PTR(ptr, alignof(struct queue_item));
+    s->pq.item = (struct queue_item *)ptr;
+    ptr += queueSize;
+
+    // The active region takes whatever follows the queue; no extra alignment
+    // is applied here.
+    s->active = (u8 *)ptr;
+
+    // Store size.
+    s->scratchSize = allocSize;
+
+    // We should never overrun our allocation.
+ assert((ptr + activeSize) - (char *)s <= (ptrdiff_t)allocSize); + + *scratch = s; + return CH_SUCCESS; +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_alloc_scratch(const ch_database_t *hydb, + ch_scratch_t **scratch) { + if (!hydb || !scratch) { + DEBUG_PRINTF("invalid args\n"); + return CH_INVALID; + } + + DEBUG_PRINTF("hydb=%p, &scratch=%p\n", hydb, scratch); + ch_error_t rv = hydbIsValid(hydb); + if (rv != CH_SUCCESS) { + DEBUG_PRINTF("invalid database\n"); + return rv; + } + + if (*scratch != NULL) { + /* has to be aligned before we can do anything with it */ + if (!ISALIGNED_CL(*scratch)) { + return CH_INVALID; + } + if ((*scratch)->magic != CH_SCRATCH_MAGIC) { + return CH_INVALID; + } + if (markScratchInUse(*scratch)) { + return CH_SCRATCH_IN_USE; + } + } + + // We allocate a prototype of the scratch header to do our sizing with. + ch_scratch_t *proto; + ch_scratch_t *proto_tmp = ch_scratch_alloc(sizeof(ch_scratch_t) + 256); + ch_error_t proto_ret = ch_check_alloc(proto_tmp); + if (proto_ret != CH_SUCCESS) { + ch_scratch_free(proto_tmp); + ch_scratch_free(*scratch); + *scratch = NULL; + return proto_ret; + } + + proto = ROUNDUP_PTR(proto_tmp, 64); + + int resize = 0; + if (*scratch) { + *proto = **scratch; + } else { + memset(proto, 0, sizeof(*proto)); + resize = 1; + } + proto->scratch_alloc = (char *)proto_tmp; + + const struct ch_bytecode *db = ch_get_bytecode(hydb); + + if (db->maxCaptureGroups > proto->maxCaptureGroups) { + proto->maxCaptureGroups = db->maxCaptureGroups; + resize = 1; + } + + if (db->patternCount > proto->patternCount) { + proto->patternCount = db->patternCount; + proto->activeSize = db->activeSize; + resize = 1; + } + + if (resize) { + if (*scratch) { + ch_scratch_free((*scratch)->scratch_alloc); + } + + ch_error_t alloc_ret = alloc_scratch(proto, scratch); + ch_scratch_free(proto_tmp); + if (alloc_ret != CH_SUCCESS) { + *scratch = NULL; + return alloc_ret; + } + } else { + ch_scratch_free(proto_tmp); + unmarkScratchInUse(*scratch); + } 
+ + if (db->flags & CHIMERA_FLAG_NO_MULTIMATCH) { + (*scratch)->multi_scratch = NULL; + return CH_SUCCESS; + } + + // We may still have to realloc the underlying Hyperscan scratch. + rv = hs_alloc_scratch(getHyperscanDatabase(db), + &(*scratch)->multi_scratch); + if (rv != HS_SUCCESS) { + DEBUG_PRINTF("hs_alloc_scratch for multi_scratch failed\n"); + hs_free_scratch((*scratch)->multi_scratch); + ch_scratch_free((*scratch)->scratch_alloc); + *scratch = NULL; + return rv; + } + + return CH_SUCCESS; +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_clone_scratch(const ch_scratch_t *src, + ch_scratch_t **dest) { + if (!dest || !src || !ISALIGNED_CL(src) || + src->magic != CH_SCRATCH_MAGIC) { + DEBUG_PRINTF("scratch invalid\n"); + return CH_INVALID; + } + + ch_error_t ret = alloc_scratch(src, dest); + if (ret != CH_SUCCESS) { + DEBUG_PRINTF("alloc_scratch failed\n"); + *dest = NULL; + return ret; + } + + if (src->multi_scratch) { + (*dest)->multi_scratch = NULL; + ret = hs_clone_scratch(src->multi_scratch, &(*dest)->multi_scratch); + if (ret != HS_SUCCESS) { + DEBUG_PRINTF("hs_clone_scratch(multi_scratch,...) failed\n"); + ch_scratch_free(*dest); + return ret; + } + } + + return CH_SUCCESS; +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_free_scratch(ch_scratch_t *scratch) { + ch_error_t ret = CH_SUCCESS; + if (scratch) { + /* has to be aligned before we can do anything with it */ + if (!ISALIGNED_CL(scratch)) { + return CH_INVALID; + } + if (scratch->magic != CH_SCRATCH_MAGIC) { + return CH_INVALID; + } + if (markScratchInUse(scratch)) { + return CH_SCRATCH_IN_USE; + } + + if (scratch->multi_scratch) { + ret = hs_free_scratch(scratch->multi_scratch); + } + + scratch->magic = 0; + assert(scratch->scratch_alloc); + DEBUG_PRINTF("scratch %p is really at %p : freeing\n", scratch, + scratch->scratch_alloc); + ch_scratch_free(scratch->scratch_alloc); + } + + return ret; +} + +/** Not public, but used for info from our internal tools. 
Note that in the + * hybrid matcher the scratch is definitely not a contiguous memory region. */ +HS_PUBLIC_API +ch_error_t HS_CDECL ch_scratch_size(const ch_scratch_t *scratch, size_t *size) { + ch_error_t ret = CH_SUCCESS; + if (!size || !scratch || !ISALIGNED_CL(scratch) || + scratch->magic != CH_SCRATCH_MAGIC) { + return CH_INVALID; + } else { + size_t multi_size = 0; + + if (scratch->multi_scratch) { + ret = hs_scratch_size(scratch->multi_scratch, &multi_size); + } + if (ret) { + multi_size = 0; + } + + *size = scratch->scratchSize + multi_size; + } + + return ret; +} diff --git a/chimera/ch_scratch.h b/chimera/ch_scratch.h new file mode 100644 index 000000000..47d9101e2 --- /dev/null +++ b/chimera/ch_scratch.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Scratch and associated data structures. + * + * This header gets pulled into many places (many deep, slow to compile + * places). Try to keep the included headers under control. + */ + +#ifndef CH_SCRATCH_H_ +#define CH_SCRATCH_H_ + +#include "ch_common.h" +#include "ch_runtime.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define CH_SCRATCH_MAGIC 0x554F4259 //!< Magic number stored in \ref ch_scratch + +struct queue_item { + int from; /** \brief used to store the start location. */ + int to; /** \brief used to store the current location. */ + u32 id; /**< pattern index. */ +}; + +struct match_pq { + struct queue_item *item; + u32 size; /**< current size of the priority queue */ +}; + +/** \brief Information about a pattern stored at runtime when a match is + * encountered. */ +struct ch_patterndata { + struct ch_capture *match; //!< buffered group info + u32 groupCount; //!< number of capturing groups + u32 scanStart; //!< start of match window (still to be single-scanned). +}; + +/** \brief Scratch space header for Chimera. */ +struct ch_scratch { + u32 magic; //!< must be \ref CH_SCRATCH_MAGIC + u8 in_use; /**< non-zero when being used by an API call. */ + struct hs_scratch *multi_scratch; //!< for hyperscan scatch. + int *ovector; //!< maximally-sized ovector for PCRE usage. + struct ch_capture *captured; //!< max-sized capture group struct. + u8 *active; //!< active multibit. 
+ struct ch_patterndata *patternData; //!< per-pattern match data, indexed by + // pattern ID. + struct match_pq pq; //!< priority queue to ensure matching ordering + u32 patternCount; //!< number of patterns, used to size active multibit + u32 activeSize; //!< size of active multibit + u32 maxCaptureGroups; //!< largest num of capturing groups required + u32 scratchSize; //!< size of allocation + int ret; //!< return value in Hyperscan callback + char *scratch_alloc; /* user allocated scratch object */ +}; + +/** + * \brief Mark scratch as in use. + * + * Returns non-zero if it was already in use, zero otherwise. + */ +static really_inline +char markScratchInUse(struct ch_scratch *scratch) { + DEBUG_PRINTF("marking scratch as in use\n"); + assert(scratch && scratch->magic == CH_SCRATCH_MAGIC); + if (scratch->in_use) { + DEBUG_PRINTF("scratch already in use!\n"); + return 1; + } + scratch->in_use = 1; + return 0; +} + +/** + * \brief Mark scratch as no longer in use. + */ +static really_inline +void unmarkScratchInUse(struct ch_scratch *scratch) { + DEBUG_PRINTF("marking scratch as not in use\n"); + assert(scratch && scratch->magic == CH_SCRATCH_MAGIC); + assert(scratch->in_use == 1); + scratch->in_use = 0; +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CH_SCRATCH_H_ */ diff --git a/chimera/libch.pc.in b/chimera/libch.pc.in new file mode 100644 index 000000000..7f2660094 --- /dev/null +++ b/chimera/libch.pc.in @@ -0,0 +1,12 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=@CMAKE_INSTALL_PREFIX@ +libdir=@CMAKE_INSTALL_PREFIX@/lib +includedir=@CMAKE_INSTALL_PREFIX@/include + +Name: libch +Description: Intel(R) Chimera Library +Version: @HS_VERSION@ +Requires.private: libhs +Libs: -L${libdir} -lchimera +Libs.private: @PRIVATE_LIBS@ +Cflags: -I${includedir}/hs diff --git a/cmake/pcre.cmake b/cmake/pcre.cmake index acad45bdd..2b0d23c73 100644 --- a/cmake/pcre.cmake +++ b/cmake/pcre.cmake @@ -54,11 +54,10 @@ else () find_package(PkgConfig) 
pkg_check_modules(PCRE libpcre=${PCRE_REQUIRED_VERSION}) if (PCRE_FOUND) + set(CORRECT_PCRE_VERSION TRUE) message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION}") else () message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION} not found") return () endif () endif (PCRE_BUILD_SOURCE) - -set (PCRE_CHECKED TRUE PARENT_SCOPE) diff --git a/doc/dev-reference/chimera.rst b/doc/dev-reference/chimera.rst new file mode 100644 index 000000000..883cb5a0a --- /dev/null +++ b/doc/dev-reference/chimera.rst @@ -0,0 +1,333 @@ +.. _chimera: + +####### +Chimera +####### + +This section describes Chimera library. + +************ +Introduction +************ + +Chimera is a software regular expression matching engine that is a hybrid of +Hyperscan and PCRE. The design goals of Chimera are to fully support PCRE +syntax as well as to take advantage of the high performance nature of Hyperscan. + +Chimera inherits the design guideline of Hyperscan with C APIs for compilation +and scanning. + +The Chimera API itself is composed of two major components: + +=========== +Compilation +=========== + +These functions take a group of regular expressions, along with identifiers and +option flags, and compile them into an immutable database that can be used by +the Chimera scanning API. This compilation process performs considerable +analysis and optimization work in order to build a database that will match +the given expressions efficiently. + +See :ref:`chcompile` for more details + +======== +Scanning +======== + +Once a Chimera database has been created, it can be used to scan data in memory. +Chimera only supports block mode in which we scan a single contiguous block in +memory. + +Matches are delivered to the application via a user-supplied callback function +that is called synchronously for each match. 
+
+For a given database, Chimera provides several guarantees:
+
+* No memory allocations occur at runtime, with the exception of scratch space
+  allocation, which should be done ahead of time for performance-critical
+  applications:
+
+  - **Scratch space**: temporary memory used for internal data at scan time.
+    Structures in scratch space do not persist beyond the end of a single scan
+    call.
+
+* The size of the scratch space required for a given database is fixed and
+  determined at database compile time. This means that the memory requirements
+  of the application are known ahead of time, and the scratch space can be
+  pre-allocated if required for performance reasons.
+
+* Any pattern that has successfully been compiled by the Chimera compiler can
+  be scanned against any input. There could be internal resource limits or
+  other limitations caused by PCRE at runtime that could cause a scan call to
+  return an error.
+
+.. note:: Chimera is designed to have the same matching behavior as PCRE,
+   including greedy/ungreedy, capturing, etc. Chimera reports both
+   **start offset** and **end offset** for each match like PCRE. Unlike
+   Hyperscan, which reports all matches, Chimera only reports
+   non-overlapping matches. For example, the pattern :regexp:`/foofoo/` will
+   match ``foofoofoofoo`` at offsets (0, 6) and (6, 12).
+
+.. note:: Since Chimera is a hybrid of Hyperscan and PCRE in order to support
+   full PCRE syntax, there will be extra performance overhead compared to a
+   Hyperscan-only solution. Please always use Hyperscan for better performance
+   unless you need full PCRE syntax support.
+
+See :ref:`chruntime` for more details.
+
+************
+Requirements
+************
+
+The PCRE library (http://pcre.org/) version 8.41 is required for Chimera.
+
+.. note:: Since Chimera needs to reference PCRE internal functions, please place
+   the PCRE source directory under the Hyperscan root directory in order to
+   build Chimera.
+
+Besides this, both the hardware and software requirements of Chimera are the same as those of Hyperscan.
+See :ref:`hardware` and :ref:`software` for more details.
+
+.. note:: Building Hyperscan will automatically generate the Chimera library.
+   Currently only a static library is supported for Chimera, so please
+   use the static build type when configuring CMake build options.
+
+.. _chcompile:
+
+******************
+Compiling Patterns
+******************
+
+===================
+Building a Database
+===================
+
+The Chimera compiler API accepts regular expressions and converts them into a
+compiled pattern database that can then be used to scan data.
+
+The API provides three functions that compile regular expressions into
+databases:
+
+#. :c:func:`ch_compile`: compiles a single expression into a pattern database.
+
+#. :c:func:`ch_compile_multi`: compiles an array of expressions into a pattern
+   database. All of the supplied patterns will be scanned for concurrently at
+   scan time, with user-supplied identifiers returned when they match.
+
+#. :c:func:`ch_compile_ext_multi`: compiles an array of expressions as above,
+   but allows PCRE match limits to be specified for each expression.
+
+Compilation allows the Chimera library to analyze the given pattern(s) and
+pre-determine how to scan for these patterns in an optimized fashion using
+Hyperscan and PCRE.
+
+===============
+Pattern Support
+===============
+
+Chimera fully supports the pattern syntax used by the PCRE library ("libpcre"),
+described at <http://www.pcre.org/>. The version of PCRE used to validate
+Chimera's interpretation of this syntax is 8.41.
+
+=========
+Semantics
+=========
+
+Chimera supports the exact same semantics as the PCRE library. Moreover, it supports
+multiple simultaneous pattern matching like Hyperscan, and the multiple matches
+will be reported in order by end offset.
+
+.. _chruntime:
+
+*********************
+Scanning for Patterns
+*********************
+
+Chimera provides a scan function, ``ch_scan``.
+
+================
+Handling Matches
+================
+
+``ch_scan`` will call a user-supplied callback function when a match
+is found. This function has the following signature:
+
+  .. doxygentypedef:: ch_match_event_handler
+      :outline:
+      :no-link:
+
+The *id* argument will be set to the identifier for the matching expression
+provided at compile time, the *from* argument will be set to the
+start-offset of the match, and the *to* argument will be set to the end-offset
+of the match. The *captured* argument stores the offsets of the entire pattern
+match as well as of the captured subexpressions. The *size* argument will be set
+to the number of valid entries in *captured*.
+
+The match callback function has the capability to continue or halt scanning
+by returning different values.
+
+See :c:type:`ch_match_event_handler` for more information.
+
+=======================
+Handling Runtime Errors
+=======================
+
+``ch_scan`` will call a user-supplied callback function when a runtime error
+occurs in libpcre. This function has the following signature:
+
+  .. doxygentypedef:: ch_error_event_handler
+      :outline:
+      :no-link:
+
+The *id* argument will be set to the identifier for the matching expression
+provided at compile time.
+
+The error callback function has the capability to either halt scanning or
+continue scanning for the next pattern.
+
+See :c:type:`ch_error_event_handler` for more information.
+
+=============
+Scratch Space
+=============
+
+While scanning data, Chimera needs a small amount of temporary memory to store
+on-the-fly internal data. This amount is unfortunately too large to fit on the
+stack, particularly for embedded applications, and allocating memory dynamically
+is too expensive, so a pre-allocated "scratch" space must be provided to the
+scanning functions.
+
+The function :c:func:`ch_alloc_scratch` allocates a large enough region of
+scratch space to support a given database.
If the application uses multiple +databases, only a single scratch region is necessary: in this case, calling +:c:func:`ch_alloc_scratch` on each database (with the same ``scratch`` pointer) +will ensure that the scratch space is large enough to support scanning against +any of the given databases. + +While the Chimera library is re-entrant, the use of scratch spaces is not. +For example, if by design it is deemed necessary to run recursive or nested +scanning (say, from the match callback function), then an additional scratch +space is required for that context. + +In the absence of recursive scanning, only one such space is required per thread +and can (and indeed should) be allocated before data scanning is to commence. + +In a scenario where a set of expressions are compiled by a single "master" +thread and data will be scanned by multiple "worker" threads, the convenience +function :c:func:`ch_clone_scratch` allows multiple copies of an existing +scratch space to be made for each thread (rather than forcing the caller to pass +all the compiled databases through :c:func:`ch_alloc_scratch` multiple times). + +For example: + +.. code-block:: c + + ch_error_t err; + ch_scratch_t *scratch_prototype = NULL; + err = ch_alloc_scratch(db, &scratch_prototype); + if (err != CH_SUCCESS) { + printf("ch_alloc_scratch failed!"); + exit(1); + } + + ch_scratch_t *scratch_thread1 = NULL; + ch_scratch_t *scratch_thread2 = NULL; + + err = ch_clone_scratch(scratch_prototype, &scratch_thread1); + if (err != CH_SUCCESS) { + printf("ch_clone_scratch failed!"); + exit(1); + } + err = ch_clone_scratch(scratch_prototype, &scratch_thread2); + if (err != CH_SUCCESS) { + printf("ch_clone_scratch failed!"); + exit(1); + } + + ch_free_scratch(scratch_prototype); + + /* Now two threads can both scan against database db, + each with its own scratch space. 
*/ + + +================= +Custom Allocators +================= + +By default, structures used by Chimera at runtime (scratch space, etc) are +allocated with the default system allocators, usually +``malloc()`` and ``free()``. + +The Chimera API provides a facility for changing this behaviour to support +applications that use custom memory allocators. + +These functions are: + +- :c:func:`ch_set_database_allocator`, which sets the allocate and free functions + used for compiled pattern databases. +- :c:func:`ch_set_scratch_allocator`, which sets the allocate and free + functions used for scratch space. +- :c:func:`ch_set_misc_allocator`, which sets the allocate and free functions + used for miscellaneous data, such as compile error structures and + informational strings. + +The :c:func:`ch_set_allocator` function can be used to set all of the custom +allocators to the same allocate/free pair. + + +************************ +API Reference: Constants +************************ + +=========== +Error Codes +=========== + +.. doxygengroup:: CH_ERROR + :content-only: + :no-link: + +============= +Pattern flags +============= + +.. doxygengroup:: CH_PATTERN_FLAG + :content-only: + :no-link: + +================== +Compile mode flags +================== + +.. doxygengroup:: CH_MODE_FLAG + :content-only: + :no-link: + + +******************** +API Reference: Files +******************** + +========== +File: ch.h +========== + +.. doxygenfile:: ch.h + +================= +File: ch_common.h +================= + +.. doxygenfile:: ch_common.h + +================== +File: ch_compile.h +================== + +.. doxygenfile:: ch_compile.h + +================== +File: ch_runtime.h +================== + +.. doxygenfile:: ch_runtime.h diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst index df3041875..7a7f37ec6 100644 --- a/doc/dev-reference/compilation.rst +++ b/doc/dev-reference/compilation.rst @@ -471,3 +471,93 @@ matching support. 
Here they are, in a nutshell: Approximate matching is always disabled by default, and can be enabled on a per-pattern basis by using an extended parameter described in :ref:`extparam`. + +.. _logical_combinations: + +******************** +Logical Combinations +******************** + +For situations when a user requires behaviour that depends on the presence or +absence of matches from groups of patterns, Hyperscan provides support for the +logical combination of patterns in a given pattern set, with three operators: +``NOT``, ``AND`` and ``OR``. + +The logical value of such a combination is based on each expression's matching +status at a given offset. The matching status of any expression has a boolean +value: *false* if the expression has not yet matched or *true* if the expression +has already matched. In particular, the value of a ``NOT`` operation at a given +offset is *true* if the expression it refers to is *false* at this offset. + +For example, ``NOT 101`` means that expression 101 has not yet matched at this +offset. + +A logical combination is passed to Hyperscan at compile time as an expression. +This combination expression will raise matches at every offset where one of its +sub-expressions matches and the logical value of the whole expression is *true*. + +To illustrate, here is an example combination expression: :: + + ((301 OR 302) AND 303) AND (304 OR NOT 305) + +If expression 301 matches at offset 10, the logical value of 301 is *true* +while the other patterns' values are *false*. Hence, the whole combination's value is +*false*. + +Then expression 303 matches at offset 20. Now the values of 301 and 303 are +*true* while the other patterns' values are still *false*. In this case, the +combination's value is *true*, so the combination expression raises a match at +offset 20. + +Finally, expression 305 has matches at offset 30. Now the values of 301, 303 and 305 +are *true* while the other patterns' values are still *false*. 
In this case, the +combination's value is *false* and no match is raised. + +**Using Logical Combinations** + +In logical combination syntax, an expression is written in infix notation; it +consists of operands, operators and parentheses. The operands are expression +IDs, and operators are ``!`` (NOT), ``&`` (AND) or ``|`` (OR). For example, the +combination described in the previous section would be written as: :: + + ((301 | 302) & 303) & (304 | !305) + +In a logical combination expression: + + * The priority of operators is ``!`` > ``&`` > ``|``. For example: + - ``A&B|C`` is treated as ``(A&B)|C``, + - ``A|B&C`` is treated as ``A|(B&C)``, + - ``A&!B`` is treated as ``A&(!B)``. + * Extra parentheses are allowed. For example: + - ``(A)&!(B)`` is the same as ``A&!B``, + - ``(A&B)|C`` is the same as ``A&B|C``. + * Whitespace is ignored. + +To use a logical combination expression, it must be passed to one of the +Hyperscan compile functions (:c:func:`hs_compile_multi`, +:c:func:`hs_compile_ext_multi`) along with the :c:member:`HS_FLAG_COMBINATION` flag, +which identifies the pattern as a logical combination expression. The patterns +referred to in the logical combination expression must be compiled together in +the same pattern set as the combination expression. + +When an expression has the :c:member:`HS_FLAG_COMBINATION` flag set, the only +other flags it may carry are the :c:member:`HS_FLAG_SINGLEMATCH` flag and the +:c:member:`HS_FLAG_QUIET` flag; any other flag is rejected at compile time. + +Hyperscan will reject logical combination expressions at compile time that +evaluate to *true* when no patterns have matched; for example: :: + + !101 + !101|102 + !101&!102 + !(101&102) + +Patterns that are referred to as operands within a logical combination (for +example, 301 through 305 in the examples above) may also use the +:c:member:`HS_FLAG_QUIET` flag to silence the reporting of individual matches +for those patterns.
In the absence of this flag, all matches (for +both individual patterns and their logical combinations) will be reported. + +When an expression has both the :c:member:`HS_FLAG_COMBINATION` flag and the +:c:member:`HS_FLAG_QUIET` flag set, no matches for this logical combination +will be reported. diff --git a/doc/dev-reference/getting_started.rst b/doc/dev-reference/getting_started.rst index 1c5002f5b..4e4d36f3a 100644 --- a/doc/dev-reference/getting_started.rst +++ b/doc/dev-reference/getting_started.rst @@ -27,10 +27,10 @@ Very Quick Start Known working generators: * ``Unix Makefiles`` --- make-compatible makefiles (default on Linux/FreeBSD/Mac OS X) * ``Ninja`` --- `Ninja `_ build files. + * ``Visual Studio 15 2017`` --- Visual Studio projects Generators that might work include: * ``Xcode`` --- OS X Xcode projects. - * ``Visual Studio`` --- Visual Studio projects - very experimental #. Build Hyperscan @@ -38,6 +38,7 @@ Very Quick Start * ``cmake --build .`` --- will build everything * ``make -j`` --- use makefiles in parallel * ``ninja`` --- use Ninja build + * ``MsBuild.exe`` --- use Visual Studio MsBuild * etc. #. Check Hyperscan @@ -49,6 +50,8 @@ Very Quick Start Requirements ************ +.. _hardware: + Hardware ======== @@ -84,6 +87,7 @@ compiler support. The supported compilers are: * GCC, v4.8.1 or higher * Clang, v3.4 or higher (with libstdc++ or libc++) * Intel C++ Compiler v15 or higher + * Visual C++ 2017 Build Tools Examples of operating systems that Hyperscan is known to work on include: @@ -96,13 +100,17 @@ FreeBSD: * 10.0 or newer +Windows: + +* 8 or newer + Mac OS X: * 10.8 or newer, using XCode/Clang Hyperscan *may* compile and run on other platforms, but there is no guarantee. We currently have experimental support for Windows using Intel C++ Compiler -or Visual Studio 2015. +or Visual Studio 2017. 
In addition, the following software is required for compiling the Hyperscan library: @@ -118,7 +126,8 @@ Dependency Version Notes Most of these dependencies can be provided by the package manager on the build system (e.g. Debian/Ubuntu/RedHat packages, FreeBSD ports, etc). However, -ensure that the correct version is present. +ensure that the correct version is present. As for Windows, in order to have +Ragel, you may use Cygwin to build it from source. Boost Headers ------------- diff --git a/doc/dev-reference/hyperscan.doxyfile.in b/doc/dev-reference/hyperscan.doxyfile.in index a01739587..b9eaf078b 100644 --- a/doc/dev-reference/hyperscan.doxyfile.in +++ b/doc/dev-reference/hyperscan.doxyfile.in @@ -758,7 +758,7 @@ WARN_LOGFILE = # spaces. # Note: If this tag is empty the current directory is searched. -INPUT = @CMAKE_SOURCE_DIR@/src/hs.h @CMAKE_SOURCE_DIR@/src/hs_common.h @CMAKE_SOURCE_DIR@/src/hs_compile.h @CMAKE_SOURCE_DIR@/src/hs_runtime.h +INPUT = @CMAKE_SOURCE_DIR@/src/hs.h @CMAKE_SOURCE_DIR@/src/hs_common.h @CMAKE_SOURCE_DIR@/src/hs_compile.h @CMAKE_SOURCE_DIR@/src/hs_runtime.h @CMAKE_SOURCE_DIR@/chimera/ch.h @CMAKE_SOURCE_DIR@/chimera/ch_common.h @CMAKE_SOURCE_DIR@/chimera/ch_compile.h @CMAKE_SOURCE_DIR@/chimera/ch_runtime.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. 
Doxygen uses diff --git a/doc/dev-reference/index.rst b/doc/dev-reference/index.rst index 32f188dd4..b5d6a54bc 100644 --- a/doc/dev-reference/index.rst +++ b/doc/dev-reference/index.rst @@ -20,3 +20,4 @@ Hyperscan |version| Developer's Reference Guide tools api_constants api_files + chimera diff --git a/doc/dev-reference/tools.rst b/doc/dev-reference/tools.rst index 9c2ce6eb5..e0465fc65 100644 --- a/doc/dev-reference/tools.rst +++ b/doc/dev-reference/tools.rst @@ -246,6 +246,8 @@ Character API Flag Description ``W`` :c:member:`HS_FLAG_UCP` Unicode property support ``P`` :c:member:`HS_FLAG_PREFILTER` Prefiltering mode ``L`` :c:member:`HS_FLAG_SOM_LEFTMOST` Leftmost start of match reporting +``C`` :c:member:`HS_FLAG_COMBINATION` Logical combination of patterns +``Q`` :c:member:`HS_FLAG_QUIET` Quiet at matching ========= ================================= =========== In addition to the set of flags above, :ref:`extparam` can be supplied diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index 7affb08d3..a34eadd0f 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -45,6 +45,7 @@ #include "parser/buildstate.h" #include "parser/dump.h" #include "parser/Component.h" +#include "parser/logical_combination.h" #include "parser/parse_error.h" #include "parser/Parser.h" // for flags #include "parser/position.h" @@ -111,14 +112,21 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression, const hs_expr_ext *ext) : expr(index_in, flags & HS_FLAG_ALLOWEMPTY, flags & HS_FLAG_SINGLEMATCH, false, flags & HS_FLAG_PREFILTER, SOM_NONE, report, 0, MAX_OFFSET, - 0, 0, 0) { + 0, 0, 0, flags & HS_FLAG_QUIET) { + // We disallow SOM + Quiet. 
+ if ((flags & HS_FLAG_QUIET) && (flags & HS_FLAG_SOM_LEFTMOST)) { + throw CompileError("HS_FLAG_QUIET is not supported in " + "combination with HS_FLAG_SOM_LEFTMOST."); + } + flags &= ~HS_FLAG_QUIET; ParseMode mode(flags); component = parse(expression, mode); expr.utf8 = mode.utf8; /* utf8 may be set by parse() */ - if (expr.utf8 && !isValidUtf8(expression)) { + const size_t len = strlen(expression); + if (expr.utf8 && !isValidUtf8(expression, len)) { throw ParseError("Expression is not valid UTF-8."); } @@ -233,6 +241,45 @@ void addExpression(NG &ng, unsigned index, const char *expression, DEBUG_PRINTF("index=%u, id=%u, flags=%u, expr='%s'\n", index, id, flags, expression); + if (flags & HS_FLAG_COMBINATION) { + if (flags & ~(HS_FLAG_COMBINATION | HS_FLAG_QUIET | + HS_FLAG_SINGLEMATCH)) { + throw CompileError("only HS_FLAG_QUIET and HS_FLAG_SINGLEMATCH " + "are supported in combination " + "with HS_FLAG_COMBINATION."); + } + if (flags & HS_FLAG_QUIET) { + DEBUG_PRINTF("skip QUIET logical combination expression %u\n", id); + } else { + u32 ekey = INVALID_EKEY; + u64a min_offset = 0; + u64a max_offset = MAX_OFFSET; + if (flags & HS_FLAG_SINGLEMATCH) { + ekey = ng.rm.getExhaustibleKey(id); + } + if (ext) { + validateExt(*ext); + if (ext->flags & ~(HS_EXT_FLAG_MIN_OFFSET | + HS_EXT_FLAG_MAX_OFFSET)) { + throw CompileError("only HS_EXT_FLAG_MIN_OFFSET and " + "HS_EXT_FLAG_MAX_OFFSET extra flags " + "are supported in combination " + "with HS_FLAG_COMBINATION."); + } + if (ext->flags & HS_EXT_FLAG_MIN_OFFSET) { + min_offset = ext->min_offset; + } + if (ext->flags & HS_EXT_FLAG_MAX_OFFSET) { + max_offset = ext->max_offset; + } + } + ng.rm.pl.parseLogicalCombination(id, expression, ekey, min_offset, + max_offset); + DEBUG_PRINTF("parsed logical combination expression %u\n", id); + } + return; + } + // Ensure that our pattern isn't too long (in characters). 
if (strlen(expression) > cc.grey.limitPatternLength) { throw CompileError("Pattern length exceeds limit."); diff --git a/src/compiler/expression_info.h b/src/compiler/expression_info.h index 45d18cbfc..fefb3b58a 100644 --- a/src/compiler/expression_info.h +++ b/src/compiler/expression_info.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,12 +46,12 @@ class ExpressionInfo { bool highlander_in, bool utf8_in, bool prefilter_in, som_type som_in, ReportID report_in, u64a min_offset_in, u64a max_offset_in, u64a min_length_in, u32 edit_distance_in, - u32 hamm_distance_in) + u32 hamm_distance_in, bool quiet_in) : index(index_in), report(report_in), allow_vacuous(allow_vacuous_in), highlander(highlander_in), utf8(utf8_in), prefilter(prefilter_in), som(som_in), min_offset(min_offset_in), max_offset(max_offset_in), min_length(min_length_in), edit_distance(edit_distance_in), - hamm_distance(hamm_distance_in) {} + hamm_distance(hamm_distance_in), quiet(quiet_in) {} /** * \brief Index of the expression represented by this graph. @@ -98,6 +98,9 @@ class ExpressionInfo { */ u32 edit_distance; u32 hamm_distance; + + /** \brief Quiet on match. 
*/ + bool quiet; }; } diff --git a/src/hs.cpp b/src/hs.cpp index 04ffb479d..329702d40 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -245,6 +245,11 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, } } + // Check sub-expression ids + ng.rm.pl.validateSubIDs(ids, expressions, flags, elements); + // Renumber and assign lkey to reports + ng.rm.logicalKeyRenumber(); + unsigned length = 0; struct hs_database *out = build(ng, &length); diff --git a/src/hs_compile.h b/src/hs_compile.h index dc9ba307c..c8dcfdf21 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -811,6 +811,28 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); */ #define HS_FLAG_SOM_LEFTMOST 256 +/** + * Compile flag: Logical combination. + * + * This flag instructs Hyperscan to parse this expression as logical + * combination syntax. + * Logical constraints consist of operands, operators and parentheses. + * The operands are expression indices, and operators can be + * '!'(NOT), '&'(AND) or '|'(OR). + * For example: + * (101&102&103)|(104&!105) + * ((301|302)&303)&(304|305) + */ +#define HS_FLAG_COMBINATION 512 + +/** + * Compile flag: Don't do any match reporting. + * + * This flag instructs Hyperscan to ignore match reporting for this expression. + * It is designed to be used on the sub-expressions in logical combinations. 
+ */ +#define HS_FLAG_QUIET 1024 + /** @} */ /** diff --git a/src/hs_runtime.h b/src/hs_runtime.h index 9bf674866..6d34b6c48 100644 --- a/src/hs_runtime.h +++ b/src/hs_runtime.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -122,11 +122,11 @@ typedef struct hs_scratch hs_scratch_t; * subsequent calls to @ref hs_scan_stream() for that stream will * immediately return with @ref HS_SCAN_TERMINATED. */ -typedef int (*match_event_handler)(unsigned int id, - unsigned long long from, - unsigned long long to, - unsigned int flags, - void *context); +typedef int (HS_CDECL *match_event_handler)(unsigned int id, + unsigned long long from, + unsigned long long to, + unsigned int flags, + void *context); /** * Open and initialise a stream. diff --git a/src/nfa/goughcompile.cpp b/src/nfa/goughcompile.cpp index ba7f27184..3f1614dd1 100644 --- a/src/nfa/goughcompile.cpp +++ b/src/nfa/goughcompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -374,7 +374,7 @@ unique_ptr makeCFG(const raw_som_dfa &raw) { } u16 top_sym = raw.alpha_remap[TOP]; - DEBUG_PRINTF("top: %hu, kind %d\n", top_sym, raw.kind); + DEBUG_PRINTF("top: %hu, kind %s\n", top_sym, to_string(raw.kind).c_str()); /* create edges, JOIN variables (on edge targets) */ map seen; diff --git a/src/nfa/mcclellan_internal.h b/src/nfa/mcclellan_internal.h index 549bccf5e..5289b074c 100644 --- a/src/nfa/mcclellan_internal.h +++ b/src/nfa/mcclellan_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in 
source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -84,7 +84,7 @@ struct mcclellan { u8 has_accel; /**< 1 iff there are any accel plans */ u8 remap[256]; /**< remaps characters to a smaller alphabet */ ReportID arb_report; /**< one of the accepts that this dfa may raise */ - u32 accel_offset; /**< offset of the accel structures from start of NFA */ + u32 accel_offset; /**< offset of accel structures from start of McClellan */ u32 haig_offset; /**< reserved for use by Haig, relative to start of NFA */ }; diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index d705ddf91..8e3a744cb 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -760,7 +760,7 @@ bytecode_ptr mcclellanCompile8(dfa_info &info, const CompileContext &cc, return nfa; } -#define MAX_SHERMAN_LIST_LEN 8 +#define MAX_SHERMAN_LIST_LEN 9 static void addIfEarlier(flat_set &dest, dstate_id_t candidate, diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c index 9722fd676..4619ff6fd 100644 --- a/src/nfa/mcsheng.c +++ b/src/nfa/mcsheng.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -173,7 +173,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, u32 sheng_limit_x4 = sheng_limit * 0x01010101; m128 simd_stop_limit = set4x32(sheng_stop_limit_x4); m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit); - DEBUG_PRINTF("end %hu, accel %hhu --> limit %hhu\n", sheng_limit, + DEBUG_PRINTF("end 
%hhu, accel %hu --> limit %hhu\n", sheng_limit, m->sheng_accel_limit, sheng_stop_limit); #endif @@ -181,7 +181,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, m128 shuffle_mask = masks[*(c++)]; \ s = pshufb_m128(shuffle_mask, s); \ u32 s_gpr_x4 = movd(s); /* convert to u8 */ \ - DEBUG_PRINTF("c %hhu (%c) --> s %hhu\n", c[-1], c[-1], s_gpr_x4); \ + DEBUG_PRINTF("c %hhu (%c) --> s %u\n", c[-1], c[-1], s_gpr_x4); \ if (s_gpr_x4 >= sheng_stop_limit_x4) { \ s_gpr = s_gpr_x4; \ goto exit; \ @@ -191,7 +191,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, u8 s_gpr; while (c < c_end) { #if defined(HAVE_BMI2) && defined(ARCH_64_BIT) - /* This version uses pext for efficently bitbashing out scaled + /* This version uses pext for efficiently bitbashing out scaled * versions of the bytes to process from a u64a */ u64a data_bytes = unaligned_load_u64a(c); @@ -201,7 +201,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, s = pshufb_m128(shuffle_mask0, s); m128 s_max = s; m128 s_max0 = s_max; - DEBUG_PRINTF("c %02llx --> s %hhu\n", cc0 >> 4, movd(s)); + DEBUG_PRINTF("c %02llx --> s %u\n", cc0 >> 4, movd(s)); #define SHENG_SINGLE_UNROLL_ITER(iter) \ assert(iter); \ @@ -217,7 +217,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, s_max = max_u8_m128(s_max, s); \ } \ m128 s_max##iter = s_max; \ - DEBUG_PRINTF("c %02llx --> s %hhu max %hhu\n", cc##iter >> 4, \ + DEBUG_PRINTF("c %02llx --> s %u max %u\n", cc##iter >> 4, \ movd(s), movd(s_max)); SHENG_SINGLE_UNROLL_ITER(1); diff --git a/src/nfa/mcsheng_internal.h b/src/nfa/mcsheng_internal.h index 81a658e0e..bb45ae23f 100644 --- a/src/nfa/mcsheng_internal.h +++ b/src/nfa/mcsheng_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted 
provided that the following conditions are met: @@ -84,7 +84,7 @@ struct mcsheng { u8 has_accel; /**< 1 iff there are any accel plans */ u8 remap[256]; /**< remaps characters to a smaller alphabet */ ReportID arb_report; /**< one of the accepts that this dfa may raise */ - u32 accel_offset; /**< offset of the accel structures from start of NFA */ + u32 accel_offset; /**< offset of accel structures from start of McClellan */ m128 sheng_masks[N_CHARS]; }; diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c index 837aa7dfa..4f30910b5 100644 --- a/src/nfa/sheng.c +++ b/src/nfa/sheng.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -628,7 +628,7 @@ char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q) { fireSingleReport(cb, ctxt, sh->report, offset); } else { fireReports(sh, cb, ctxt, s, offset, &cached_state_id, - &cached_report_id, 1); + &cached_report_id, 0); } } diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp index 8b7e4f91f..8dccf9863 100644 --- a/src/nfagraph/ng.cpp +++ b/src/nfagraph/ng.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -577,7 +577,8 @@ bool NG::addHolder(NGHolder &g) { } bool NG::addLiteral(const ue2_literal &literal, u32 expr_index, - u32 external_report, bool highlander, som_type som) { + u32 external_report, bool highlander, som_type som, + bool quiet) { assert(!literal.empty()); if (!cc.grey.shortcutLiterals) { @@ -605,7 +606,7 @@ bool NG::addLiteral(const ue2_literal &literal, u32 expr_index, } else { u32 ekey = highlander ? 
rm.getExhaustibleKey(external_report) : INVALID_EKEY; - Report r = makeECallback(external_report, 0, ekey); + Report r = makeECallback(external_report, 0, ekey, quiet); id = rm.getInternalId(r); } diff --git a/src/nfagraph/ng.h b/src/nfagraph/ng.h index a13045834..a5a9077d4 100644 --- a/src/nfagraph/ng.h +++ b/src/nfagraph/ng.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -77,7 +77,7 @@ class NG : noncopyable { /** \brief Adds a literal to Rose, used by literal shortcut passes (instead * of using \ref addGraph) */ bool addLiteral(const ue2_literal &lit, u32 expr_index, u32 external_report, - bool highlander, som_type som); + bool highlander, som_type som, bool quiet); /** \brief Maximum history in bytes available for use by SOM reverse NFAs, * a hack for pattern support (see UE-1903). 
This is always set to the max diff --git a/src/nfagraph/ng_calc_components.cpp b/src/nfagraph/ng_calc_components.cpp index 65574b50b..3e9454eee 100644 --- a/src/nfagraph/ng_calc_components.cpp +++ b/src/nfagraph/ng_calc_components.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -53,11 +53,11 @@ #include "ng_depth.h" #include "ng_holder.h" #include "ng_prune.h" -#include "ng_undirected.h" #include "ng_util.h" #include "grey.h" #include "ue2common.h" #include "util/graph_range.h" +#include "util/graph_undirected.h" #include "util/make_unique.h" #include @@ -310,28 +310,19 @@ void splitIntoComponents(unique_ptr g, return; } - unordered_map old2new; - auto ug = createUnGraph(*g, true, true, old2new); + auto ug = make_undirected_graph(*g); - // Construct reverse mapping. - unordered_map new2old; - for (const auto &m : old2new) { - new2old.emplace(m.second, m.first); - } + // Filter specials and shell vertices from undirected graph. + unordered_set bad_vertices( + {g->start, g->startDs, g->accept, g->acceptEod}); + bad_vertices.insert(head_shell.begin(), head_shell.end()); + bad_vertices.insert(tail_shell.begin(), tail_shell.end()); - // Filter shell vertices from undirected graph. - unordered_set shell_undir_vertices; - for (auto v : head_shell) { - shell_undir_vertices.insert(old2new.at(v)); - } - for (auto v : tail_shell) { - shell_undir_vertices.insert(old2new.at(v)); - } auto filtered_ug = boost::make_filtered_graph( - ug, boost::keep_all(), make_bad_vertex_filter(&shell_undir_vertices)); + ug, boost::keep_all(), make_bad_vertex_filter(&bad_vertices)); // Actually run the connected components algorithm. 
- map split_components; + map split_components; const u32 num = connected_components( filtered_ug, boost::make_assoc_property_map(split_components)); @@ -348,10 +339,8 @@ void splitIntoComponents(unique_ptr g, // Collect vertex lists per component. for (const auto &m : split_components) { - NFAUndirectedVertex uv = m.first; + NFAVertex v = m.first; u32 c = m.second; - assert(contains(new2old, uv)); - NFAVertex v = new2old.at(uv); verts[c].push_back(v); DEBUG_PRINTF("vertex %zu is in comp %u\n", (*g)[v].index, c); } diff --git a/src/nfagraph/ng_haig.cpp b/src/nfagraph/ng_haig.cpp index 992faf7ce..805454477 100644 --- a/src/nfagraph/ng_haig.cpp +++ b/src/nfagraph/ng_haig.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -73,7 +73,7 @@ static void populateInit(const NGHolder &g, const flat_set &unused, stateset *init, stateset *initDS, vector *v_by_index) { - DEBUG_PRINTF("graph kind: %u\n", (int)g.kind); + DEBUG_PRINTF("graph kind: %s\n", to_string(g.kind).c_str()); for (auto v : vertices_range(g)) { if (contains(unused, v)) { continue; diff --git a/src/nfagraph/ng_mcclellan.cpp b/src/nfagraph/ng_mcclellan.cpp index 091b89b8a..4ce5dc153 100644 --- a/src/nfagraph/ng_mcclellan.cpp +++ b/src/nfagraph/ng_mcclellan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -542,7 +542,8 @@ unique_ptr buildMcClellan(const NGHolder &graph, return nullptr; } - DEBUG_PRINTF("attempting to build ?%d? 
mcclellan\n", (int)graph.kind); + DEBUG_PRINTF("attempting to build %s mcclellan\n", + to_string(graph.kind).c_str()); assert(allMatchStatesHaveReports(graph)); bool prunable = grey.highlanderPruneDFA && has_managed_reports(graph); diff --git a/src/nfagraph/ng_repeat.cpp b/src/nfagraph/ng_repeat.cpp index 96c553ded..1f63ad3c6 100644 --- a/src/nfagraph/ng_repeat.cpp +++ b/src/nfagraph/ng_repeat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,7 +38,6 @@ #include "ng_prune.h" #include "ng_reports.h" #include "ng_som_util.h" -#include "ng_undirected.h" #include "ng_util.h" #include "nfa/accel.h" #include "nfa/limex_limits.h" @@ -48,6 +47,7 @@ #include "util/dump_charclass.h" #include "util/graph_range.h" #include "util/graph_small_color_map.h" +#include "util/graph_undirected.h" #include "util/report_manager.h" #include "util/unordered.h" @@ -73,40 +73,41 @@ namespace ue2 { namespace { -/** \brief Filter that retains only edges between vertices with the same - * reachability. */ +/** + * \brief Filter that retains only edges between vertices with the same + * reachability. Special vertices are dropped. + */ template struct ReachFilter { - ReachFilter() {} + ReachFilter() = default; explicit ReachFilter(const Graph *g_in) : g(g_in) {} // Convenience typedefs. 
- typedef typename boost::graph_traits Traits; - typedef typename Traits::vertex_descriptor VertexDescriptor; - typedef typename Traits::edge_descriptor EdgeDescriptor; + using Traits = typename boost::graph_traits; + using VertexDescriptor = typename Traits::vertex_descriptor; + using EdgeDescriptor = typename Traits::edge_descriptor; - bool operator()(const EdgeDescriptor &e) const { + bool operator()(const VertexDescriptor &v) const { assert(g); - - VertexDescriptor u = source(e, *g), v = target(e, *g); - // Disallow special vertices, as otherwise we will try to remove them // later. - if (is_special(u, *g) || is_special(v, *g)) { - return false; - } + return !is_special(v, *g); + } + bool operator()(const EdgeDescriptor &e) const { + assert(g); // Vertices must have the same reach. + auto u = source(e, *g), v = target(e, *g); const CharReach &cr_u = (*g)[u].char_reach; const CharReach &cr_v = (*g)[v].char_reach; - return cr_u == cr_v; } const Graph *g = nullptr; }; -typedef boost::filtered_graph> RepeatGraph; +using RepeatGraph = boost::filtered_graph, + ReachFilter>; struct ReachSubgraph { vector vertices; @@ -300,10 +301,9 @@ void splitSubgraph(const NGHolder &g, const deque &verts, unordered_map verts_map; // in g -> in verts_g fillHolder(&verts_g, g, verts, &verts_map); - unordered_map old2new; - auto ug = createUnGraph(verts_g, true, true, old2new); + const auto ug = make_undirected_graph(verts_g); - unordered_map repeatMap; + unordered_map repeatMap; size_t num = connected_components(ug, make_assoc_property_map(repeatMap)); DEBUG_PRINTF("found %zu connected repeat components\n", num); @@ -312,7 +312,8 @@ void splitSubgraph(const NGHolder &g, const deque &verts, vector rs(num); for (auto v : verts) { - NFAUndirectedVertex vu = old2new.at(verts_map.at(v)); + assert(!is_special(v, g)); + auto vu = verts_map.at(v); auto rit = repeatMap.find(vu); if (rit == repeatMap.end()) { continue; /* not part of a repeat */ @@ -323,8 +324,14 @@ void splitSubgraph(const 
NGHolder &g, const deque &verts, } for (const auto &rsi : rs) { + if (rsi.vertices.empty()) { + // Empty elements can happen when connected_components finds a + // subgraph consisting entirely of specials (which aren't added to + // ReachSubgraph in the loop above). There's nothing we can do with + // these, so we skip them. + continue; + } DEBUG_PRINTF("repeat with %zu vertices\n", rsi.vertices.size()); - assert(!rsi.vertices.empty()); if (rsi.vertices.size() >= minNumVertices) { DEBUG_PRINTF("enqueuing\n"); q.push(rsi); @@ -1023,17 +1030,16 @@ static void buildReachSubgraphs(const NGHolder &g, vector &rs, const u32 minNumVertices) { const ReachFilter fil(&g); - const RepeatGraph rg(g, fil); + const RepeatGraph rg(g, fil, fil); if (!isCompBigEnough(rg, minNumVertices)) { DEBUG_PRINTF("component not big enough, bailing\n"); return; } - unordered_map old2new; - auto ug = createUnGraph(rg, true, true, old2new); + const auto ug = make_undirected_graph(rg); - unordered_map repeatMap; + unordered_map repeatMap; unsigned int num; num = connected_components(ug, make_assoc_property_map(repeatMap)); @@ -1045,8 +1051,7 @@ void buildReachSubgraphs(const NGHolder &g, vector &rs, rs.resize(num); for (auto v : topoOrder) { - NFAUndirectedVertex vu = old2new[v]; - auto rit = repeatMap.find(vu); + auto rit = repeatMap.find(v); if (rit == repeatMap.end()) { continue; /* not part of a repeat */ } diff --git a/src/nfagraph/ng_stop.cpp b/src/nfagraph/ng_stop.cpp index c335540ac..5e627bb59 100644 --- a/src/nfagraph/ng_stop.cpp +++ b/src/nfagraph/ng_stop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -87,7 +87,11 @@ struct InitDepths { /** Find the set of characters that are not present in the reachability of * graph \p g after a certain depth (currently 8). 
If a character in this set * is encountered, it means that the NFA is either dead or has not progressed - * more than 8 characters from its start states. */ + * more than 8 characters from its start states. + * + * This is only used to guide merging heuristics, use + * findLeftOffsetStopAlphabet for real uses. + */ CharReach findStopAlphabet(const NGHolder &g, som_type som) { const depth max_depth(MAX_STOP_DEPTH); const InitDepths depths(g); diff --git a/src/nfagraph/ng_stop.h b/src/nfagraph/ng_stop.h index da70a4fd0..4a889dca0 100644 --- a/src/nfagraph/ng_stop.h +++ b/src/nfagraph/ng_stop.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,7 +47,11 @@ class NGHolder; /** Find the set of characters that are not present in the reachability of * graph \p g after a certain depth (currently 8). If a character in this set * is encountered, it means that the NFA is either dead or has not progressed - * more than 8 characters from its start states. */ + * more than 8 characters from its start states. + * + * This is only used to guide merging heuristics, use + * findLeftOffsetStopAlphabet for real uses. + */ CharReach findStopAlphabet(const NGHolder &g, som_type som); /** Calculate the stop alphabet for each depth from 0 to MAX_STOP_DEPTH. 
Then diff --git a/src/nfagraph/ng_undirected.h b/src/nfagraph/ng_undirected.h deleted file mode 100644 index 036adcbff..000000000 --- a/src/nfagraph/ng_undirected.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Create an undirected graph from an NFAGraph. 
- */ - -#ifndef NG_UNDIRECTED_H -#define NG_UNDIRECTED_H - -#include "ng_holder.h" -#include "ng_util.h" -#include "ue2common.h" -#include "util/graph_range.h" -#include "util/unordered.h" - -#include - -#include - -namespace ue2 { - -/** - * \brief BGL graph type for the undirected NFA graph. - * - * Note that we use a set for the out-edge lists: this avoids the construction - * of parallel edges. The only vertex property constructed is \a - * vertex_index_t. - */ -using NFAUndirectedGraph = boost::adjacency_list< - boost::listS, // out edges - boost::listS, // vertices - boost::undirectedS, // graph is undirected - boost::property>; // vertex properties - -using NFAUndirectedVertex = NFAUndirectedGraph::vertex_descriptor; - -/** - * Make a copy of an NFAGraph with undirected edges, optionally without start - * vertices. Mappings from the original graph to the new one are provided. - * - * Note that new vertex indices are assigned contiguously in \a vertices(g) - * order. - */ -template -NFAUndirectedGraph createUnGraph(const Graph &g, - bool excludeStarts, - bool excludeAccepts, - std::unordered_map &old2new) { - NFAUndirectedGraph ug; - size_t idx = 0; - - assert(old2new.empty()); - old2new.reserve(num_vertices(g)); - - for (auto v : ue2::vertices_range(g)) { - // skip all accept nodes - if (excludeAccepts && is_any_accept(v, g)) { - continue; - } - - // skip starts if required - if (excludeStarts && is_any_start(v, g)) { - continue; - } - - auto nuv = boost::add_vertex(ug); - old2new.emplace(v, nuv); - boost::put(boost::vertex_index, ug, nuv, idx++); - } - - // Track seen edges so that we don't insert parallel edges. 
- using Vertex = typename Graph::vertex_descriptor; - ue2_unordered_set> seen; - seen.reserve(num_edges(g)); - auto make_ordered_edge = [](Vertex a, Vertex b) { - return std::make_pair(std::min(a, b), std::max(a, b)); - }; - - for (const auto &e : ue2::edges_range(g)) { - auto u = source(e, g); - auto v = target(e, g); - - if ((excludeAccepts && is_any_accept(u, g)) - || (excludeStarts && is_any_start(u, g))) { - continue; - } - - if ((excludeAccepts && is_any_accept(v, g)) - || (excludeStarts && is_any_start(v, g))) { - continue; - } - - if (!seen.emplace(make_ordered_edge(u, v)).second) { - continue; // skip parallel edge. - } - - NFAUndirectedVertex new_u = old2new.at(u); - NFAUndirectedVertex new_v = old2new.at(v); - - boost::add_edge(new_u, new_v, ug); - } - - assert(!has_parallel_edge(ug)); - return ug; -} - -} // namespace ue2 - -#endif /* NG_UNDIRECTED_H */ diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index 9ce732c2e..78d73082a 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -60,6 +60,7 @@ #include "util/flat_containers.h" #include "util/graph.h" #include "util/graph_range.h" +#include "util/graph_small_color_map.h" #include "util/insertion_ordered.h" #include "util/make_unique.h" #include "util/order_check.h" @@ -133,14 +134,21 @@ bool createsTransientLHS(const NGHolder &g, const vector &vv, return true; } +/** + * Counts the number of vertices that are reachable from the set of sources + * given. + */ static -double calcSplitRatio(const NGHolder &g, const vector &vv) { - flat_set not_reachable; - find_unreachable(g, vv, ¬_reachable); - double rv = (double)not_reachable.size() / num_vertices(g); - rv = rv > 0.5 ? 
1 - rv : rv; +size_t count_reachable(const NGHolder &g, const vector &sources, + small_color_map &color_map) { + auto null_visitor = boost::make_dfs_visitor(boost::null_visitor()); + color_map.fill(small_color::white); - return rv; + for (auto v : sources) { + boost::depth_first_visit(g, v, null_visitor, color_map); + } + + return color_map.count(small_color::black); } static @@ -687,8 +695,12 @@ unique_ptr findBestSplit(const NGHolder &g, } if (last_chance) { + const size_t num_verts = num_vertices(g); + auto color_map = make_small_color_map(g); for (auto &a : lits) { - a->split_ratio = calcSplitRatio(g, a->vv); + size_t num_reachable = count_reachable(g, a->vv, color_map); + double ratio = (double)num_reachable / (double)num_verts; + a->split_ratio = ratio > 0.5 ? 1 - ratio : ratio; } } diff --git a/src/nfagraph/ng_width.cpp b/src/nfagraph/ng_width.cpp index 4c33220c1..219241ca5 100644 --- a/src/nfagraph/ng_width.cpp +++ b/src/nfagraph/ng_width.cpp @@ -176,11 +176,7 @@ depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter, } if (d.is_unreachable()) { - // If we're actually reachable, we'll have a min width, so we can - // return infinity in this case. - if (findMinWidth(h, filter, src).is_reachable()) { - return depth::infinity(); - } + assert(findMinWidth(h, filter, src).is_unreachable()); return d; } diff --git a/src/parser/logical_combination.cpp b/src/parser/logical_combination.cpp new file mode 100644 index 000000000..b78390b07 --- /dev/null +++ b/src/parser/logical_combination.cpp @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Parse and build ParsedLogical::logicalTree and combInfoMap. 
+ */ +#include "logical_combination.h" +#include "parser/parse_error.h" +#include "util/container.h" +#include "hs_compile.h" + +#include + +using namespace std; + +namespace ue2 { + +u32 ParsedLogical::getLogicalKey(u32 a) { + auto it = toLogicalKeyMap.find(a); + if (it == toLogicalKeyMap.end()) { + // get size before assigning to avoid wacky LHS shenanigans + u32 size = toLogicalKeyMap.size(); + bool inserted; + tie(it, inserted) = toLogicalKeyMap.emplace(a, size); + assert(inserted); + } + DEBUG_PRINTF("%u -> lkey %u\n", it->first, it->second); + return it->second; +} + +u32 ParsedLogical::getCombKey(u32 a) { + auto it = toCombKeyMap.find(a); + if (it == toCombKeyMap.end()) { + u32 size = toCombKeyMap.size(); + bool inserted; + tie(it, inserted) = toCombKeyMap.emplace(a, size); + assert(inserted); + } + DEBUG_PRINTF("%u -> ckey %u\n", it->first, it->second); + return it->second; +} + +void ParsedLogical::addRelateCKey(u32 lkey, u32 ckey) { + auto it = lkey2ckeys.find(lkey); + if (it == lkey2ckeys.end()) { + bool inserted; + tie(it, inserted) = lkey2ckeys.emplace(lkey, set()); + assert(inserted); + } + it->second.insert(ckey); + DEBUG_PRINTF("lkey %u belongs to combination key %u\n", + it->first, ckey); +} + +#define TRY_RENUM_OP(ckey) \ +do { \ + if (ckey & LOGICAL_OP_BIT) { \ + ckey = (ckey & ~LOGICAL_OP_BIT) + toLogicalKeyMap.size(); \ + } \ +} while(0) + +u32 ParsedLogical::logicalTreeAdd(u32 op, u32 left, u32 right) { + LogicalOp lop; + assert((LOGICAL_OP_BIT & (u32)logicalTree.size()) == 0); + lop.id = LOGICAL_OP_BIT | (u32)logicalTree.size(); + lop.op = op; + lop.lo = left; + lop.ro = right; + logicalTree.push_back(lop); + return lop.id; +} + +void ParsedLogical::combinationInfoAdd(UNUSED u32 ckey, u32 id, u32 ekey, + u32 lkey_start, u32 lkey_result, + u64a min_offset, u64a max_offset) { + assert(ckey == combInfoMap.size()); + CombInfo ci; + ci.id = id; + ci.ekey = ekey; + ci.start = lkey_start; + ci.result = lkey_result; + ci.min_offset = min_offset; + 
ci.max_offset = max_offset; + combInfoMap.push_back(ci); + + DEBUG_PRINTF("ckey %u (id %u) -> lkey %u..%u, ekey=0x%x\n", ckey, ci.id, + ci.start, ci.result, ci.ekey); +} + +void ParsedLogical::validateSubIDs(const unsigned *ids, + const char *const *expressions, + const unsigned *flags, + unsigned elements) { + for (const auto &it : toLogicalKeyMap) { + bool unknown = true; + u32 i = 0; + for (i = 0; i < elements; i++) { + if ((ids ? ids[i] : 0) == it.first) { + unknown = false; + break; + } + } + if (unknown) { + throw CompileError("Unknown sub-expression id."); + } + if (contains(toCombKeyMap, it.first)) { + throw CompileError("Have combination of combination."); + } + if (flags && (flags[i] & HS_FLAG_SOM_LEFTMOST)) { + throw CompileError("Have SOM flag in sub-expression."); + } + if (flags && (flags[i] & HS_FLAG_PREFILTER)) { + throw CompileError("Have PREFILTER flag in sub-expression."); + } + hs_compile_error_t *compile_err = NULL; + hs_expr_info_t *info = NULL; + hs_error_t err = hs_expression_info(expressions[i], + flags ? flags[i] : 0, &info, &compile_err); /* flags may be NULL, as guarded above */ + if (err != HS_SUCCESS) { + hs_free_compile_error(compile_err); + throw CompileError("Run hs_expression_info() failed."); + } + if (!info) { + throw CompileError("Get hs_expr_info_t failed."); + } + const bool unordered = info->unordered_matches != 0; + free(info); /* released before the throw below so info cannot leak */ + if (unordered) { + throw CompileError("Have unordered match in sub-expressions."); + } + } +} + +void ParsedLogical::logicalKeyRenumber() { + // renumber operation lkey in op vector + for (auto &op : logicalTree) { + TRY_RENUM_OP(op.id); + TRY_RENUM_OP(op.lo); + TRY_RENUM_OP(op.ro); + } + // renumber operation lkey in info map + for (auto &ci : combInfoMap) { + TRY_RENUM_OP(ci.start); + TRY_RENUM_OP(ci.result); + } +} + +struct LogicalOperator { + LogicalOperator(u32 op_in, u32 paren_in) + : op(op_in), paren(paren_in) {} + u32 op; + u32 paren; +}; + +static +u32 toOperator(char c) { + u32 op = UNKNOWN_OP; + switch (c) { + case '!'
: + op = LOGICAL_OP_NOT; + break; + case '&' : + op = LOGICAL_OP_AND; + break; + case '|' : + op = LOGICAL_OP_OR; + break; + default: + break; + }; + return op; +} + +static +bool cmpOperator(const LogicalOperator &op1, const LogicalOperator &op2) { + if (op1.paren < op2.paren) { + return false; + } + if (op1.paren > op2.paren) { + return true; + } + assert(op1.paren == op2.paren); + if (op1.op > op2.op) { + return false; + } + if (op1.op < op2.op) { + return true; + } + return true; +} + +static +u32 fetchSubID(const char *logical, u32 &digit, u32 end) { + if (digit == (u32)-1) { // no digit parsing in progress + return (u32)-1; + } + assert(end > digit); + if (end - digit > 9) { + throw LocatedParseError("Expression id too large"); + } + u32 mult = 1; + u32 sum = 0; + for (u32 j = end - 1; (j >= digit) && (j != (u32)-1) ; j--) { + assert(isdigit(logical[j])); + sum += (logical[j] - '0') * mult; + mult *= 10; + } + digit = (u32)-1; + return sum; +} + +static +void popOperator(vector &op_stack, vector &subid_stack, + ParsedLogical &pl) { + if (subid_stack.empty()) { + throw LocatedParseError("Not enough operand"); + } + u32 right = subid_stack.back(); + subid_stack.pop_back(); + u32 left = 0; + if (op_stack.back().op != LOGICAL_OP_NOT) { + if (subid_stack.empty()) { + throw LocatedParseError("Not enough operand"); + } + left = subid_stack.back(); + subid_stack.pop_back(); + } + subid_stack.push_back(pl.logicalTreeAdd(op_stack.back().op, left, right)); + op_stack.pop_back(); +} + +static +char getValue(const vector &lv, u32 ckey) { + if (ckey & LOGICAL_OP_BIT) { + return lv[ckey & ~LOGICAL_OP_BIT]; + } else { + return 0; + } +} + +static +bool hasMatchFromPurelyNegative(const vector &tree, + u32 start, u32 result) { + vector lv(tree.size()); + assert(start <= result); + for (u32 i = start; i <= result; i++) { + assert(i & LOGICAL_OP_BIT); + const LogicalOp &op = tree[i & ~LOGICAL_OP_BIT]; + assert(i == op.id); + switch (op.op) { + case LOGICAL_OP_NOT: + lv[op.id & 
~LOGICAL_OP_BIT] = !getValue(lv, op.ro); + break; + case LOGICAL_OP_AND: + lv[op.id & ~LOGICAL_OP_BIT] = getValue(lv, op.lo) & + getValue(lv, op.ro); + break; + case LOGICAL_OP_OR: + lv[op.id & ~LOGICAL_OP_BIT] = getValue(lv, op.lo) | + getValue(lv, op.ro); + break; + default: + assert(0); + break; + } + } + return lv[result & ~LOGICAL_OP_BIT]; +} + +void ParsedLogical::parseLogicalCombination(unsigned id, const char *logical, + u32 ekey, u64a min_offset, + u64a max_offset) { + u32 ckey = getCombKey(id); + vector op_stack; + vector subid_stack; + u32 lkey_start = INVALID_LKEY; // logical operation's lkey + u32 paren = 0; // parentheses + u32 digit = (u32)-1; // digit start offset, invalid offset is -1 + u32 subid = (u32)-1; + u32 i; + try { + for (i = 0; logical[i]; i++) { + if (isdigit(logical[i])) { + if (digit == (u32)-1) { // new digit start + digit = i; + } + } else { + if ((subid = fetchSubID(logical, digit, i)) != (u32)-1) { + subid_stack.push_back(getLogicalKey(subid)); + addRelateCKey(subid_stack.back(), ckey); + } + if (logical[i] == ' ') { // skip whitespace + continue; + } + if (logical[i] == '(') { + paren += 1; + } else if (logical[i] == ')') { + if (paren <= 0) { + throw LocatedParseError("Not enough left parentheses"); + } + paren -= 1; + } else { + u32 prio = toOperator(logical[i]); + if (prio != UNKNOWN_OP) { + LogicalOperator op(prio, paren); + while (!op_stack.empty() + && cmpOperator(op_stack.back(), op)) { + popOperator(op_stack, subid_stack, *this); + if (lkey_start == INVALID_LKEY) { + lkey_start = subid_stack.back(); + } + } + op_stack.push_back(op); + } else { + throw LocatedParseError("Unknown character"); + } + } + } + } + if (paren != 0) { + throw LocatedParseError("Not enough right parentheses"); + } + if ((subid = fetchSubID(logical, digit, i)) != (u32)-1) { + subid_stack.push_back(getLogicalKey(subid)); + addRelateCKey(subid_stack.back(), ckey); + } + while (!op_stack.empty()) { + popOperator(op_stack, subid_stack, *this); + if 
(lkey_start == INVALID_LKEY) { + lkey_start = subid_stack.back(); + } + } + if (subid_stack.size() != 1) { + throw LocatedParseError("Not enough operator"); + } + } catch (LocatedParseError &error) { + error.locate(i); + throw; + } + u32 lkey_result = subid_stack.back(); // logical operation's lkey + if (lkey_start == INVALID_LKEY) { + throw CompileError("No logical operation."); + } + if (hasMatchFromPurelyNegative(logicalTree, lkey_start, lkey_result)) { + throw CompileError("Has match from purely negative sub-expressions."); + } + combinationInfoAdd(ckey, id, ekey, lkey_start, lkey_result, + min_offset, max_offset); +} + +} // namespace ue2 diff --git a/src/parser/logical_combination.h b/src/parser/logical_combination.h new file mode 100644 index 000000000..7c8eb36ef --- /dev/null +++ b/src/parser/logical_combination.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Parse and build ParsedLogical::logicalTree and combInfoMap. + */ + +#ifndef LOGICAL_COMBINATION_H +#define LOGICAL_COMBINATION_H + +#include "util/logical.h" + +#include +#include +#include + +namespace ue2 { + +class ParsedLogical { + friend class ReportManager; +public: + /** \brief Parse 1 logical expression \a logical, assign temporary ckey. */ + void parseLogicalCombination(unsigned id, const char *logical, u32 ekey, + u64a min_offset, u64a max_offset); + + /** \brief Check if all sub-expression id in combinations are valid. */ + void validateSubIDs(const unsigned *ids, const char *const *expressions, + const unsigned *flags, unsigned elements); + + /** \brief Renumber and assign final lkey for each logical operation + * after parsed all logical expressions. */ + void logicalKeyRenumber(); + + /** \brief Fetch the lkey associated with the given expression id, + * assigning one if necessary. */ + u32 getLogicalKey(u32 expressionId); + + /** \brief Fetch the ckey associated with the given expression id, + * assigning one if necessary. */ + u32 getCombKey(u32 expressionId); + + /** \brief Add lkey's corresponding combination id. */ + void addRelateCKey(u32 lkey, u32 ckey); + + /** \brief Add one Logical Operation. */ + u32 logicalTreeAdd(u32 op, u32 left, u32 right); + + /** \brief Assign the combination info associated with the given ckey. 
*/ + void combinationInfoAdd(u32 ckey, u32 id, u32 ekey, u32 lkey_start, + u32 lkey_result, u64a min_offset, u64a max_offset); + + const std::map &getLkeyMap() const { + return toLogicalKeyMap; + } + + const std::vector &getLogicalTree() const { + return logicalTree; + } + + CombInfo getCombInfoById(u32 id) const { + u32 ckey = toCombKeyMap.at(id); + assert(ckey < combInfoMap.size()); + return combInfoMap.at(ckey); + } + +private: + /** \brief Mapping from ckey to combination info. */ + std::vector combInfoMap; + + /** \brief Mapping from combination expression id to combination key, + * combination key is used in combination bit-vector cache. */ + std::map toCombKeyMap; + + /** \brief Mapping from expression id to logical key, logical key is used + * as index in LogicalOp array. */ + std::map toLogicalKeyMap; + + /** \brief Mapping from logical key to related combination keys. */ + std::map> lkey2ckeys; + + /** \brief Logical constraints, each operation from postfix notation. */ + std::vector logicalTree; +}; + +} // namespace ue2 + +#endif diff --git a/src/parser/shortcut_literal.cpp b/src/parser/shortcut_literal.cpp index 82679c88a..7a7ab6eea 100644 --- a/src/parser/shortcut_literal.cpp +++ b/src/parser/shortcut_literal.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -199,7 +199,7 @@ bool shortcutLiteral(NG &ng, const ParsedExpression &pe) { DEBUG_PRINTF("constructed literal %s\n", dumpString(lit).c_str()); return ng.addLiteral(lit, expr.index, expr.report, expr.highlander, - expr.som); + expr.som, expr.quiet); } } // namespace ue2 diff --git a/src/parser/utf8_validate.cpp b/src/parser/utf8_validate.cpp index cedaf9445..50aa06d8e 100644 --- a/src/parser/utf8_validate.cpp +++ b/src/parser/utf8_validate.cpp @@ -60,12 +60,11 @@ bool 
isAllowedCodepoint(u32 val) { return true; } -bool isValidUtf8(const char *expression) { +bool isValidUtf8(const char *expression, const size_t len) { if (!expression) { return true; } - const size_t len = strlen(expression); const u8 *s = (const u8 *)expression; u32 val; diff --git a/src/parser/utf8_validate.h b/src/parser/utf8_validate.h index 26a2f22e8..6389a0859 100644 --- a/src/parser/utf8_validate.h +++ b/src/parser/utf8_validate.h @@ -29,10 +29,12 @@ #ifndef PARSER_UTF8_VALIDATE_H #define PARSER_UTF8_VALIDATE_H +#include // size_t + namespace ue2 { /** \brief Validate that the given expression is well-formed UTF-8. */ -bool isValidUtf8(const char *expression); +bool isValidUtf8(const char *expression, const size_t len); } // namespace ue2 diff --git a/src/report.h b/src/report.h index 4a5f401e7..a2e2d0f3d 100644 --- a/src/report.h +++ b/src/report.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,6 +42,7 @@ #include "rose/runtime.h" #include "som/som_runtime.h" #include "util/exhaust.h" +#include "util/logical.h" #include "util/fatbit.h" enum DedupeResult { @@ -151,6 +152,93 @@ void clearEvec(const struct RoseEngine *rose, char *evec) { mmbit_clear((u8 *)evec, rose->ekeyCount); } +/** \brief Test whether the given key (\a lkey) is set in the logical vector + * \a lvec. */ +static really_inline +char getLogicalVal(const struct RoseEngine *rose, const char *lvec, u32 lkey) { + DEBUG_PRINTF("checking lkey matching %p %u\n", lvec, lkey); + assert(lkey != INVALID_LKEY); + assert(lkey < rose->lkeyCount + rose->lopCount); + return mmbit_isset((const u8 *)lvec, rose->lkeyCount + rose->lopCount, + lkey); +} + +/** \brief Mark key \a lkey on in the logical vector. 
*/ +static really_inline +void setLogicalVal(const struct RoseEngine *rose, char *lvec, u32 lkey, + char val) { + DEBUG_PRINTF("marking as matched logical key %u\n", lkey); + assert(lkey != INVALID_LKEY); + assert(lkey < rose->lkeyCount + rose->lopCount); + switch (val) { + case 0: + mmbit_unset((u8 *)lvec, rose->lkeyCount + rose->lopCount, lkey); + break; + default: + mmbit_set((u8 *)lvec, rose->lkeyCount + rose->lopCount, lkey); + break; + } +} + +/** \brief Mark key \a ckey on in the combination vector. */ +static really_inline +void setCombinationActive(const struct RoseEngine *rose, char *cvec, u32 ckey) { + DEBUG_PRINTF("marking as active combination key %u\n", ckey); + assert(ckey != INVALID_CKEY); + assert(ckey < rose->ckeyCount); + mmbit_set((u8 *)cvec, rose->ckeyCount, ckey); +} + +/** \brief Returns 1 if compliant to all logical combinations. */ +static really_inline +char isLogicalCombination(const struct RoseEngine *rose, char *lvec, + u32 start, u32 result) { + const struct LogicalOp *logicalTree = (const struct LogicalOp *) + ((const char *)rose + rose->logicalTreeOffset); + assert(start >= rose->lkeyCount); + assert(start <= result); + assert(result < rose->lkeyCount + rose->lopCount); + for (u32 i = start; i <= result; i++) { + const struct LogicalOp *op = logicalTree + (i - rose->lkeyCount); + assert(i == op->id); + assert(op->op <= LAST_LOGICAL_OP); + switch ((enum LogicalOpType)op->op) { + case LOGICAL_OP_NOT: + setLogicalVal(rose, lvec, op->id, + !getLogicalVal(rose, lvec, op->ro)); + break; + case LOGICAL_OP_AND: + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) & + getLogicalVal(rose, lvec, op->ro)); // && + break; + case LOGICAL_OP_OR: + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) | + getLogicalVal(rose, lvec, op->ro)); // || + break; + } + } + return getLogicalVal(rose, lvec, result); +} + +/** \brief Clear all keys in the logical vector. 
*/ +static really_inline +void clearLvec(const struct RoseEngine *rose, char *lvec, char *cvec) { + DEBUG_PRINTF("clearing lvec %p %u\n", lvec, + rose->lkeyCount + rose->lopCount); + DEBUG_PRINTF("clearing cvec %p %u\n", cvec, rose->ckeyCount); + mmbit_clear((u8 *)lvec, rose->lkeyCount + rose->lopCount); + mmbit_clear((u8 *)cvec, rose->ckeyCount); +} + +/** \brief Clear all keys in the combination vector. */ +static really_inline +void clearCvec(const struct RoseEngine *rose, char *cvec) { + DEBUG_PRINTF("clearing cvec %p %u\n", cvec, rose->ckeyCount); + mmbit_clear((u8 *)cvec, rose->ckeyCount); +} + /** * \brief Deliver the given report to the user callback. * diff --git a/src/rose/block.c b/src/rose/block.c index 2c4932199..a32113f4b 100644 --- a/src/rose/block.c +++ b/src/rose/block.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -145,6 +145,7 @@ void init_for_block(const struct RoseEngine *t, struct hs_scratch *scratch, tctxt->lastEndOffset = 0; tctxt->filledDelayedSlots = 0; tctxt->lastMatchOffset = 0; + tctxt->lastCombMatchOffset = 0; tctxt->minMatchOffset = 0; tctxt->minNonMpvMatchOffset = 0; tctxt->next_mpv_offset = 0; diff --git a/src/rose/catchup.c b/src/rose/catchup.c index 9e36d0914..7a6648da9 100644 --- a/src/rose/catchup.c +++ b/src/rose/catchup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -424,6 +424,12 @@ hwlmcb_rv_t roseCatchUpMPV_i(const struct RoseEngine *t, s64a loc, } done: + if (t->flushCombProgramOffset) { + if (roseRunFlushCombProgram(t, scratch, mpv_exec_end) + == HWLM_TERMINATE_MATCHING) { + return 
HWLM_TERMINATE_MATCHING; + } + } updateMinMatchOffsetFromMpv(&scratch->tctxt, mpv_exec_end); scratch->tctxt.next_mpv_offset = MAX(next_pos_match_loc + scratch->core_info.buf_offset, diff --git a/src/rose/catchup.h b/src/rose/catchup.h index 24b843f5a..8188d5af0 100644 --- a/src/rose/catchup.h +++ b/src/rose/catchup.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,6 +51,7 @@ #include "hwlm/hwlm.h" #include "runtime.h" #include "scratch.h" +#include "rose.h" #include "rose_common.h" #include "rose_internal.h" #include "ue2common.h" @@ -105,6 +106,12 @@ hwlmcb_rv_t roseCatchUpMPV(const struct RoseEngine *t, s64a loc, assert(!can_stop_matching(scratch)); if (canSkipCatchUpMPV(t, scratch, cur_offset)) { + if (t->flushCombProgramOffset) { + if (roseRunFlushCombProgram(t, scratch, cur_offset) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } updateMinMatchOffsetFromMpv(&scratch->tctxt, cur_offset); return HWLM_CONTINUE_MATCHING; } @@ -139,6 +146,12 @@ hwlmcb_rv_t roseCatchUpTo(const struct RoseEngine *t, hwlmcb_rv_t rv; if (!t->activeArrayCount || !mmbit_any(getActiveLeafArray(t, state), t->activeArrayCount)) { + if (t->flushCombProgramOffset) { + if (roseRunFlushCombProgram(t, scratch, end) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } updateMinMatchOffset(&scratch->tctxt, end); rv = HWLM_CONTINUE_MATCHING; } else { diff --git a/src/rose/match.c b/src/rose/match.c index 5d1b6e07a..97e93c938 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ 
-571,6 +571,22 @@ int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, return MO_CONTINUE_MATCHING; } +/** + * \brief Execute a flush combination program. + * + * Returns MO_HALT_MATCHING if the stream is exhausted or the user has + * instructed us to halt, or MO_CONTINUE_MATCHING otherwise. + */ +int roseRunFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end) { + hwlmcb_rv_t rv = roseRunProgram(rose, scratch, rose->flushCombProgramOffset, + 0, end, 0); + if (rv == HWLM_TERMINATE_MATCHING) { + return MO_HALT_MATCHING; + } + return MO_CONTINUE_MATCHING; +} + int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context) { struct hs_scratch *scratch = context; assert(scratch && scratch->magic == SCRATCH_MAGIC); diff --git a/src/rose/match.h b/src/rose/match.h index 0d4fb19c1..c03b1ebba 100644 --- a/src/rose/match.h +++ b/src/rose/match.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -66,6 +66,7 @@ hwlmcb_rv_t roseHandleChainMatch(const struct RoseEngine *t, u64a top_squash_distance, u64a end, char in_catchup); +/** \brief Initialize the queue for a suffix/outfix engine. */ static really_inline void initQueue(struct mq *q, u32 qi, const struct RoseEngine *t, struct hs_scratch *scratch) { @@ -90,6 +91,7 @@ void initQueue(struct mq *q, u32 qi, const struct RoseEngine *t, info->stateOffset, *(u32 *)q->state); } +/** \brief Initialize the queue for a leftfix (prefix/infix) engine. 
*/ static really_inline void initRoseQueue(const struct RoseEngine *t, u32 qi, const struct LeftNfaInfo *left, diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index e6ce9bdbe..3c11300bd 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -501,8 +501,7 @@ hwlmcb_rv_t roseReport(const struct RoseEngine *t, struct hs_scratch *scratch, } /* catches up engines enough to ensure any earlier mpv triggers are enqueued - * and then adds the trigger to the mpv queue. Must not be called during catch - * up */ + * and then adds the trigger to the mpv queue. */ static rose_inline hwlmcb_rv_t roseCatchUpAndHandleChainMatch(const struct RoseEngine *t, struct hs_scratch *scratch, @@ -558,6 +557,22 @@ void roseHandleSomSom(struct hs_scratch *scratch, setSomFromSomAware(scratch, sr, start, end); } +static rose_inline +hwlmcb_rv_t roseSetExhaust(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 ekey) { + assert(scratch); + assert(scratch->magic == SCRATCH_MAGIC); + + struct core_info *ci = &scratch->core_info; + + assert(!can_stop_matching(scratch)); + assert(!isExhausted(ci->rose, ci->exhaustionVector, ekey)); + + markAsMatched(ci->rose, ci->exhaustionVector, ekey); + + return roseHaltIfExhausted(t, scratch); +} + static really_inline int reachHasBit(const u8 *reach, u8 c) { return !!(reach[c / 8U] & (u8)1U << (c % 8U)); @@ -1823,6 +1838,56 @@ void updateSeqPoint(struct RoseContext *tctxt, u64a offset, } } +static rose_inline +hwlmcb_rv_t flushActiveCombinations(const struct RoseEngine *t, + struct hs_scratch *scratch) { + u8 *cvec = (u8 *)scratch->core_info.combVector; + if (!mmbit_any(cvec, t->ckeyCount)) { + return HWLM_CONTINUE_MATCHING; + } + u64a end = 
scratch->tctxt.lastCombMatchOffset; + for (u32 i = mmbit_iterate(cvec, t->ckeyCount, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(cvec, t->ckeyCount, i)) { + const struct CombInfo *combInfoMap = (const struct CombInfo *) + ((const char *)t + t->combInfoMapOffset); + const struct CombInfo *ci = combInfoMap + i; + if ((ci->min_offset != 0) && (end < ci->min_offset)) { + DEBUG_PRINTF("halt: before min_offset=%llu\n", ci->min_offset); + continue; + } + if ((ci->max_offset != MAX_OFFSET) && (end > ci->max_offset)) { + DEBUG_PRINTF("halt: after max_offset=%llu\n", ci->max_offset); + continue; + } + + DEBUG_PRINTF("check ekey %u\n", ci->ekey); + if (ci->ekey != INVALID_EKEY) { + assert(ci->ekey < t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ci->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ci->ekey); + continue; + } + } + + DEBUG_PRINTF("check ckey %u\n", i); + char *lvec = scratch->core_info.logicalVector; + if (!isLogicalCombination(t, lvec, ci->start, ci->result)) { + DEBUG_PRINTF("Logical Combination Failed!\n"); + continue; + } + + DEBUG_PRINTF("Logical Combination Passed!\n"); + if (roseReport(t, scratch, end, ci->id, 0, + ci->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + clearCvec(t, (char *)cvec); + return HWLM_CONTINUE_MATCHING; +} + #define PROGRAM_CASE(name) \ case ROSE_INSTR_##name: { \ DEBUG_PRINTF("instruction: " #name " (pc=%u)\n", \ @@ -2588,6 +2653,47 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, } } PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_LOGICAL) { + DEBUG_PRINTF("set logical value of lkey %u, offset_adjust=%d\n", + ri->lkey, ri->offset_adjust); + assert(ri->lkey != INVALID_LKEY); + assert(ri->lkey < t->lkeyCount); + char *lvec = scratch->core_info.logicalVector; + setLogicalVal(t, lvec, ri->lkey, 1); + updateLastCombMatchOffset(tctxt, end + ri->offset_adjust); + } + PROGRAM_NEXT_INSTRUCTION + + 
PROGRAM_CASE(SET_COMBINATION) { + DEBUG_PRINTF("set ckey %u as active\n", ri->ckey); + assert(ri->ckey != INVALID_CKEY); + assert(ri->ckey < t->ckeyCount); + char *cvec = scratch->core_info.combVector; + setCombinationActive(t, cvec, ri->ckey); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(FLUSH_COMBINATION) { + assert(end >= tctxt->lastCombMatchOffset); + if (end > tctxt->lastCombMatchOffset) { + if (flushActiveCombinations(t, scratch) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseSetExhaust(t, scratch, ri->ekey) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION } } diff --git a/src/rose/rose.h b/src/rose/rose.h index b29519b6d..c2b682f6b 100644 --- a/src/rose/rose.h +++ b/src/rose/rose.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -53,4 +53,7 @@ int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context); int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, u64a stream_offset, struct hs_scratch *scratch); +int roseRunFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end); + #endif // ROSE_H diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index 71f1667dc..aa043fade 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -453,7 +453,7 @@ RoseVertex 
tryForAnchoredVertex(RoseBuildImpl *tbi, <= tbi->cc.grey.maxAnchoredRegion) { if (ep.maxBound || ep.minBound) { /* TODO: handle, however these cases are not generated currently by - ng_rose */ + ng_violet */ return RoseGraph::null_vertex(); } max_width = depth(ep.maxBound + iv_info.s.length()); @@ -567,7 +567,7 @@ void doRoseLiteralVertex(RoseBuildImpl *tbi, bool use_eod_table, assert(iv_info.type == RIV_LITERAL); assert(!parents.empty()); /* start vertices should not be here */ - // ng_rose should have ensured that mixed-sensitivity literals are no + // ng_violet should have ensured that mixed-sensitivity literals are no // longer than the benefits max width. assert(iv_info.s.length() <= MAX_MASK2_WIDTH || !mixed_sensitivity(iv_info.s)); @@ -1849,13 +1849,12 @@ bool RoseBuildImpl::addChainTail(const raw_puff &rp, u32 *queue_out, return true; /* failure is not yet an option */ } - static bool prepAcceptForAddAnchoredNFA(RoseBuildImpl &tbi, const NGHolder &w, - u32 max_adj, NFAVertex u, + NFAVertex u, const vector &vertexDepths, map &depthMap, - map > &reportMap, + map> &reportMap, map &allocated_reports, flat_set &added_lit_ids) { const depth max_anchored_depth(tbi.cc.grey.maxAnchoredRegion); @@ -1883,9 +1882,9 @@ bool prepAcceptForAddAnchoredNFA(RoseBuildImpl &tbi, const NGHolder &w, depthMap[lit_id] = unionDepthMinMax(depthMap[lit_id], d); } - if (depthMap[lit_id].max + depth(max_adj) > max_anchored_depth) { + if (depthMap[lit_id].max > max_anchored_depth) { DEBUG_PRINTF("depth=%s exceeds maxAnchoredRegion=%u\n", - (depthMap[lit_id].max + depth(max_adj)).str().c_str(), + depthMap[lit_id].max.str().c_str(), tbi.cc.grey.maxAnchoredRegion); return false; } @@ -1932,7 +1931,7 @@ bool RoseBuildImpl::addAnchoredAcyclic(const NGHolder &h) { flat_set added_lit_ids; /* literal ids added for this NFA */ for (auto v : inv_adjacent_vertices_range(h.accept, h)) { - if (!prepAcceptForAddAnchoredNFA(*this, h, 0, v, vertexDepths, depthMap, + if 
(!prepAcceptForAddAnchoredNFA(*this, h, v, vertexDepths, depthMap, reportMap, allocated_reports, added_lit_ids)) { removeAddedLiterals(*this, added_lit_ids); @@ -1946,7 +1945,7 @@ bool RoseBuildImpl::addAnchoredAcyclic(const NGHolder &h) { if (v == h.accept) { continue; } - if (!prepAcceptForAddAnchoredNFA(*this, h, 0, v, vertexDepths, depthMap, + if (!prepAcceptForAddAnchoredNFA(*this, h, v, vertexDepths, depthMap, reportMap, allocated_reports_eod, added_lit_ids)) { removeAddedLiterals(*this, added_lit_ids); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 9a546ae4e..2c0a9b286 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -426,6 +426,17 @@ void fillStateOffsets(const RoseBuildImpl &build, u32 rolesWithStateCount, curr_offset += mmbit_size(build.rm.numEkeys()); so->exhausted_size = mmbit_size(build.rm.numEkeys()); + // Logical multibit. + so->logicalVec = curr_offset; + so->logicalVec_size = mmbit_size(build.rm.numLogicalKeys() + + build.rm.numLogicalOps()); + curr_offset += so->logicalVec_size; + + // Combination multibit. + so->combVec = curr_offset; + so->combVec_size = mmbit_size(build.rm.numCkeys()); + curr_offset += so->combVec_size; + // SOM locations and valid/writeable multibit structures. 
if (build.ssm.numSomSlots()) { const u32 somWidth = build.ssm.somPrecision(); @@ -2469,6 +2480,18 @@ void writeLeftInfo(RoseEngineBlob &engine_blob, RoseEngine &proto, proto.rosePrefixCount = countRosePrefixes(leftInfoTable); } +static +void writeLogicalInfo(const ReportManager &rm, RoseEngineBlob &engine_blob, + RoseEngine &proto) { + const auto &tree = rm.getLogicalTree(); + proto.logicalTreeOffset = engine_blob.add_range(tree); + const auto &combMap = rm.getCombInfoMap(); + proto.combInfoMapOffset = engine_blob.add_range(combMap); + proto.lkeyCount = rm.numLogicalKeys(); + proto.lopCount = rm.numLogicalOps(); + proto.ckeyCount = rm.numCkeys(); +} + static void writeNfaInfo(const RoseBuildImpl &build, build_context &bc, RoseEngine &proto, const set &no_retrigger_queues) { @@ -3313,6 +3336,15 @@ RoseProgram makeEodProgram(const RoseBuildImpl &build, build_context &bc, return program; } +static +RoseProgram makeFlushCombProgram(const RoseEngine &t) { + RoseProgram program; + if (t.ckeyCount) { + addFlushCombinationProgram(program); + } + return program; +} + static u32 history_required(const rose_literal_id &key) { if (key.msk.size() < key.s.length()) { @@ -3678,6 +3710,10 @@ bytecode_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { writeDkeyInfo(rm, bc.engine_blob, proto); writeLeftInfo(bc.engine_blob, proto, leftInfoTable); + writeLogicalInfo(rm, bc.engine_blob, proto); + + auto flushComb_prog = makeFlushCombProgram(proto); + proto.flushCombProgramOffset = writeProgram(bc, move(flushComb_prog)); // Build anchored matcher. 
auto atable = buildAnchoredMatcher(*this, fragments, anchored_dfas); diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index b70112f2d..0cc5b5c31 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1469,6 +1469,25 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(SET_LOGICAL) { + os << " lkey " << ri->lkey << endl; + os << " offset_adjust " << ri->offset_adjust << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_COMBINATION) { + os << " ckey " << ri->ckey << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(FLUSH_COMBINATION) {} + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_EXHAUST) { + os << " ekey " << ri->ekey << endl; + } + PROGRAM_NEXT_INSTRUCTION + default: os << " UNKNOWN (code " << int{code} << ")" << endl; os << " " << endl; @@ -1523,6 +1542,23 @@ void dumpRoseEodPrograms(const RoseEngine *t, const string &filename) { os.close(); } +static +void dumpRoseFlushCombPrograms(const RoseEngine *t, const string &filename) { + ofstream os(filename); + const char *base = (const char *)t; + + if (t->flushCombProgramOffset) { + os << "Flush Combination Program @ " << t->flushCombProgramOffset + << ":" << endl; + dumpProgram(os, t, base + t->flushCombProgramOffset); + os << endl; + } else { + os << "" << endl; + } + + os.close(); +} + static void dumpRoseReportPrograms(const RoseEngine *t, const string &filename) { ofstream os(filename); @@ -2028,6 +2064,10 @@ void roseDumpText(const RoseEngine *t, FILE *f) { fprintf(f, " - history buffer : %u bytes\n", t->historyRequired); fprintf(f, " - exhaustion vector : %u bytes\n", t->stateOffsets.exhausted_size); + fprintf(f, " - logical 
vector : %u bytes\n", + t->stateOffsets.logicalVec_size); + fprintf(f, " - combination vector: %u bytes\n", + t->stateOffsets.combVec_size); fprintf(f, " - role state mmbit : %u bytes\n", t->stateSize); fprintf(f, " - long lit matcher : %u bytes\n", t->longLitStreamState); fprintf(f, " - active array : %u bytes\n", @@ -2092,6 +2132,11 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, mode); DUMP_U32(t, historyRequired); DUMP_U32(t, ekeyCount); + DUMP_U32(t, lkeyCount); + DUMP_U32(t, lopCount); + DUMP_U32(t, ckeyCount); + DUMP_U32(t, logicalTreeOffset); + DUMP_U32(t, combInfoMapOffset); DUMP_U32(t, dkeyCount); DUMP_U32(t, dkeyLogSize); DUMP_U32(t, invDkeyOffset); @@ -2127,6 +2172,7 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, leftOffset); DUMP_U32(t, roseCount); DUMP_U32(t, eodProgramOffset); + DUMP_U32(t, flushCombProgramOffset); DUMP_U32(t, lastByteHistoryIterOffset); DUMP_U32(t, minWidth); DUMP_U32(t, minWidthExcludingBoundaries); @@ -2150,6 +2196,10 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, stateOffsets.history); DUMP_U32(t, stateOffsets.exhausted); DUMP_U32(t, stateOffsets.exhausted_size); + DUMP_U32(t, stateOffsets.logicalVec); + DUMP_U32(t, stateOffsets.logicalVec_size); + DUMP_U32(t, stateOffsets.combVec); + DUMP_U32(t, stateOffsets.combVec_size); DUMP_U32(t, stateOffsets.activeLeafArray); DUMP_U32(t, stateOffsets.activeLeafArray_size); DUMP_U32(t, stateOffsets.activeLeftArray); @@ -2200,6 +2250,7 @@ void roseDumpPrograms(const vector &fragments, const RoseEngine *t, const string &base) { dumpRoseLitPrograms(fragments, t, base + "/rose_lit_programs.txt"); dumpRoseEodPrograms(t, base + "/rose_eod_programs.txt"); + dumpRoseFlushCombPrograms(t, base + "/rose_flush_comb_programs.txt"); dumpRoseReportPrograms(t, base + "/rose_report_programs.txt"); dumpRoseAnchoredPrograms(t, base + "/rose_anchored_programs.txt"); dumpRoseDelayPrograms(t, base + "/rose_delay_programs.txt"); diff --git 
a/src/rose/rose_build_instructions.cpp b/src/rose/rose_build_instructions.cpp index 8af082984..2fe534559 100644 --- a/src/rose/rose_build_instructions.cpp +++ b/src/rose/rose_build_instructions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,6 +47,7 @@ RoseInstrSuffixesEod::~RoseInstrSuffixesEod() = default; RoseInstrMatcherEod::~RoseInstrMatcherEod() = default; RoseInstrEnd::~RoseInstrEnd() = default; RoseInstrClearWorkDone::~RoseInstrClearWorkDone() = default; +RoseInstrFlushCombination::~RoseInstrFlushCombination() = default; using OffsetMap = RoseInstruction::OffsetMap; @@ -644,4 +645,26 @@ void RoseInstrIncludedJump::write(void *dest, RoseEngineBlob &blob, inst->squash = squash; } +void RoseInstrSetLogical::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->lkey = lkey; + inst->offset_adjust = offset_adjust; +} + +void RoseInstrSetCombination::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->ckey = ckey; +} + +void RoseInstrSetExhaust::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->ekey = ekey; +} + } diff --git a/src/rose/rose_build_instructions.h b/src/rose/rose_build_instructions.h index d3ede29b6..61e6d7a65 100644 --- a/src/rose/rose_build_instructions.h +++ b/src/rose/rose_build_instructions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, 
are permitted provided that the following conditions are met: @@ -2144,6 +2144,94 @@ class RoseInstrIncludedJump } }; +class RoseInstrSetLogical + : public RoseInstrBaseNoTargets { +public: + u32 lkey; + s32 offset_adjust; + + RoseInstrSetLogical(u32 lkey_in, s32 offset_adjust_in) + : lkey(lkey_in), offset_adjust(offset_adjust_in) {} + + bool operator==(const RoseInstrSetLogical &ri) const { + return lkey == ri.lkey && offset_adjust == ri.offset_adjust; + } + + size_t hash() const override { + return hash_all(opcode, lkey, offset_adjust); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrSetLogical &ri, const OffsetMap &, + const OffsetMap &) const { + return lkey == ri.lkey && offset_adjust == ri.offset_adjust; + } +}; + +class RoseInstrSetCombination + : public RoseInstrBaseNoTargets { +public: + u32 ckey; + + RoseInstrSetCombination(u32 ckey_in) : ckey(ckey_in) {} + + bool operator==(const RoseInstrSetCombination &ri) const { + return ckey == ri.ckey; + } + + size_t hash() const override { + return hash_all(opcode, ckey); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrSetCombination &ri, const OffsetMap &, + const OffsetMap &) const { + return ckey == ri.ckey; + } +}; + +class RoseInstrFlushCombination + : public RoseInstrBaseTrivial { +public: + ~RoseInstrFlushCombination() override; +}; + +class RoseInstrSetExhaust + : public RoseInstrBaseNoTargets { +public: + u32 ekey; + + RoseInstrSetExhaust(u32 ekey_in) : ekey(ekey_in) {} + + bool operator==(const RoseInstrSetExhaust &ri) const { + return ekey == ri.ekey; + } + + size_t hash() const override { + return hash_all(opcode, ekey); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrSetExhaust &ri, const OffsetMap &, + const OffsetMap &) const { + return ekey == 
ri.ekey; + } +}; + class RoseInstrEnd : public RoseInstrBaseTrivial { diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp index c0eba22b0..5066dbd57 100644 --- a/src/rose/rose_build_merge.cpp +++ b/src/rose/rose_build_merge.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -459,7 +459,7 @@ class Bouquet { const_iterator end() const { return ordering.end(); } }; -typedef Bouquet RoseBouquet; +typedef Bouquet LeftfixBouquet; typedef Bouquet SuffixBouquet; } // namespace @@ -565,7 +565,7 @@ bool hasSameEngineType(const RoseVertexProps &u_prop, * * Parameters are vectors of literals + lag pairs. * - * Note: if more constaints of when the leftfixes were going to be checked + * Note: if more constraints of when the leftfixes were going to be checked * (mandatory lookarounds passing, offset checks), more merges may be allowed. */ static @@ -599,7 +599,7 @@ bool compatibleLiteralsForMerge( /* An engine requires that all accesses to it are ordered by offsets. (ie, we can not check an engine's state at offset Y, if we have already checked its status at offset X and X > Y). If we can not establish that - the literals used for triggering will statisfy this property, then it is + the literals used for triggering will satisfy this property, then it is not safe to merge the engine. */ for (const auto &ue : ulits) { const rose_literal_id &ul = *ue.first; @@ -1437,7 +1437,19 @@ void mergeLeftfixesVariableLag(RoseBuildImpl &build) { assert(!parents.empty()); +#ifndef _WIN32 engine_groups[MergeKey(left, parents)].push_back(left); +#else + // On windows, when passing MergeKey object into map 'engine_groups', + // it will not be copied, but will be freed along with + // engine_groups.clear(). 
+ // If we construct MergeKey object on the stack, it will be destructed + // on its life cycle ending, then on engine_groups.clear(), which + // will cause is_block_type_valid() assertion error in MergeKey + // destructor. + MergeKey *mk = new MergeKey(left, parents); + engine_groups[*mk].push_back(left); +#endif } vector> chunks; @@ -1778,7 +1790,7 @@ u32 estimatedAccelStates(const RoseBuildImpl &tbi, const NGHolder &h) { } static -void mergeNfaLeftfixes(RoseBuildImpl &tbi, RoseBouquet &roses) { +void mergeNfaLeftfixes(RoseBuildImpl &tbi, LeftfixBouquet &roses) { RoseGraph &g = tbi.g; DEBUG_PRINTF("%zu nfa rose merge candidates\n", roses.size()); @@ -1894,7 +1906,7 @@ void mergeSmallLeftfixes(RoseBuildImpl &tbi) { RoseGraph &g = tbi.g; - RoseBouquet nfa_roses; + LeftfixBouquet nfa_leftfixes; for (auto v : vertices_range(g)) { if (!g[v].left) { @@ -1939,20 +1951,20 @@ void mergeSmallLeftfixes(RoseBuildImpl &tbi) { continue; } - nfa_roses.insert(left, v); + nfa_leftfixes.insert(left, v); } - deque rose_groups; - chunkBouquets(nfa_roses, rose_groups, MERGE_GROUP_SIZE_MAX); - nfa_roses.clear(); - DEBUG_PRINTF("chunked nfa roses into %zu groups\n", rose_groups.size()); + deque leftfix_groups; + chunkBouquets(nfa_leftfixes, leftfix_groups, MERGE_GROUP_SIZE_MAX); + nfa_leftfixes.clear(); + DEBUG_PRINTF("chunked nfa leftfixes into %zu groups\n", + leftfix_groups.size()); - for (auto &group : rose_groups) { + for (auto &group : leftfix_groups) { mergeNfaLeftfixes(tbi, group); } } - static void mergeCastleChunk(RoseBuildImpl &build, vector &cands, insertion_ordered_map> &eng_verts) { diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index a7332df70..0b0e689c9 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted 
provided that the following conditions are met: @@ -993,15 +993,19 @@ bool canImplementGraphs(const RoseBuildImpl &tbi) { return true; } +/** + * \brief True if there is an engine with a top that is not triggered by a + * vertex in the Rose graph. This is a consistency check used in assertions. + */ bool hasOrphanedTops(const RoseBuildImpl &build) { const RoseGraph &g = build.g; - unordered_map> roses; + unordered_map> leftfixes; unordered_map> suffixes; for (auto v : vertices_range(g)) { if (g[v].left) { - set &tops = roses[g[v].left]; + set &tops = leftfixes[g[v].left]; if (!build.isRootSuccessor(v)) { // Tops for infixes come from the in-edges. for (const auto &e : in_edges_range(v, g)) { @@ -1014,7 +1018,7 @@ bool hasOrphanedTops(const RoseBuildImpl &build) { } } - for (const auto &e : roses) { + for (const auto &e : leftfixes) { if (all_tops(e.first) != e.second) { DEBUG_PRINTF("rose tops (%s) don't match rose graph (%s)\n", as_string_list(all_tops(e.first)).c_str(), diff --git a/src/rose/rose_build_program.cpp b/src/rose/rose_build_program.cpp index 8f350e295..2a6581e97 100644 --- a/src/rose/rose_build_program.cpp +++ b/src/rose/rose_build_program.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -280,7 +280,7 @@ void stripCheckHandledInstruction(RoseProgram &prog) { } -/** Returns true if the program may read the the interpreter's work_done flag */ +/** Returns true if the program may read the interpreter's work_done flag */ static bool reads_work_done_flag(const RoseProgram &prog) { for (const auto &ri : prog) { @@ -313,6 +313,10 @@ void addMatcherEodProgram(RoseProgram &program) { program.add_block(move(block)); } +void addFlushCombinationProgram(RoseProgram &program) { + program.add_before_end(make_unique()); +} + static void 
makeRoleCheckLeftfix(const RoseBuildImpl &build, const map &leftfix_info, @@ -496,6 +500,23 @@ void writeSomOperation(const Report &report, som_operation *op) { } } +static +void addLogicalSetRequired(const Report &report, ReportManager &rm, + RoseProgram &program) { + if (report.lkey == INVALID_LKEY) { + return; + } + // set matching status of current lkey + auto risl = make_unique(report.lkey, + report.offsetAdjust); + program.add_before_end(move(risl)); + // set current lkey's corresponding ckeys active, pending to check + for (auto ckey : rm.getRelateCKeys(report.lkey)) { + auto risc = make_unique(ckey); + program.add_before_end(move(risc)); + } +} + static void makeReport(const RoseBuildImpl &build, const ReportID id, const bool has_som, RoseProgram &program) { @@ -542,38 +563,62 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, switch (report.type) { case EXTERNAL_CALLBACK: + if (build.rm.numCkeys()) { + addFlushCombinationProgram(report_block); + } if (!has_som) { // Dedupe is only necessary if this report has a dkey, or if there // are SOM reports to catch up. 
bool needs_dedupe = build.rm.getDkey(report) != ~0U || build.hasSom; if (report.ekey == INVALID_EKEY) { if (needs_dedupe) { - report_block.add_before_end( - make_unique( - report.quashSom, build.rm.getDkey(report), - report.onmatch, report.offsetAdjust, end_inst)); + if (!report.quiet) { + report_block.add_before_end( + make_unique( + report.quashSom, build.rm.getDkey(report), + report.onmatch, report.offsetAdjust, end_inst)); + } else { + makeDedupe(build.rm, report, report_block); + } } else { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust)); + if (!report.quiet) { + report_block.add_before_end( + make_unique( + report.onmatch, report.offsetAdjust)); + } } } else { if (needs_dedupe) { makeDedupe(build.rm, report, report_block); } - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust, report.ekey)); + if (!report.quiet) { + report_block.add_before_end( + make_unique( + report.onmatch, report.offsetAdjust, report.ekey)); + } else { + report_block.add_before_end( + make_unique(report.ekey)); + } } } else { // has_som makeDedupeSom(build.rm, report, report_block); if (report.ekey == INVALID_EKEY) { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust)); + if (!report.quiet) { + report_block.add_before_end(make_unique( + report.onmatch, report.offsetAdjust)); + } } else { - report_block.add_before_end( - make_unique( - report.onmatch, report.offsetAdjust, report.ekey)); + if (!report.quiet) { + report_block.add_before_end( + make_unique( + report.onmatch, report.offsetAdjust, report.ekey)); + } else { + report_block.add_before_end( + make_unique(report.ekey)); + } } } + addLogicalSetRequired(report, build.rm, report_block); break; case INTERNAL_SOM_LOC_SET: case INTERNAL_SOM_LOC_SET_IF_UNSET: @@ -586,6 +631,9 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, case INTERNAL_SOM_LOC_MAKE_WRITABLE: case INTERNAL_SOM_LOC_SET_FROM: case 
INTERNAL_SOM_LOC_SET_FROM_IF_WRITABLE: + if (build.rm.numCkeys()) { + addFlushCombinationProgram(report_block); + } if (has_som) { auto ri = make_unique(); writeSomOperation(report, &ri->som); @@ -605,24 +653,48 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, case EXTERNAL_CALLBACK_SOM_STORED: case EXTERNAL_CALLBACK_SOM_ABS: case EXTERNAL_CALLBACK_SOM_REV_NFA: + if (build.rm.numCkeys()) { + addFlushCombinationProgram(report_block); + } makeDedupeSom(build.rm, report, report_block); if (report.ekey == INVALID_EKEY) { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust)); + if (!report.quiet) { + report_block.add_before_end(make_unique( + report.onmatch, report.offsetAdjust)); + } } else { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust, report.ekey)); + if (!report.quiet) { + report_block.add_before_end( + make_unique( + report.onmatch, report.offsetAdjust, report.ekey)); + } else { + report_block.add_before_end( + make_unique(report.ekey)); + } } + addLogicalSetRequired(report, build.rm, report_block); break; case EXTERNAL_CALLBACK_SOM_PASS: + if (build.rm.numCkeys()) { + addFlushCombinationProgram(report_block); + } makeDedupeSom(build.rm, report, report_block); if (report.ekey == INVALID_EKEY) { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust)); + if (!report.quiet) { + report_block.add_before_end(make_unique( + report.onmatch, report.offsetAdjust)); + } } else { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust, report.ekey)); + if (!report.quiet) { + report_block.add_before_end( + make_unique( + report.onmatch, report.offsetAdjust, report.ekey)); + } else { + report_block.add_before_end( + make_unique(report.ekey)); + } } + addLogicalSetRequired(report, build.rm, report_block); break; default: @@ -630,7 +702,6 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, throw CompileError("Unable to generate 
bytecode."); } - assert(!report_block.empty()); program.add_block(move(report_block)); } @@ -1837,7 +1908,7 @@ void makeRoleEagerEodReports(const RoseBuildImpl &build, program.add_before_end(move(eod_program)); } -/* Makes a program for a role/vertex given a specfic pred/in_edge. */ +/** Makes a program for a role/vertex given a specific pred/in_edge. */ static RoseProgram makeRoleProgram(const RoseBuildImpl &build, const map &leftfix_info, @@ -2045,7 +2116,7 @@ RoseProgram makeLiteralProgram(const RoseBuildImpl &build, } if (lit_id == build.eod_event_literal_id) { - /* Note: does not require the lit intial program */ + /* Note: does not require the lit initial program */ assert(build.eod_event_literal_id != MO_INVALID_IDX); return role_programs; } diff --git a/src/rose/rose_build_program.h b/src/rose/rose_build_program.h index cc59303f2..8c8c37ed9 100644 --- a/src/rose/rose_build_program.h +++ b/src/rose/rose_build_program.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -187,6 +187,7 @@ struct ProgramBuild : noncopyable { void addEnginesEodProgram(u32 eodNfaIterOffset, RoseProgram &program); void addSuffixesEodProgram(RoseProgram &program); void addMatcherEodProgram(RoseProgram &program); +void addFlushCombinationProgram(RoseProgram &program); static constexpr u32 INVALID_QUEUE = ~0U; diff --git a/src/rose/rose_graph.h b/src/rose/rose_graph.h index 2c5ebbe99..499d796ac 100644 --- a/src/rose/rose_graph.h +++ b/src/rose/rose_graph.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,7 +41,6 @@ #include "rose_build.h" #include 
"rose_internal.h" #include "nfa/nfa_internal.h" // for MO_INVALID_IDX -#include "util/charreach.h" #include "util/depth.h" #include "util/flat_containers.h" #include "util/ue2_graph.h" diff --git a/src/rose/rose_in_graph.h b/src/rose/rose_in_graph.h index ed4644ae4..da0ea08da 100644 --- a/src/rose/rose_in_graph.h +++ b/src/rose/rose_in_graph.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,7 +27,7 @@ */ /** \file - * \brief Rose Input Graph: Used for ng_rose -> rose_build_add communication. + * \brief Rose Input Graph: Used for ng_violet -> rose_build_add communication. * * The input graph MUST be a DAG. * There MUST be exactly 1 START or ANCHORED_START vertex. @@ -127,7 +127,7 @@ struct RoseInVertexProps { flat_set reports; /**< for RIV_ACCEPT/RIV_ACCEPT_EOD */ u32 min_offset; /**< Minimum offset at which this vertex can match. */ u32 max_offset; /**< Maximum offset at which this vertex can match. */ - size_t index = 0; + size_t index = 0; /**< \brief Unique vertex index. */ }; struct RoseInEdgeProps { @@ -176,7 +176,13 @@ struct RoseInEdgeProps { /** \brief Haig version of graph, if required. */ std::shared_ptr haig; + /** + * \brief Distance behind the match offset for the literal in the target + * vertex that the leftfix needs to be checked at. + */ u32 graph_lag; + + /** \brief Unique edge index. 
*/ size_t index = 0; }; diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index d38ee8c09..386b035ca 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -199,9 +199,25 @@ struct RoseStateOffsets { * reports with that ekey should not be delivered to the user. */ u32 exhausted; - /** size of exhausted multibit */ + /** size in bytes of exhausted multibit */ u32 exhausted_size; + /** Logical multibit. + * + * entry per logical key(operand/operator) (used by Logical Combination). */ + u32 logicalVec; + + /** size in bytes of logical multibit */ + u32 logicalVec_size; + + /** Combination multibit. + * + * entry per combination key (used by Logical Combination). */ + u32 combVec; + + /** size in bytes of combination multibit */ + u32 combVec_size; + /** Multibit for active suffix/outfix engines. */ u32 activeLeafArray; @@ -327,6 +343,11 @@ struct RoseEngine { u32 mode; /**< scanning mode, one of HS_MODE_{BLOCK,STREAM,VECTORED} */ u32 historyRequired; /**< max amount of history required for streaming */ u32 ekeyCount; /**< number of exhaustion keys */ + u32 lkeyCount; /**< number of logical keys */ + u32 lopCount; /**< number of logical ops */ + u32 ckeyCount; /**< number of combination keys */ + u32 logicalTreeOffset; /**< offset to mapping from lkey to LogicalOp */ + u32 combInfoMapOffset; /**< offset to mapping from ckey to combInfo */ u32 dkeyCount; /**< number of dedupe keys */ u32 dkeyLogSize; /**< size of fatbit for storing dkey log (bytes) */ u32 invDkeyOffset; /**< offset to table mapping from dkeys to the external @@ -404,6 +425,7 @@ struct RoseEngine { u32 roseCount; u32 eodProgramOffset; //!< EOD program, otherwise 0. 
+ u32 flushCombProgramOffset; /**< FlushCombination program, otherwise 0 */ u32 lastByteHistoryIterOffset; // if non-zero diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index eeebfed1c..7feee04fe 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -183,7 +183,25 @@ enum RoseInstructionCode { */ ROSE_INSTR_INCLUDED_JUMP, - LAST_ROSE_INSTRUCTION = ROSE_INSTR_INCLUDED_JUMP //!< Sentinel. + /** + * \brief Set matching status of a sub-expression. + */ + ROSE_INSTR_SET_LOGICAL, + + /** + * \brief Set combination status pending checking. + */ + ROSE_INSTR_SET_COMBINATION, + + /** + * \brief Check if compliant with any logical constraints. + */ + ROSE_INSTR_FLUSH_COMBINATION, + + /** \brief Mark as exhausted instead of report while quiet. */ + ROSE_INSTR_SET_EXHAUST, + + LAST_ROSE_INSTRUCTION = ROSE_INSTR_SET_EXHAUST //!< Sentinel. }; struct ROSE_STRUCT_END { @@ -636,4 +654,24 @@ struct ROSE_STRUCT_INCLUDED_JUMP { u8 squash; //!< FDR confirm squash mask for included literal. u32 child_offset; //!< Program offset of included literal. }; + +struct ROSE_STRUCT_SET_LOGICAL { + u8 code; //!< From enum RoseInstructionCode. + u32 lkey; //!< Logical key to set. + s32 offset_adjust; //!< offsetAdjust from struct Report triggers the flush. +}; + +struct ROSE_STRUCT_SET_COMBINATION { + u8 code; //!< From enum RoseInstructionCode. + u32 ckey; //!< Combination key to set. +}; + +struct ROSE_STRUCT_FLUSH_COMBINATION { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_SET_EXHAUST { + u8 code; //!< From enum RoseInstructionCode. + u32 ekey; //!< Exhaustion key. 
+}; #endif // ROSE_ROSE_PROGRAM_H diff --git a/src/rose/runtime.h b/src/rose/runtime.h index 88342b531..5fbb2b741 100644 --- a/src/rose/runtime.h +++ b/src/rose/runtime.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -127,6 +127,15 @@ void updateLastMatchOffset(struct RoseContext *tctxt, u64a offset) { tctxt->lastMatchOffset = offset; } +static really_inline +void updateLastCombMatchOffset(struct RoseContext *tctxt, u64a offset) { + DEBUG_PRINTF("match @%llu, last match @%llu\n", offset, + tctxt->lastCombMatchOffset); + + assert(offset >= tctxt->lastCombMatchOffset); + tctxt->lastCombMatchOffset = offset; +} + static really_inline void updateMinMatchOffset(struct RoseContext *tctxt, u64a offset) { DEBUG_PRINTF("min match now @%llu, was @%llu\n", offset, diff --git a/src/rose/stream.c b/src/rose/stream.c index d667ae56f..26268dd57 100644 --- a/src/rose/stream.c +++ b/src/rose/stream.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -578,6 +578,7 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { tctxt->lastEndOffset = offset; tctxt->filledDelayedSlots = 0; tctxt->lastMatchOffset = 0; + tctxt->lastCombMatchOffset = offset; tctxt->minMatchOffset = offset; tctxt->minNonMpvMatchOffset = offset; tctxt->next_mpv_offset = 0; @@ -700,6 +701,7 @@ void roseStreamInitEod(const struct RoseEngine *t, u64a offset, tctxt->lastEndOffset = offset; tctxt->filledDelayedSlots = 0; tctxt->lastMatchOffset = 0; + tctxt->lastCombMatchOffset = offset; /* DO NOT set 0 here! 
*/ tctxt->minMatchOffset = offset; tctxt->minNonMpvMatchOffset = offset; tctxt->next_mpv_offset = offset; diff --git a/src/runtime.c b/src/runtime.c index c384c0318..052449f61 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -67,9 +67,9 @@ void prefetch_data(const char *data, unsigned length) { /** dummy event handler for use when user does not provide one */ static -int null_onEvent(UNUSED unsigned id, UNUSED unsigned long long from, - UNUSED unsigned long long to, UNUSED unsigned flags, - UNUSED void *ctxt) { +int HS_CDECL null_onEvent(UNUSED unsigned id, UNUSED unsigned long long from, + UNUSED unsigned long long to, UNUSED unsigned flags, + UNUSED void *ctxt) { return 0; } @@ -356,6 +356,15 @@ hs_error_t HS_CDECL hs_scan(const hs_database_t *db, const char *data, length, NULL, 0, 0, 0, flags); clearEvec(rose, scratch->core_info.exhaustionVector); + if (rose->ckeyCount) { + scratch->core_info.logicalVector = scratch->bstate + + rose->stateOffsets.logicalVec; + scratch->core_info.combVector = scratch->bstate + + rose->stateOffsets.combVec; + scratch->tctxt.lastCombMatchOffset = 0; + clearLvec(rose, scratch->core_info.logicalVector, + scratch->core_info.combVector); + } if (!length) { if (rose->boundary.reportZeroEodOffset) { @@ -436,6 +445,13 @@ hs_error_t HS_CDECL hs_scan(const hs_database_t *db, const char *data, scratch); } + if (rose->flushCombProgramOffset) { + if (roseRunFlushCombProgram(rose, scratch, ~0ULL) == MO_HALT_MATCHING) { + unmarkScratchInUse(scratch); + return HS_SCAN_TERMINATED; + } + } + set_retval: DEBUG_PRINTF("done. 
told_to_stop_matching=%d\n", told_to_stop_matching(scratch)); @@ -500,6 +516,10 @@ void init_stream(struct hs_stream *s, const struct RoseEngine *rose, roseInitState(rose, state); clearEvec(rose, state + rose->stateOffsets.exhausted); + if (rose->ckeyCount) { + clearLvec(rose, state + rose->stateOffsets.logicalVec, + state + rose->stateOffsets.combVec); + } // SOM state multibit structures. initSomState(rose, state); @@ -614,6 +634,13 @@ void report_eod_matches(hs_stream_t *id, hs_scratch_t *scratch, getHistory(state, rose, id->offset), getHistoryAmount(rose, id->offset), id->offset, status, 0); + if (rose->ckeyCount) { + scratch->core_info.logicalVector = state + + rose->stateOffsets.logicalVec; + scratch->core_info.combVector = state + rose->stateOffsets.combVec; + scratch->tctxt.lastCombMatchOffset = id->offset; + } + if (rose->somLocationCount) { loadSomFromStream(scratch, id->offset); } @@ -657,6 +684,13 @@ void report_eod_matches(hs_stream_t *id, hs_scratch_t *scratch, scratch->core_info.status |= STATUS_TERMINATED; } } + + if (rose->flushCombProgramOffset && !told_to_stop_matching(scratch)) { + if (roseRunFlushCombProgram(rose, scratch, ~0ULL) == MO_HALT_MATCHING) { + DEBUG_PRINTF("told to stop matching\n"); + scratch->core_info.status |= STATUS_TERMINATED; + } + } } HS_PUBLIC_API @@ -849,6 +883,12 @@ hs_error_t hs_scan_stream_internal(hs_stream_t *id, const char *data, populateCoreInfo(scratch, rose, state, onEvent, context, data, length, getHistory(state, rose, id->offset), historyAmount, id->offset, status, flags); + if (rose->ckeyCount) { + scratch->core_info.logicalVector = state + + rose->stateOffsets.logicalVec; + scratch->core_info.combVector = state + rose->stateOffsets.combVec; + scratch->tctxt.lastCombMatchOffset = id->offset; + } assert(scratch->core_info.hlen <= id->offset && scratch->core_info.hlen <= rose->historyRequired); @@ -894,6 +934,12 @@ hs_error_t hs_scan_stream_internal(hs_stream_t *id, const char *data, } } + if 
(rose->flushCombProgramOffset && !told_to_stop_matching(scratch)) { + if (roseRunFlushCombProgram(rose, scratch, ~0ULL) == MO_HALT_MATCHING) { + scratch->core_info.status |= STATUS_TERMINATED; + } + } + setStreamStatus(state, scratch->core_info.status); if (likely(!can_stop_matching(scratch))) { diff --git a/src/scratch.h b/src/scratch.h index fa998e849..59aa02c69 100644 --- a/src/scratch.h +++ b/src/scratch.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,6 +36,7 @@ #ifndef SCRATCH_H_DA6D4FC06FF410 #define SCRATCH_H_DA6D4FC06FF410 +#include "hs_common.h" #include "ue2common.h" #include "rose/rose_types.h" @@ -88,12 +89,15 @@ struct core_info { void *userContext; /**< user-supplied context */ /** \brief user-supplied match callback */ - int (*userCallback)(unsigned int id, unsigned long long from, - unsigned long long to, unsigned int flags, void *ctx); + int (HS_CDECL *userCallback)(unsigned int id, unsigned long long from, + unsigned long long to, unsigned int flags, + void *ctx); const struct RoseEngine *rose; char *state; /**< full stream state */ char *exhaustionVector; /**< pointer to evec for this stream */ + char *logicalVector; /**< pointer to lvec for this stream */ + char *combVector; /**< pointer to cvec for this stream */ const u8 *buf; /**< main scan buffer */ size_t len; /**< length of main scan buffer in bytes */ const u8 *hbuf; /**< history buffer */ @@ -115,6 +119,7 @@ struct RoseContext { * stream */ u64a lastMatchOffset; /**< last match offset report up out of rose; * used _only_ for debugging, asserts */ + u64a lastCombMatchOffset; /**< last match offset of active combinations */ u64a minMatchOffset; /**< the earliest offset that we are still allowed to * report */ u64a minNonMpvMatchOffset; /**< the earliest offset that 
non-mpv engines are diff --git a/src/stream_compress_impl.h b/src/stream_compress_impl.h index 54aebd71b..d1ccf5e6d 100644 --- a/src/stream_compress_impl.h +++ b/src/stream_compress_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -148,6 +148,13 @@ size_t JOIN(sc_, FN_SUFFIX)(const struct RoseEngine *rose, /* copy the exhaustion multibit */ COPY_MULTIBIT(stream_body + so->exhausted, rose->ekeyCount); + /* copy the logical multibit */ + COPY_MULTIBIT(stream_body + so->logicalVec, + rose->lkeyCount + rose->lopCount); + + /* copy the combination multibit */ + COPY_MULTIBIT(stream_body + so->combVec, rose->ckeyCount); + /* copy nfa stream state for endfixes */ /* Note: in the expand case the active array has already been copied into * the stream. */ diff --git a/src/ue2common.h b/src/ue2common.h index 4bec83155..5705af7be 100644 --- a/src/ue2common.h +++ b/src/ue2common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -66,8 +66,13 @@ typedef signed int s32; /* We append the 'a' for aligned, since these aren't common, garden variety * 64 bit values. The alignment is necessary for structs on some platforms, * so we don't end up performing accidental unaligned accesses. */ +#if defined(_WIN32) && ! 
defined(_WIN64) +typedef unsigned long long ALIGN_ATTR(4) u64a; +typedef signed long long ALIGN_ATTR(4) s64a; +#else typedef unsigned long long ALIGN_ATTR(8) u64a; typedef signed long long ALIGN_ATTR(8) s64a; +#endif /* get the SIMD types */ #include "util/simd_types.h" diff --git a/src/util/bitfield.h b/src/util/bitfield.h index 24c0c5804..a580da7b6 100644 --- a/src/util/bitfield.h +++ b/src/util/bitfield.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -305,9 +305,10 @@ class bitfield { } /// Bitwise OR. - bitfield operator|(bitfield a) const { - a |= *this; - return a; + bitfield operator|(const bitfield &a) const { + bitfield b = a; + b |= *this; + return b; } /// Bitwise OR-equals. @@ -325,9 +326,10 @@ class bitfield { } /// Bitwise AND. - bitfield operator&(bitfield a) const { - a &= *this; - return a; + bitfield operator&(const bitfield &a) const { + bitfield b = a; + b &= *this; + return b; } /// Bitwise AND-equals. diff --git a/src/util/dump_charclass.cpp b/src/util/dump_charclass.cpp index 4535777d1..d0659a8bd 100644 --- a/src/util/dump_charclass.cpp +++ b/src/util/dump_charclass.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -56,7 +56,11 @@ void describeChar(ostream &os, char c, enum cc_output_t out_type) { const string backslash((out_type == CC_OUT_DOT ? 
2 : 1), '\\'); +#ifdef _WIN32 + if (c >= 0x21 && c < 0x7F && c != '\\') { +#else if (isgraph(c) && c != '\\') { +#endif if (escaped.find(c) != string::npos) { os << backslash << c; } else if (out_type == CC_OUT_DOT diff --git a/src/util/graph_small_color_map.h b/src/util/graph_small_color_map.h index 03e61cf48..249b71531 100644 --- a/src/util/graph_small_color_map.h +++ b/src/util/graph_small_color_map.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -114,6 +114,21 @@ class small_color_map { std::memset(data->data(), val, data->size()); } + size_t count(small_color color) const { + assert(static_cast(color) < sizeof(fill_lut)); + size_t num = 0; + for (size_t i = 0; i < n; i++) { + size_t byte = i / entries_per_byte; + assert(byte < data->size()); + size_t bit = (i % entries_per_byte) * bit_size; + u8 val = ((*data)[byte] >> bit) & bit_mask; + if (static_cast(val) == color) { + num++; + } + } + return num; + } + small_color get_impl(key_type key) const { auto i = get(index_map, key); assert(i < n); diff --git a/src/util/graph_undirected.h b/src/util/graph_undirected.h new file mode 100644 index 000000000..049964ab0 --- /dev/null +++ b/src/util/graph_undirected.h @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Adaptor that presents an undirected view of a bidirectional BGL graph. + * + * Analogous to the reverse_graph adapter. You can construct one of these for + * bidirectional graph g with: + * + * auto ug = make_undirected_graph(g); + * + * The vertex descriptor type is the same as that of the underlying graph, but + * the edge descriptor is different. 
+ */ + +#ifndef GRAPH_UNDIRECTED_H +#define GRAPH_UNDIRECTED_H + +#include "util/operators.h" + +#include +#include +#include +#include + +#include +#include + +namespace ue2 { + +struct undirected_graph_tag {}; + +template +class undirected_graph; + +namespace undirected_detail { + +template +class undirected_graph_edge_descriptor + : totally_ordered> { + using base_graph_type = BidirectionalGraph; + using base_graph_traits = typename boost::graph_traits; + using base_edge_type = typename base_graph_traits::edge_descriptor; + using base_vertex_type = typename base_graph_traits::vertex_descriptor; + + base_edge_type underlying_edge; + const base_graph_type *g; + bool reverse; // if true, reverse vertices in source() and target() + + inline std::pair + canonical_edge() const { + auto u = std::min(source(underlying_edge, *g), + target(underlying_edge, *g)); + auto v = std::max(source(underlying_edge, *g), + target(underlying_edge, *g)); + return std::make_pair(u, v); + } + + template + friend class ::ue2::undirected_graph; + +public: + undirected_graph_edge_descriptor() = default; + + undirected_graph_edge_descriptor(base_edge_type edge, + const base_graph_type &g_in, + bool reverse_in) + : underlying_edge(std::move(edge)), g(&g_in), reverse(reverse_in) {} + + bool operator==(const undirected_graph_edge_descriptor &other) const { + return canonical_edge() == other.canonical_edge(); + } + + bool operator<(const undirected_graph_edge_descriptor &other) const { + return canonical_edge() < other.canonical_edge(); + } + + base_vertex_type get_source() const { + return reverse ? target(underlying_edge, *g) + : source(underlying_edge, *g); + } + + base_vertex_type get_target() const { + return reverse ? 
source(underlying_edge, *g) + : target(underlying_edge, *g); + } +}; + +} // namespace undirected_detail + +template +class undirected_graph { +private: + using Self = undirected_graph; + using Traits = boost::graph_traits; + +public: + using base_type = BidirectionalGraph; + using base_ref_type = GraphRef; + + explicit undirected_graph(GraphRef g_in) : g(g_in) {} + + // Graph requirements + using vertex_descriptor = typename Traits::vertex_descriptor; + using edge_descriptor = + undirected_detail::undirected_graph_edge_descriptor; + using directed_category = boost::undirected_tag; + using edge_parallel_category = boost::disallow_parallel_edge_tag; + using traversal_category = typename Traits::traversal_category; + + // IncidenceGraph requirements + + /** + * \brief Templated iterator used for out_edge_iterator and + * in_edge_iterator, depending on the value of Reverse. + */ + template + class adj_edge_iterator + : public boost::iterator_facade< + adj_edge_iterator, edge_descriptor, + boost::forward_traversal_tag, edge_descriptor> { + vertex_descriptor u; + const base_type *g; + typename Traits::in_edge_iterator in_it; + typename Traits::out_edge_iterator out_it; + bool done_in = false; + public: + adj_edge_iterator() = default; + + adj_edge_iterator(vertex_descriptor u_in, const base_type &g_in, + bool end_iter) + : u(std::move(u_in)), g(&g_in) { + auto pi = in_edges(u, *g); + auto po = out_edges(u, *g); + if (end_iter) { + in_it = pi.second; + out_it = po.second; + done_in = true; + } else { + in_it = pi.first; + out_it = po.first; + if (in_it == pi.second) { + done_in = true; + find_first_valid_out(); + } + } + } + + private: + friend class boost::iterator_core_access; + + void find_first_valid_out() { + auto out_end = out_edges(u, *g).second; + for (; out_it != out_end; ++out_it) { + auto v = target(*out_it, *g); + if (!edge(v, u, *g).second) { + break; + } + } + } + + void increment() { + if (!done_in) { + auto in_end = in_edges(u, *g).second; + assert(in_it 
!= in_end); + ++in_it; + if (in_it == in_end) { + done_in = true; + find_first_valid_out(); + } + } else { + ++out_it; + find_first_valid_out(); + } + } + bool equal(const adj_edge_iterator &other) const { + return in_it == other.in_it && out_it == other.out_it; + } + edge_descriptor dereference() const { + if (done_in) { + return edge_descriptor(*out_it, *g, Reverse); + } else { + return edge_descriptor(*in_it, *g, !Reverse); + } + } + }; + + using out_edge_iterator = adj_edge_iterator; + using in_edge_iterator = adj_edge_iterator; + + using degree_size_type = typename Traits::degree_size_type; + + // AdjacencyGraph requirements + using adjacency_iterator = + typename boost::adjacency_iterator_generator::type; + using inv_adjacency_iterator = + typename boost::inv_adjacency_iterator_generator< + Self, vertex_descriptor, in_edge_iterator>::type; + + // VertexListGraph requirements + using vertex_iterator = typename Traits::vertex_iterator; + + // EdgeListGraph requirements + enum { + is_edge_list = std::is_convertible::value + }; + + /** \brief Iterator used for edges(). */ + class edge_iterator + : public boost::iterator_facade { + const base_type *g; + typename Traits::edge_iterator it; + public: + edge_iterator() = default; + + edge_iterator(typename Traits::edge_iterator it_in, + const base_type &g_in) + : g(&g_in), it(std::move(it_in)) { + find_first_valid_edge(); + } + + private: + friend class boost::iterator_core_access; + + void find_first_valid_edge() { + const auto end = edges(*g).second; + for (; it != end; ++it) { + const auto &u = source(*it, *g); + const auto &v = target(*it, *g); + if (!edge(v, u, *g).second) { + break; // No reverse edge, we must visit this one + } + if (u <= v) { + // We have a reverse edge, but we'll return this one (and + // skip the other). Note that (u, u) shouldn't be skipped. 
+ break; + } + } + } + + void increment() { + assert(it != edges(*g).second); + ++it; + find_first_valid_edge(); + } + bool equal(const edge_iterator &other) const { + return it == other.it; + } + edge_descriptor dereference() const { + return edge_descriptor(*it, *g, false); + } + }; + + using vertices_size_type = typename Traits::vertices_size_type; + using edges_size_type = typename Traits::edges_size_type; + + using graph_tag = undirected_graph_tag; + + using vertex_bundle_type = + typename boost::vertex_bundle_type::type; + using edge_bundle_type = typename boost::edge_bundle_type::type; + + vertex_bundle_type &operator[](const vertex_descriptor &d) { + return const_cast(g)[d]; + } + const vertex_bundle_type &operator[](const vertex_descriptor &d) const { + return g[d]; + } + + edge_bundle_type &operator[](const edge_descriptor &d) { + return const_cast(g)[d.underlying_edge]; + } + const edge_bundle_type &operator[](const edge_descriptor &d) const { + return g[d.underlying_edge]; + } + + static vertex_descriptor null_vertex() { return Traits::null_vertex(); } + + // Accessor free functions follow + + friend std::pair + vertices(const undirected_graph &ug) { + return vertices(ug.g); + } + + friend std::pair + edges(const undirected_graph &ug) { + auto e = edges(ug.g); + return std::make_pair(edge_iterator(e.first, ug.g), + edge_iterator(e.second, ug.g)); + } + + friend std::pair + out_edges(const vertex_descriptor &u, const undirected_graph &ug) { + return std::make_pair(out_edge_iterator(u, ug.g, false), + out_edge_iterator(u, ug.g, true)); + } + + friend vertices_size_type num_vertices(const undirected_graph &ug) { + return num_vertices(ug.g); + } + + friend edges_size_type num_edges(const undirected_graph &ug) { + auto p = edges(ug); + return std::distance(p.first, p.second); + } + + friend degree_size_type out_degree(const vertex_descriptor &u, + const undirected_graph &ug) { + return degree(u, ug); + } + + friend vertex_descriptor vertex(vertices_size_type 
n, + const undirected_graph &ug) { + return vertex(n, ug.g); + } + + friend std::pair edge(const vertex_descriptor &u, + const vertex_descriptor &v, + const undirected_graph &ug) { + auto e = edge(u, v, ug.g); + if (e.second) { + return std::make_pair(edge_descriptor(e.first, ug.g, false), true); + } + auto e_rev = edge(v, u, ug.g); + if (e_rev.second) { + return std::make_pair(edge_descriptor(e_rev.first, ug.g, true), + true); + } + return std::make_pair(edge_descriptor(), false); + } + + friend std::pair + in_edges(const vertex_descriptor &v, const undirected_graph &ug) { + return std::make_pair(in_edge_iterator(v, ug.g, false), + in_edge_iterator(v, ug.g, true)); + } + + friend std::pair + adjacent_vertices(const vertex_descriptor &u, const undirected_graph &ug) { + out_edge_iterator oi, oe; + std::tie(oi, oe) = out_edges(u, ug); + return std::make_pair(adjacency_iterator(oi, &ug), + adjacency_iterator(oe, &ug)); + } + + friend std::pair + inv_adjacent_vertices(const vertex_descriptor &v, + const undirected_graph &ug) { + in_edge_iterator ei, ee; + std::tie(ei, ee) = in_edges(v, ug); + return std::make_pair(inv_adjacency_iterator(ei, &ug), + inv_adjacency_iterator(ee, &ug)); + } + + friend degree_size_type in_degree(const vertex_descriptor &v, + const undirected_graph &ug) { + return degree(v, ug); + } + + friend vertex_descriptor source(const edge_descriptor &e, + const undirected_graph &) { + return e.get_source(); + } + + friend vertex_descriptor target(const edge_descriptor &e, + const undirected_graph &) { + return e.get_target(); + } + + friend degree_size_type degree(const vertex_descriptor &u, + const undirected_graph &ug) { + auto p = out_edges(u, ug); + return std::distance(p.first, p.second); + } + + // Property accessors. 
+ + template + using prop_map = typename boost::property_map; + + template + friend typename prop_map::type + get(Property p, undirected_graph &ug) { + return get(p, ug.g); + } + + template + friend typename prop_map::const_type + get(Property p, const undirected_graph &ug) { + return get(p, ug.g); + } + + template + friend typename boost::property_traits< + typename prop_map::const_type>::value_type + get(Property p, const undirected_graph &ug, const Key &k) { + return get(p, ug.g, get_underlying_descriptor(k)); + } + + template + friend void put(Property p, const undirected_graph &ug, + const Key &k, const Value &val) { + put(p, const_cast(ug.g), + get_underlying_descriptor(k), val); + } + +private: + // Accessors are here because our free friend functions (above) cannot see + // edge_descriptor's private members. + static typename base_type::vertex_descriptor + get_underlying_descriptor(const vertex_descriptor &v) { + return v; + } + static typename base_type::edge_descriptor + get_underlying_descriptor(const edge_descriptor &e) { + return e.underlying_edge; + } + + // Reference to underlying bidirectional graph + GraphRef g; +}; + +template +undirected_graph +make_undirected_graph(const BidirectionalGraph &g) { + return undirected_graph(g); +} + +} // namespace ue2 + +namespace boost { + +/* Derive all the property map specializations from the underlying + * bidirectional graph. 
*/ + +template +struct property_map, + Property> { + using base_map_type = property_map; + using type = typename base_map_type::type; + using const_type = typename base_map_type::const_type; +}; + +template +struct vertex_property_type> + : vertex_property_type {}; + +template +struct edge_property_type> + : edge_property_type {}; + +template +struct graph_property_type> + : graph_property_type {}; + +template +struct vertex_bundle_type> + : vertex_bundle_type {}; + +template +struct edge_bundle_type> + : edge_bundle_type {}; + +template +struct graph_bundle_type> + : graph_bundle_type {}; + +} // namespace boost + +#endif // GRAPH_UNDIRECTED_H diff --git a/src/util/logical.h b/src/util/logical.h new file mode 100644 index 000000000..0c8b6469a --- /dev/null +++ b/src/util/logical.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Inline functions for manipulating logical combinations. + */ + +#ifndef LOGICAL_H +#define LOGICAL_H + +#include "ue2common.h" + +/** Index meaning a given logical key is invalid. */ +#define INVALID_LKEY (~(u32)0) +#define INVALID_CKEY INVALID_LKEY + +/** Logical operation type, the priority is from high to low. */ +enum LogicalOpType { + LOGICAL_OP_NOT, + LOGICAL_OP_AND, + LOGICAL_OP_OR, + LAST_LOGICAL_OP = LOGICAL_OP_OR //!< Sentinel. +}; + +#define UNKNOWN_OP (~(u32)0) + +/** A logical operation consists of 4 parts. */ +struct LogicalOp { + u32 id; //!< logical operator/operation id + u32 op; //!< LogicalOpType + u32 lo; //!< left operand + u32 ro; //!< right operand +}; + +/** Each logical combination has its info: + * It occupies a region in LogicalOp vector. + * It has an exhaustion key for single-match mode. */ +struct CombInfo { + u32 id; + u32 ekey; //!< exhaustion key + u32 start; //!< ckey of logical operation to start calculating + u32 result; //!< ckey of logical operation to give final result + u64a min_offset; + u64a max_offset; +}; + +/** Temporarily used to separate operations' id from reports' lkey + * when building logicalTree in shunting yard algorithm, + * operations' id will be finally renumbered following reports' lkey.
*/ +#define LOGICAL_OP_BIT 0x80000000UL + +#endif diff --git a/src/util/multibit.h b/src/util/multibit.h index 4df8733ae..c3a4ba461 100644 --- a/src/util/multibit.h +++ b/src/util/multibit.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1197,7 +1197,11 @@ u32 mmbit_sparse_iter_begin(const u8 *bits, u32 total_bits, u32 *idx, assert(ISALIGNED_N(it_root, alignof(struct mmbit_sparse_iter))); // Our state _may_ be on the stack +#ifndef _WIN32 assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); +#else + assert(ISALIGNED_N(s, 4)); +#endif MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); // iterator should have _something_ at the root level @@ -1305,7 +1309,11 @@ u32 mmbit_sparse_iter_next(const u8 *bits, u32 total_bits, u32 last_key, assert(ISALIGNED_N(it_root, alignof(struct mmbit_sparse_iter))); // Our state _may_ be on the stack +#ifndef _WIN32 assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); +#else + assert(ISALIGNED_N(s, 4)); +#endif MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); MDEBUG_PRINTF("NEXT (total_bits=%u, last_key=%u)\n", total_bits, last_key); @@ -1458,7 +1466,11 @@ void mmbit_sparse_iter_unset(u8 *bits, u32 total_bits, assert(ISALIGNED_N(it, alignof(struct mmbit_sparse_iter))); // Our state _may_ be on the stack +#ifndef _WIN32 assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); +#else + assert(ISALIGNED_N(s, 4)); +#endif MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); diff --git a/src/util/multibit_build.h b/src/util/multibit_build.h index ba5c8dfa7..24f1bb55b 100644 --- a/src/util/multibit_build.h +++ b/src/util/multibit_build.h @@ -33,6 +33,7 @@ #ifndef MULTIBIT_BUILD_H #define MULTIBIT_BUILD_H +#include "hs_common.h" #include "multibit_internal.h" #include "hash.h" diff --git 
a/src/util/report.h b/src/util/report.h index 0d5e69b8f..ee830d0f1 100644 --- a/src/util/report.h +++ b/src/util/report.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,6 +36,7 @@ #include "ue2common.h" #include "util/exhaust.h" // for INVALID_EKEY +#include "util/logical.h" // for INVALID_LKEY #include "util/hash.h" #include "util/order_check.h" @@ -107,6 +108,16 @@ struct Report { * exhaustible, this will be INVALID_EKEY. */ u32 ekey = INVALID_EKEY; + /** \brief Logical Combination key in each combination. + * + * If in Logical Combination, the lkey to check before reporting a match. + * Additionally before checking the lkey will be set. If not + * in Logical Combination, this will be INVALID_LKEY. */ + u32 lkey = INVALID_LKEY; + + /** \brief Quiet flag for expressions in any logical combination. */ + bool quiet = false; + /** \brief Adjustment to add to the match offset when we report a match. 
* * This is usually used for reports attached to states that form part of a @@ -207,16 +218,17 @@ bool operator==(const Report &a, const Report &b) { } static inline -Report makeECallback(u32 report, s32 offsetAdjust, u32 ekey) { +Report makeECallback(u32 report, s32 offsetAdjust, u32 ekey, bool quiet) { Report ir(EXTERNAL_CALLBACK, report); ir.offsetAdjust = offsetAdjust; ir.ekey = ekey; + ir.quiet = (u8)quiet; return ir; } static inline Report makeCallback(u32 report, s32 offsetAdjust) { - return makeECallback(report, offsetAdjust, INVALID_EKEY); + return makeECallback(report, offsetAdjust, INVALID_EKEY, false); } static inline diff --git a/src/util/report_manager.cpp b/src/util/report_manager.cpp index c0e9ee15c..78b9b73df 100644 --- a/src/util/report_manager.cpp +++ b/src/util/report_manager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -95,6 +95,31 @@ u32 ReportManager::getExhaustibleKey(u32 a) { return it->second; } +const set &ReportManager::getRelateCKeys(u32 lkey) { + auto it = pl.lkey2ckeys.find(lkey); + assert(it != pl.lkey2ckeys.end()); + return it->second; +} + +void ReportManager::logicalKeyRenumber() { + pl.logicalKeyRenumber(); + // assign to corresponding report + for (u32 i = 0; i < reportIds.size(); i++) { + Report &ir = reportIds[i]; + if (contains(pl.toLogicalKeyMap, ir.onmatch)) { + ir.lkey = pl.toLogicalKeyMap.at(ir.onmatch); + } + } +} + +const vector &ReportManager::getLogicalTree() const { + return pl.logicalTree; +} + +const vector &ReportManager::getCombInfoMap() const { + return pl.combInfoMap; +} + u32 ReportManager::getUnassociatedExhaustibleKey(void) { u32 rv = toExhaustibleKeyMap.size(); bool inserted; @@ -115,6 +140,18 @@ u32 ReportManager::numEkeys() const { return (u32) toExhaustibleKeyMap.size(); } +u32 
ReportManager::numLogicalKeys() const { + return (u32) pl.toLogicalKeyMap.size(); +} + +u32 ReportManager::numLogicalOps() const { + return (u32) pl.logicalTree.size(); +} + +u32 ReportManager::numCkeys() const { + return (u32) pl.toCombKeyMap.size(); +} + bool ReportManager::patternSetCanExhaust() const { return global_exhaust && !toExhaustibleKeyMap.empty(); } @@ -219,7 +256,7 @@ Report ReportManager::getBasicInternalReport(const ExpressionInfo &expr, ekey = getExhaustibleKey(expr.report); } - return makeECallback(expr.report, adj, ekey); + return makeECallback(expr.report, adj, ekey, expr.quiet); } void ReportManager::setProgramOffset(ReportID id, u32 programOffset) { diff --git a/src/util/report_manager.h b/src/util/report_manager.h index aa359ed76..015dc9c85 100644 --- a/src/util/report_manager.h +++ b/src/util/report_manager.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,7 @@ #include "util/compile_error.h" #include "util/noncopyable.h" #include "util/report.h" +#include "parser/logical_combination.h" #include #include @@ -80,6 +81,15 @@ class ReportManager : noncopyable { /** \brief Total number of exhaustion keys. */ u32 numEkeys() const; + /** \brief Total number of logical keys. */ + u32 numLogicalKeys() const; + + /** \brief Total number of logical operators. */ + u32 numLogicalOps() const; + + /** \brief Total number of combination keys. */ + u32 numCkeys() const; + /** \brief True if the pattern set can exhaust (i.e. all patterns are * highlander). */ bool patternSetCanExhaust() const; @@ -110,6 +120,19 @@ class ReportManager : noncopyable { * assigning one if necessary. */ u32 getExhaustibleKey(u32 expressionIndex); + /** \brief Get lkey's corresponding ckeys. 
*/ + const std::set &getRelateCKeys(u32 lkey); + + /** \brief Renumber lkey for logical operations, after parsing + * all logical expressions. */ + void logicalKeyRenumber(); + + /** \brief Used in Rose for writing bytecode. */ + const std::vector &getLogicalTree() const; + + /** \brief Used in Rose for writing bytecode. */ + const std::vector &getCombInfoMap() const; + /** \brief Fetch the dedupe key associated with the given report. Returns * ~0U if no dkey is needed. */ u32 getDkey(const Report &r) const; @@ -122,6 +145,9 @@ class ReportManager : noncopyable { * set. */ u32 getProgramOffset(ReportID id) const; + /** \brief Parsed logical combination structure. */ + ParsedLogical pl; + private: /** \brief Grey box ref, for checking resource limits. */ const Grey &grey; diff --git a/src/util/ue2_graph.h b/src/util/ue2_graph.h index bf719fd7a..b8e2e935d 100644 --- a/src/util/ue2_graph.h +++ b/src/util/ue2_graph.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -89,7 +89,7 @@ * (1) Deterministic ordering for vertices and edges * boost::adjacency_list<> uses pointer ordering for vertex_descriptors. As * a result, ordering of vertices and edges between runs is - * non-deterministic unless containers, etc use custom comparators. + * non-deterministic unless containers, etc use custom comparators. * * (2) Proper types for descriptors, etc.
* No more void * for vertex_descriptors and trying to use it for the wrong @@ -288,7 +288,7 @@ class ue2_graph : graph_detail::graph_base { vertex_edge_list in_edge_list; /* The out going edges are considered owned by the vertex and - * need to be freed when the graph is begin destroyed */ + * need to be freed when the graph is being destroyed */ vertex_edge_list out_edge_list; /* The destructor only frees memory owned by the vertex and will leave @@ -1025,229 +1025,208 @@ class ue2_graph : graph_detail::graph_base { } }; +/** \brief Type trait to enable on whether the Graph is an ue2_graph. */ template -typename std::enable_if< - std::is_base_of::value, - typename Graph::vertex_descriptor>::type +struct is_ue2_graph + : public ::std::integral_constant< + bool, std::is_base_of::value> {}; + +template +typename std::enable_if::value, + typename Graph::vertex_descriptor>::type add_vertex(Graph &g) { return g.add_vertex_impl(); } template -typename std::enable_if< - std::is_base_of::value>::type +typename std::enable_if::value>::type remove_vertex(typename Graph::vertex_descriptor v, Graph &g) { g.remove_vertex_impl(v); } template -typename std::enable_if< - std::is_base_of::value>::type +typename std::enable_if::value>::type clear_in_edges(typename Graph::vertex_descriptor v, Graph &g) { g.clear_in_edges_impl(v); } template -typename std::enable_if< - std::is_base_of::value>::type +typename std::enable_if::value>::type clear_out_edges(typename Graph::vertex_descriptor v, Graph &g) { g.clear_out_edges_impl(v); } template -typename std::enable_if< - std::is_base_of::value>::type +typename std::enable_if::value>::type clear_vertex(typename Graph::vertex_descriptor v, Graph &g) { g.clear_in_edges_impl(v); g.clear_out_edges_impl(v); } template -typename std::enable_if< - std::is_base_of::value, - typename Graph::vertex_descriptor>::type +typename std::enable_if::value, + typename Graph::vertex_descriptor>::type source(typename Graph::edge_descriptor e, const Graph &) { 
return Graph::source_impl(e); } template -typename std::enable_if< - std::is_base_of::value, - typename Graph::vertex_descriptor>::type +typename std::enable_if::value, + typename Graph::vertex_descriptor>::type target(typename Graph::edge_descriptor e, const Graph &) { return Graph::target_impl(e); } template -typename std::enable_if< - std::is_base_of::value, - typename Graph::degree_size_type>::type +typename std::enable_if::value, + typename Graph::degree_size_type>::type out_degree(typename Graph::vertex_descriptor v, const Graph &) { return Graph::out_degree_impl(v); } template -typename std::enable_if< - std::is_base_of::value, - std::pair>::type +typename std::enable_if::value, + std::pair>::type out_edges(typename Graph::vertex_descriptor v, const Graph &) { return Graph::out_edges_impl(v); } template -typename std::enable_if< - std::is_base_of::value, - typename Graph::degree_size_type>::type +typename std::enable_if::value, + typename Graph::degree_size_type>::type in_degree(typename Graph::vertex_descriptor v, const Graph &) { return Graph::in_degree_impl(v); } template -typename std::enable_if< - std::is_base_of::value, - std::pair>::type +typename std::enable_if::value, + std::pair>::type in_edges(typename Graph::vertex_descriptor v, const Graph &) { return Graph::in_edges_impl(v); } template -typename std::enable_if< - std::is_base_of::value, - typename Graph::degree_size_type>::type +typename std::enable_if::value, + typename Graph::degree_size_type>::type degree(typename Graph::vertex_descriptor v, const Graph &) { return Graph::degree_impl(v); } template -typename std::enable_if< - std::is_base_of::value, - std::pair>::type +typename std::enable_if::value, + std::pair>::type adjacent_vertices(typename Graph::vertex_descriptor v, const Graph &) { return Graph::adjacent_vertices_impl(v); } template -typename std::enable_if< - std::is_base_of::value, - std::pair>::type +typename std::enable_if::value, + std::pair>::type edge(typename 
Graph::vertex_descriptor u, typename Graph::vertex_descriptor v, const Graph &g) { return g.edge_impl(u, v); } template -typename std::enable_if< - std::is_base_of::value, - std::pair>::type +typename std::enable_if::value, + std::pair>::type inv_adjacent_vertices(typename Graph::vertex_descriptor v, const Graph &) { return Graph::inv_adjacent_vertices_impl(v); } template -typename std::enable_if< - std::is_base_of::value, - std::pair>::type +typename std::enable_if::value, + std::pair>::type add_edge(typename Graph::vertex_descriptor u, typename Graph::vertex_descriptor v, Graph &g) { return g.add_edge_impl(u, v); } template -typename std::enable_if< - std::is_base_of::value>::type +typename std::enable_if::value>::type remove_edge(typename Graph::edge_descriptor e, Graph &g) { g.remove_edge_impl(e); } template typename std::enable_if< - !std::is_convertible::value - && std::is_base_of::value>::type + !std::is_convertible::value && + is_ue2_graph::value>::type remove_edge(Iter it, Graph &g) { g.remove_edge_impl(*it); } template -typename std::enable_if< - std::is_base_of::value>::type +typename std::enable_if::value>::type remove_out_edge_if(typename Graph::vertex_descriptor v, Predicate pred, Graph &g) { g.remove_out_edge_if_impl(v, pred); } template -typename std::enable_if< - std::is_base_of::value>::type +typename std::enable_if::value>::type remove_in_edge_if(typename Graph::vertex_descriptor v, Predicate pred, Graph &g) { g.remove_in_edge_if_impl(v, pred); } template -typename std::enable_if< - std::is_base_of::value>::type +typename std::enable_if::value>::type remove_edge_if(Predicate pred, Graph &g) { g.remove_edge_if_impl(pred); } template -typename std::enable_if< - std::is_base_of::value>::type +typename std::enable_if::value>::type remove_edge(const typename Graph::vertex_descriptor &u, const typename Graph::vertex_descriptor &v, Graph &g) { g.remove_edge_impl(u, v); } template -typename std::enable_if< - std::is_base_of::value, - typename 
Graph::vertices_size_type>::type +typename std::enable_if::value, + typename Graph::vertices_size_type>::type num_vertices(const Graph &g) { return g.num_vertices_impl(); } template -typename std::enable_if< - std::is_base_of::value, - std::pair>::type +typename std::enable_if::value, + std::pair>::type vertices(const Graph &g) { return g.vertices_impl(); } template -typename std::enable_if< - std::is_base_of::value, - typename Graph::edges_size_type>::type +typename std::enable_if::value, + typename Graph::edges_size_type>::type num_edges(const Graph &g) { return g.num_edges_impl(); } template -typename std::enable_if< - std::is_base_of::value, - std::pair>::type +typename std::enable_if::value, + std::pair>::type edges(const Graph &g) { return g.edges_impl(); } template -typename std::enable_if< - std::is_base_of::value, - typename Graph::vertex_descriptor>::type +typename std::enable_if::value, + typename Graph::vertex_descriptor>::type add_vertex(const typename Graph::vertex_property_type &vp, Graph &g) { return g.add_vertex_impl(vp); } template -typename std::enable_if< - std::is_base_of::value, - std::pair>::type +typename std::enable_if::value, + std::pair>::type add_edge(typename Graph::vertex_descriptor u, typename Graph::vertex_descriptor v, const typename Graph::edge_property_type &ep, Graph &g) { @@ -1255,35 +1234,59 @@ add_edge(typename Graph::vertex_descriptor u, } template -typename std::enable_if< - std::is_base_of::value>::type +typename std::enable_if::value>::type renumber_edges(Graph &g) { g.renumber_edges_impl(); } template -typename std::enable_if< - std::is_base_of::value>::type +typename std::enable_if::value>::type renumber_vertices(Graph &g) { g.renumber_vertices_impl(); } template -typename std::enable_if< - std::is_base_of::value, - typename Graph::vertices_size_type>::type +typename std::enable_if::value, + typename Graph::vertices_size_type>::type vertex_index_upper_bound(const Graph &g) { return g.vertex_index_upper_bound_impl(); } 
template -typename std::enable_if< - std::is_base_of::value, - typename Graph::edges_size_type>::type +typename std::enable_if::value, + typename Graph::edges_size_type>::type edge_index_upper_bound(const Graph &g) { return g.edge_index_upper_bound_impl(); } +template struct pointer_to_member_traits {}; + +template +struct pointer_to_member_traits { + using member_type = Return; + using class_type = Class; +}; + +template +struct is_ue2_vertex_or_edge_property { + static constexpr bool value = false; +}; + +template +struct is_ue2_vertex_or_edge_property< + Graph, Property, typename std::enable_if::value && + std::is_member_object_pointer< + Property>::value>::type> { +private: + using class_type = typename pointer_to_member_traits::class_type; + using vertex_type = typename Graph::vertex_property_type; + using edge_type = typename Graph::edge_property_type; +public: + static constexpr bool value = + std::is_same::value || + std::is_same::value; +}; + using boost::vertex_index; using boost::edge_index; @@ -1295,13 +1298,53 @@ namespace boost { * adaptors (like filtered_graph) to know the type of the property maps */ template struct property_map::value - >::type > { - typedef decltype(get(std::declval(), - std::declval())) type; - typedef decltype(get(std::declval(), - std::declval())) const_type; + typename std::enable_if::value && + ue2::is_ue2_vertex_or_edge_property< + Graph, Prop>::value>::type> { +private: + using prop_traits = ue2::pointer_to_member_traits; + using member_type = typename prop_traits::member_type; + using class_type = typename prop_traits::class_type; +public: + using type = typename Graph::template prop_map; + using const_type = typename Graph::template prop_map; +}; + +template +struct property_map::value>::type> { + using v_prop_type = typename Graph::vertex_property_type; + using type = typename Graph::template prop_map; + using const_type = + typename Graph::template prop_map; +}; + +template +struct property_map::value>::type> { + using 
e_prop_type = typename Graph::edge_property_type; + using type = typename Graph::template prop_map; + using const_type = + typename Graph::template prop_map; +}; + +template +struct property_map::value>::type> { + using v_prop_type = typename Graph::vertex_property_type; + using type = typename Graph::template prop_map_all; + using const_type = + typename Graph::template prop_map_all; +}; + +template +struct property_map::value>::type> { + using e_prop_type = typename Graph::edge_property_type; + using type = typename Graph::template prop_map_all; + using const_type = + typename Graph::template prop_map_all; }; } // namespace boost diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 61bb00f20..6ca3fd8a9 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,6 +1,3 @@ -if (WIN32) - return() -endif() find_package(Threads) # remove some warnings @@ -12,11 +9,18 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) include_directories(${PROJECT_SOURCE_DIR}/util) -# add any subdir with a cmake file -file(GLOB dirents RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *) -foreach(e ${dirents}) - if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${e} AND - EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${e}/CMakeLists.txt) - add_subdirectory(${e}) - endif () -endforeach () +if (WIN32) + add_subdirectory(hscheck) + add_subdirectory(hsbench) + add_subdirectory(hsdump) + add_subdirectory(hscollider) +else() + # add any subdir with a cmake file + file(GLOB dirents RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *) + foreach(e ${dirents}) + if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${e} AND + EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${e}/CMakeLists.txt) + add_subdirectory(${e}) + endif () + endforeach () +endif() diff --git a/tools/hsbench/CMakeLists.txt b/tools/hsbench/CMakeLists.txt index f0e76da15..465081a8b 100644 --- a/tools/hsbench/CMakeLists.txt +++ b/tools/hsbench/CMakeLists.txt @@ -31,6 +31,8 @@ SET(hsbench_SOURCES common.h data_corpus.cpp 
data_corpus.h + engine.cpp + engine.h engine_hyperscan.cpp engine_hyperscan.h heapstats.cpp @@ -45,6 +47,28 @@ SET(hsbench_SOURCES timer.h ) +if (BUILD_CHIMERA) + add_definitions(-DHS_HYBRID) + SET(hsbench_SOURCES + ${hsbench_SOURCES} + engine_chimera.cpp + engine_chimera.h + engine_pcre.cpp + engine_pcre.h + ) +endif() + add_executable(hsbench ${hsbench_SOURCES}) -target_link_libraries(hsbench hs databaseutil expressionutil ${SQLITE3_LDFLAGS} - ${CMAKE_THREAD_LIBS_INIT}) +if (BUILD_CHIMERA) + include_directories(${PCRE_INCLUDE_DIRS}) + if(NOT WIN32) + target_link_libraries(hsbench hs chimera ${PCRE_LDFLAGS} databaseutil + expressionutil ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT}) + else() + target_link_libraries(hsbench hs chimera pcre databaseutil + expressionutil ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT}) + endif() +else() + target_link_libraries(hsbench hs databaseutil expressionutil + ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT}) +endif() diff --git a/tools/hsbench/common.h b/tools/hsbench/common.h index d7bce73ad..820cad7c3 100644 --- a/tools/hsbench/common.h +++ b/tools/hsbench/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,6 +42,12 @@ extern bool forceEditDistance; extern unsigned editDistance; extern bool printCompressSize; +/** Structure for the result of a single complete scan. */ +struct ResultEntry { + double seconds = 0; //!< Time taken for scan. + unsigned int matches = 0; //!< Count of matches found. 
+}; + struct SqlFailure { explicit SqlFailure(const std::string &s) : message(s) {} std::string message; diff --git a/tools/hsbench/engine.cpp b/tools/hsbench/engine.cpp new file mode 100644 index 000000000..f447a0bcd --- /dev/null +++ b/tools/hsbench/engine.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "engine.h" + +EngineContext::~EngineContext() { } + +EngineStream::~EngineStream() { } + +Engine::~Engine() { } diff --git a/tools/hsbench/engine.h b/tools/hsbench/engine.h new file mode 100644 index 000000000..e41f9948c --- /dev/null +++ b/tools/hsbench/engine.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef ENGINE_H +#define ENGINE_H + +#include "common.h" +#include "sqldb.h" + +#include +#include +#include + +#include + +// Engines have an engine context which is allocated on a per-thread basis. +class EngineContext : boost::noncopyable { +public: + virtual ~EngineContext(); +}; + +/** Streaming mode scans have persistent stream state associated with them. */ +class EngineStream : boost::noncopyable { +public: + virtual ~EngineStream(); + unsigned int sn; +}; + +// Benchmarking engine +class Engine : boost::noncopyable { +public: + virtual ~Engine(); + + // allocate an EngineContext + virtual std::unique_ptr makeContext() const = 0; + + // non-streaming scan + virtual void scan(const char *data, unsigned len, unsigned blockId, + ResultEntry &results, EngineContext &ectx) const = 0; + + // vectoring scan + virtual void scan_vectored(const char *const *data, + const unsigned int *len, unsigned int count, + unsigned int streamId, ResultEntry &result, + EngineContext &ectx) const = 0; + + // stream open + virtual std::unique_ptr streamOpen(EngineContext &ectx, + unsigned id) const = 0; + + // stream close + virtual void streamClose(std::unique_ptr stream, + ResultEntry &result) const = 0; + + // stream compress and expand + virtual void streamCompressExpand(EngineStream &stream, + std::vector &temp) const = 0; + + // streaming scan + virtual void streamScan(EngineStream &stream, const char *data, + unsigned int len, unsigned int id, + ResultEntry &result) const = 0; + + virtual void printStats() const = 0; + + virtual void sqlStats(SqlDB &db) const = 0; +}; + +#endif // ENGINE_H diff --git a/tools/hsbench/engine_chimera.cpp b/tools/hsbench/engine_chimera.cpp new file mode 100644 index 000000000..8a15c5bee --- /dev/null +++ b/tools/hsbench/engine_chimera.cpp @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following 
conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "ExpressionParser.h" +#include "common.h" +#include "engine_chimera.h" +#include "expressions.h" +#include "heapstats.h" +#include "sqldb.h" +#include "timer.h" + +#include "chimera/ch_database.h" + +#include "util/make_unique.h" + +using namespace std; + +EngineCHContext::EngineCHContext(const ch_database_t *db) { + ch_alloc_scratch(db, &scratch); + assert(scratch); +} + +EngineCHContext::~EngineCHContext() { + ch_free_scratch(scratch); +} + +namespace /* anonymous */ { + +/** Scan context structure passed to the onMatch callback function. 
*/ +struct ScanCHContext { + ScanCHContext(unsigned id_in, ResultEntry &result_in) + : id(id_in), result(result_in) {} + unsigned id; + ResultEntry &result; +}; + +} // namespace + +/** + * Callback function called for every match that Chimera produces, used when + * "echo matches" is off. + */ +static +int HS_CDECL onMatch(unsigned int, unsigned long long, unsigned long long, + unsigned int, unsigned int, const ch_capture_t *, + void *ctx) { + ScanCHContext *sc = static_cast(ctx); + assert(sc); + sc->result.matches++; + + return 0; +} + +/** + * Callback function called for every match that Chimera produces when "echo + * matches" is enabled. + */ +static +int HS_CDECL onMatchEcho(unsigned int id, unsigned long long, + unsigned long long to, unsigned int, unsigned int, + const ch_capture_t *, void *ctx) { + ScanCHContext *sc = static_cast(ctx); + assert(sc); + sc->result.matches++; + + printf("Match @%u:%llu for %u\n", sc->id, to, id); + + return 0; +} + +EngineChimera::EngineChimera(ch_database_t *db_in, CompileCHStats cs) + : db(db_in), compile_stats(move(cs)) { + assert(db); +} + +EngineChimera::~EngineChimera() { + ch_free_database(db); +} + +unique_ptr EngineChimera::makeContext() const { + return ue2::make_unique(db); +} + +void EngineChimera::scan(const char *data, unsigned int len, unsigned int id, + ResultEntry &result, EngineContext &ectx) const { + assert(data); + + auto &ctx = static_cast(ectx); + ScanCHContext sc(id, result); + auto callback = echo_matches ? 
onMatchEcho : onMatch; + ch_error_t rv = ch_scan(db, data, len, 0, ctx.scratch, callback, nullptr, + &sc); + + if (rv != CH_SUCCESS) { + printf("Fatal error: ch_scan returned error %d\n", rv); + abort(); + } +} + +// vectoring scan +void EngineChimera::scan_vectored(UNUSED const char *const *data, + UNUSED const unsigned int *len, + UNUSED unsigned int count, + UNUSED unsigned int streamId, + UNUSED ResultEntry &result, + UNUSED EngineContext &ectx) const { + printf("Hybrid matcher can't support vectored mode.\n"); + abort(); +} + +unique_ptr EngineChimera::streamOpen(UNUSED EngineContext &ectx, + UNUSED unsigned id) const { + printf("Hybrid matcher can't stream.\n"); + abort(); +} + +void EngineChimera::streamClose(UNUSED unique_ptr stream, + UNUSED ResultEntry &result) const { + printf("Hybrid matcher can't stream.\n"); + abort(); +} + +void EngineChimera::streamScan(UNUSED EngineStream &stream, + UNUSED const char *data, + UNUSED unsigned len, UNUSED unsigned id, + UNUSED ResultEntry &result) const { + printf("Hybrid matcher can't stream.\n"); + abort(); +} + +void EngineChimera::streamCompressExpand(UNUSED EngineStream &stream, + UNUSED vector &temp) const { + printf("Hybrid matcher can't stream.\n"); + abort(); +} + +void EngineChimera::printStats() const { + // Output summary information. 
+ if (!compile_stats.sigs_name.empty()) { + printf("Signature set: %s\n", compile_stats.sigs_name.c_str()); + } + printf("Signatures: %s\n", compile_stats.signatures.c_str()); + printf("Chimera info: %s\n", compile_stats.db_info.c_str()); +#ifndef _WIN32 + printf("Expression count: %'zu\n", compile_stats.expressionCount); + printf("Bytecode size: %'zu bytes\n", compile_stats.compiledSize); +#else + printf("Expression count: %zu\n", compile_stats.expressionCount); + printf("Bytecode size: %zu bytes\n", compile_stats.compiledSize); +#endif + printf("Database CRC: 0x%x\n", compile_stats.crc32); +#ifndef _WIN32 + printf("Scratch size: %'zu bytes\n", compile_stats.scratchSize); + printf("Compile time: %'0.3Lf seconds\n", compile_stats.compileSecs); + printf("Peak heap usage: %'u bytes\n", compile_stats.peakMemorySize); +#else + printf("Scratch size: %zu bytes\n", compile_stats.scratchSize); + printf("Compile time: %0.3Lf seconds\n", compile_stats.compileSecs); + printf("Peak heap usage: %u bytes\n", compile_stats.peakMemorySize); +#endif +} + +void EngineChimera::sqlStats(SqlDB &sqldb) const { + ostringstream crc; + crc << "0x" << hex << compile_stats.crc32; + + static const string Q = + "INSERT INTO Compile (" + "sigsName, signatures, dbInfo, exprCount, dbSize, crc," + "scratchSize, compileSecs, peakMemory) " + "VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)"; + + sqldb.insert_all(Q, compile_stats.sigs_name, compile_stats.signatures, + compile_stats.db_info, compile_stats.expressionCount, + compile_stats.compiledSize, crc.str(), + compile_stats.scratchSize, compile_stats.compileSecs, + compile_stats.peakMemorySize); +} + +unique_ptr +buildEngineChimera(const ExpressionMap &expressions, const string &name, + const string &sigs_name) { + if (expressions.empty()) { + assert(0); + return nullptr; + } + + long double compileSecs = 0.0; + size_t compiledSize = 0.0; + size_t scratchSize = 0; + unsigned int peakMemorySize = 0; + string db_info; + + ch_database_t *db; + ch_error_t 
err; + + const unsigned int count = expressions.size(); + + vector exprs; + vector flags, ids; + vector ext; + + for (const auto &m : expressions) { + string expr; + unsigned int f = 0; + hs_expr_ext extparam; // unused + extparam.flags = 0; + if (!readExpression(m.second, expr, &f, &extparam)) { + printf("Error parsing PCRE: %s (id %u)\n", m.second.c_str(), + m.first); + return nullptr; + } + + if (extparam.flags) { + printf("Error parsing PCRE with extended flags: %s (id %u)\n", + m.second.c_str(), m.first); + return nullptr; + } + exprs.push_back(expr); + ids.push_back(m.first); + flags.push_back(f); + } + + // Our compiler takes an array of plain ol' C strings. + vector patterns(count); + for (unsigned int i = 0; i < count; i++) { + patterns[i] = exprs[i].c_str(); + } + + Timer timer; + timer.start(); + + // Capture groups by default + unsigned int mode = CH_MODE_GROUPS; + ch_compile_error_t *compile_err; + err = ch_compile_multi(patterns.data(), flags.data(), ids.data(), + count, mode, nullptr, &db, &compile_err); + + timer.complete(); + compileSecs = timer.seconds(); + peakMemorySize = getPeakHeap(); + + if (err == CH_COMPILER_ERROR) { + if (compile_err->expression >= 0) { + printf("Compile error for signature #%u: %s\n", + compile_err->expression, compile_err->message); + } else { + printf("Compile error: %s\n", compile_err->message); + } + ch_free_compile_error(compile_err); + return nullptr; + } + + err = ch_database_size(db, &compiledSize); + if (err != CH_SUCCESS) { + return nullptr; + } + assert(compiledSize > 0); + + char *info; + err = ch_database_info(db, &info); + if (err != CH_SUCCESS) { + return nullptr; + } else { + db_info = string(info); + free(info); + } + + // Allocate scratch temporarily to find its size: this is a good test + // anyway. 
+ ch_scratch_t *scratch = nullptr; + err = ch_alloc_scratch(db, &scratch); + if (err != HS_SUCCESS) { + return nullptr; + } + + err = ch_scratch_size(scratch, &scratchSize); + if (err != CH_SUCCESS) { + return nullptr; + } + ch_free_scratch(scratch); + + // Collect summary information. + CompileCHStats cs; + cs.sigs_name = sigs_name; + if (!sigs_name.empty()) { + const auto pos = name.find_last_of('/'); + cs.signatures = name.substr(pos + 1); + } else { + cs.signatures = name; + } + cs.db_info = db_info; + cs.expressionCount = expressions.size(); + cs.compiledSize = compiledSize; + cs.scratchSize = scratchSize; + cs.compileSecs = compileSecs; + cs.peakMemorySize = peakMemorySize; + + return ue2::make_unique(db, move(cs)); +} diff --git a/tools/hsbench/engine_chimera.h b/tools/hsbench/engine_chimera.h new file mode 100644 index 000000000..8e2cd0f6c --- /dev/null +++ b/tools/hsbench/engine_chimera.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ENGINECHIMERA_H +#define ENGINECHIMERA_H + +#include "expressions.h" +#include "engine.h" + +#include "chimera/ch.h" + +#include +#include +#include + +/** Infomation about the database compile */ +struct CompileCHStats { + std::string sigs_name; + std::string signatures; + std::string db_info; + size_t expressionCount = 0; + size_t compiledSize = 0; + uint32_t crc32 = 0; + size_t scratchSize = 0; + long double compileSecs = 0; + unsigned int peakMemorySize = 0; +}; + +/** Engine context which is allocated on a per-thread basis. */ +class EngineCHContext : public EngineContext{ +public: + explicit EngineCHContext(const ch_database_t *db); + ~EngineCHContext(); + + ch_scratch_t *scratch = nullptr; +}; + +/** Chimera Engine for scanning data. 
*/ +class EngineChimera : public Engine { +public: + explicit EngineChimera(ch_database_t *db, CompileCHStats cs); + ~EngineChimera(); + + std::unique_ptr makeContext() const; + + void scan(const char *data, unsigned int len, unsigned int id, + ResultEntry &result, EngineContext &ectx) const; + + void scan_vectored(const char *const *data, const unsigned int *len, + unsigned int count, unsigned int streamId, + ResultEntry &result, EngineContext &ectx) const; + + std::unique_ptr streamOpen(EngineContext &ectx, + unsigned id) const; + + void streamClose(std::unique_ptr stream, + ResultEntry &result) const; + + void streamCompressExpand(EngineStream &stream, + std::vector &temp) const; + + void streamScan(EngineStream &stream, const char *data, unsigned int len, + unsigned int id, ResultEntry &result) const; + + void printStats() const; + + void sqlStats(SqlDB &db) const; + +private: + ch_database_t *db; + CompileCHStats compile_stats; +}; + +std::unique_ptr +buildEngineChimera(const ExpressionMap &expressions, const std::string &name, + const std::string &sigs_name); + +#endif // ENGINECHIMERA_H diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp index d98b3a400..3390c2638 100644 --- a/tools/hsbench/engine_hyperscan.cpp +++ b/tools/hsbench/engine_hyperscan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -57,20 +57,22 @@ using namespace std; -EngineContext::EngineContext(const hs_database_t *db) { +EngineHSContext::EngineHSContext(const hs_database_t *db) { hs_alloc_scratch(db, &scratch); assert(scratch); } -EngineContext::~EngineContext() { +EngineHSContext::~EngineHSContext() { hs_free_scratch(scratch); } +EngineHSStream::~EngineHSStream() { } + namespace /* anonymous */ { /** Scan context structure passed to the 
onMatch callback function. */ -struct ScanContext { - ScanContext(unsigned id_in, ResultEntry &result_in, +struct ScanHSContext { + ScanHSContext(unsigned id_in, ResultEntry &result_in, const EngineStream *stream_in) : id(id_in), result(result_in), stream(stream_in) {} unsigned id; @@ -85,9 +87,9 @@ struct ScanContext { * "echo matches" is off. */ static -int onMatch(unsigned int, unsigned long long, unsigned long long, unsigned int, - void *ctx) { - ScanContext *sc = static_cast(ctx); +int HS_CDECL onMatch(unsigned int, unsigned long long, + unsigned long long, unsigned int, void *ctx) { + ScanHSContext *sc = static_cast(ctx); assert(sc); sc->result.matches++; @@ -99,9 +101,9 @@ int onMatch(unsigned int, unsigned long long, unsigned long long, unsigned int, * matches" is enabled. */ static -int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to, - unsigned int, void *ctx) { - ScanContext *sc = static_cast(ctx); +int HS_CDECL onMatchEcho(unsigned int id, unsigned long long, + unsigned long long to, unsigned int, void *ctx) { + ScanHSContext *sc = static_cast(ctx); assert(sc); sc->result.matches++; @@ -114,7 +116,7 @@ int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to, return 0; } -EngineHyperscan::EngineHyperscan(hs_database_t *db_in, CompileStats cs) +EngineHyperscan::EngineHyperscan(hs_database_t *db_in, CompileHSStats cs) : db(db_in), compile_stats(std::move(cs)) { assert(db); } @@ -124,14 +126,15 @@ EngineHyperscan::~EngineHyperscan() { } unique_ptr EngineHyperscan::makeContext() const { - return ue2::make_unique(db); + return ue2::make_unique(db); } void EngineHyperscan::scan(const char *data, unsigned int len, unsigned int id, - ResultEntry &result, EngineContext &ctx) const { + ResultEntry &result, EngineContext &ectx) const { assert(data); - ScanContext sc(id, result, nullptr); + EngineHSContext &ctx = static_cast(ectx); + ScanHSContext sc(id, result, nullptr); auto callback = echo_matches ? 
onMatchEcho : onMatch; hs_error_t rv = hs_scan(db, data, len, 0, ctx.scratch, callback, &sc); @@ -144,11 +147,12 @@ void EngineHyperscan::scan(const char *data, unsigned int len, unsigned int id, void EngineHyperscan::scan_vectored(const char *const *data, const unsigned int *len, unsigned int count, unsigned streamId, ResultEntry &result, - EngineContext &ctx) const { + EngineContext &ectx) const { assert(data); assert(len); - ScanContext sc(streamId, result, nullptr); + EngineHSContext &ctx = static_cast(ectx); + ScanHSContext sc(streamId, result, nullptr); auto callback = echo_matches ? onMatchEcho : onMatch; hs_error_t rv = hs_scan_vector(db, data, len, count, 0, ctx.scratch, callback, &sc); @@ -159,9 +163,10 @@ void EngineHyperscan::scan_vectored(const char *const *data, } } -unique_ptr EngineHyperscan::streamOpen(EngineContext &ctx, +unique_ptr EngineHyperscan::streamOpen(EngineContext &ectx, unsigned streamId) const { - auto stream = ue2::make_unique(); + EngineHSContext &ctx = static_cast(ectx); + auto stream = ue2::make_unique(); stream->ctx = &ctx; hs_open_stream(db, 0, &stream->id); @@ -170,17 +175,18 @@ unique_ptr EngineHyperscan::streamOpen(EngineContext &ctx, return nullptr; } stream->sn = streamId; - return stream; + return move(stream); } void EngineHyperscan::streamClose(unique_ptr stream, ResultEntry &result) const { assert(stream); - auto &s = static_cast(*stream); - EngineContext &ctx = *s.ctx; + auto &s = static_cast(*stream); + EngineContext &ectx = *s.ctx; + EngineHSContext &ctx = static_cast(ectx); - ScanContext sc(0, result, &s); + ScanHSContext sc(0, result, &s); auto callback = echo_matches ? 
onMatchEcho : onMatch; assert(s.id); @@ -193,10 +199,10 @@ void EngineHyperscan::streamScan(EngineStream &stream, const char *data, ResultEntry &result) const { assert(data); - auto &s = static_cast(stream); - EngineContext &ctx = *s.ctx; + auto &s = static_cast(stream); + EngineHSContext &ctx = *s.ctx; - ScanContext sc(id, result, &s); + ScanHSContext sc(id, result, &s); auto callback = echo_matches ? onMatchEcho : onMatch; hs_error_t rv = hs_scan_stream(s.id, data, len, 0, ctx.scratch, callback, &sc); @@ -210,11 +216,12 @@ void EngineHyperscan::streamScan(EngineStream &stream, const char *data, void EngineHyperscan::streamCompressExpand(EngineStream &stream, vector &temp) const { size_t used = 0; - hs_error_t err = hs_compress_stream(stream.id, temp.data(), temp.size(), + auto &s = static_cast(stream); + hs_error_t err = hs_compress_stream(s.id, temp.data(), temp.size(), &used); if (err == HS_INSUFFICIENT_SPACE) { temp.resize(used); - err = hs_compress_stream(stream.id, temp.data(), temp.size(), &used); + err = hs_compress_stream(s.id, temp.data(), temp.size(), &used); } if (err != HS_SUCCESS) { @@ -223,10 +230,10 @@ void EngineHyperscan::streamCompressExpand(EngineStream &stream, } if (printCompressSize) { - printf("stream %u: compressed to %zu\n", stream.sn, used); + printf("stream %u: compressed to %zu\n", s.sn, used); } - err = hs_reset_and_expand_stream(stream.id, temp.data(), temp.size(), + err = hs_reset_and_expand_stream(s.id, temp.data(), temp.size(), nullptr, nullptr, nullptr); if (err != HS_SUCCESS) { @@ -243,15 +250,30 @@ void EngineHyperscan::printStats() const { } printf("Signatures: %s\n", compile_stats.signatures.c_str()); printf("Hyperscan info: %s\n", compile_stats.db_info.c_str()); +#ifndef _WIN32 printf("Expression count: %'zu\n", compile_stats.expressionCount); printf("Bytecode size: %'zu bytes\n", compile_stats.compiledSize); +#else + printf("Expression count: %zu\n", compile_stats.expressionCount); + printf("Bytecode size: %zu bytes\n", 
compile_stats.compiledSize); +#endif printf("Database CRC: 0x%x\n", compile_stats.crc32); if (compile_stats.streaming) { +#ifndef _WIN32 printf("Stream state size: %'zu bytes\n", compile_stats.streamSize); +#else + printf("Stream state size: %zu bytes\n", compile_stats.streamSize); +#endif } +#ifndef _WIN32 printf("Scratch size: %'zu bytes\n", compile_stats.scratchSize); printf("Compile time: %'0.3Lf seconds\n", compile_stats.compileSecs); printf("Peak heap usage: %'u bytes\n", compile_stats.peakMemorySize); +#else + printf("Scratch size: %zu bytes\n", compile_stats.scratchSize); + printf("Compile time: %0.3Lf seconds\n", compile_stats.compileSecs); + printf("Peak heap usage: %u bytes\n", compile_stats.peakMemorySize); +#endif } void EngineHyperscan::sqlStats(SqlDB &sqldb) const { @@ -469,7 +491,7 @@ buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode, hs_free_scratch(scratch); // Collect summary information. - CompileStats cs; + CompileHSStats cs; cs.sigs_name = sigs_name; if (!sigs_name.empty()) { const auto pos = name.find_last_of('/'); diff --git a/tools/hsbench/engine_hyperscan.h b/tools/hsbench/engine_hyperscan.h index d27aab757..a8105d753 100644 --- a/tools/hsbench/engine_hyperscan.h +++ b/tools/hsbench/engine_hyperscan.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,22 +30,15 @@ #define ENGINEHYPERSCAN_H #include "expressions.h" -#include "common.h" -#include "sqldb.h" +#include "engine.h" #include "hs_runtime.h" #include #include #include -/** Structure for the result of a single complete scan. */ -struct ResultEntry { - double seconds = 0; //!< Time taken for scan. - unsigned int matches = 0; //!< Count of matches found. 
-}; - /** Infomation about the database compile */ -struct CompileStats { +struct CompileHSStats { std::string sigs_name; std::string signatures; std::string db_info; @@ -60,38 +53,38 @@ struct CompileStats { }; /** Engine context which is allocated on a per-thread basis. */ -class EngineContext { +class EngineHSContext : public EngineContext { public: - explicit EngineContext(const hs_database_t *db); - ~EngineContext(); + explicit EngineHSContext(const hs_database_t *db); + ~EngineHSContext(); hs_scratch_t *scratch = nullptr; }; /** Streaming mode scans have persistent stream state associated with them. */ -class EngineStream { +class EngineHSStream : public EngineStream { public: + ~EngineHSStream(); hs_stream_t *id; - unsigned int sn; - EngineContext *ctx; + EngineHSContext *ctx; }; /** Hyperscan Engine for scanning data. */ -class EngineHyperscan { +class EngineHyperscan : public Engine { public: - explicit EngineHyperscan(hs_database_t *db, CompileStats cs); + explicit EngineHyperscan(hs_database_t *db, CompileHSStats cs); ~EngineHyperscan(); std::unique_ptr makeContext() const; void scan(const char *data, unsigned int len, unsigned int id, - ResultEntry &result, EngineContext &ctx) const; + ResultEntry &result, EngineContext &ectx) const; void scan_vectored(const char *const *data, const unsigned int *len, unsigned int count, unsigned int streamId, - ResultEntry &result, EngineContext &ctx) const; + ResultEntry &result, EngineContext &ectx) const; - std::unique_ptr streamOpen(EngineContext &ctx, + std::unique_ptr streamOpen(EngineContext &ectx, unsigned id) const; void streamClose(std::unique_ptr stream, @@ -109,7 +102,7 @@ class EngineHyperscan { private: hs_database_t *db; - CompileStats compile_stats; + CompileHSStats compile_stats; }; namespace ue2 { diff --git a/tools/hsbench/engine_pcre.cpp b/tools/hsbench/engine_pcre.cpp new file mode 100644 index 000000000..85616e987 --- /dev/null +++ b/tools/hsbench/engine_pcre.cpp @@ -0,0 +1,401 @@ +/* + * 
Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef _WIN32 +#define PCRE_STATIC +#endif +#include "config.h" + +#include "common.h" +#include "engine_pcre.h" +#include "heapstats.h" +#include "huge.h" +#include "sqldb.h" +#include "timer.h" + +#include "util/make_unique.h" +#include "util/unicode_def.h" + +#include + +using namespace std; + +EnginePCREContext::EnginePCREContext(int capture_cnt) { + ovec = (int *)malloc((capture_cnt + 1)* sizeof(int) * 3); +} + +EnginePCREContext::~EnginePCREContext() { + free(ovec); +} + +namespace /* anonymous */ { + +/** Scan context structure passed to the onMatch callback function. */ +struct ScanPCREContext { + ScanPCREContext(unsigned id_in, ResultEntry &result_in) + : id(id_in), result(result_in) {} + unsigned id; + ResultEntry &result; +}; + +} // namespace + +/** + * Function called for every match that PCRE produces, used when + * "echo matches" is off. + */ +static +int onMatch(ScanPCREContext *sc) { + assert(sc); + sc->result.matches++; + + return 0; +} + +/** + * Function called for every match that PCRE produces when "echo + * matches" is enabled. 
+ */ +static +int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to, + ScanPCREContext *sc) { + assert(sc); + sc->result.matches++; + + printf("Match @%u:%llu for %u\n", sc->id, to, id); + + return 0; +} + +EnginePCRE::EnginePCRE(vector> dbs_in, CompilePCREStats cs, + int capture_cnt_in) + : dbs(move(dbs_in)), compile_stats(move(cs)), + capture_cnt(capture_cnt_in) {} + +EnginePCRE::~EnginePCRE() { + for (auto &pcreDB : dbs) { + free(pcreDB->extra); + free(pcreDB->db); + } +} + +unique_ptr EnginePCRE::makeContext() const { + return ue2::make_unique(capture_cnt); +} + +void EnginePCRE::scan(const char *data, unsigned int len, unsigned int id, + ResultEntry &result, EngineContext &ectx) const { + assert(data); + + ScanPCREContext sc(id, result); + auto &ctx = static_cast(ectx); + int *ovec = ctx.ovec; + int ovec_size = (capture_cnt + 1) * 3; + for (const auto &pcreDB : dbs) { + int startoffset = 0; + bool utf8 = pcreDB->utf8; + bool highlander = pcreDB->highlander; + + int flags = 0; + int ret; + do { + ret = pcre_exec(pcreDB->db, pcreDB->extra, data, len, + startoffset, flags, ovec, ovec_size); + if (ret <= PCRE_ERROR_NOMATCH) { + break; + } + + int from = ovec[0]; + int to = ovec[1]; + assert(from <= to); + + if (echo_matches) { + onMatchEcho(pcreDB->id, from, to, &sc); + } else { + onMatch(&sc); + } + + // If we only wanted a single match, we're done. + if (highlander) { + break; + } + + // Next scan starts at the first codepoint after the match. It's + // possible that we have a vacuous match, in which case we must step + // past it to ensure that we always progress. 
+ if (from != to) { + startoffset = to; + } else if (utf8) { + startoffset = to + 1; + while (startoffset < (int)len && + ((data[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) { + ++startoffset; + } + } else { + startoffset = to + 1; + } + } while (startoffset <= (int)len); + + if (ret < PCRE_ERROR_NOMATCH) { + printf("Fatal error: pcre returned error %d\n", ret); + abort(); + } + } +} + +// vectoring scan +void EnginePCRE::scan_vectored(UNUSED const char *const *data, + UNUSED const unsigned int *len, + UNUSED unsigned int count, + UNUSED unsigned int streamId, + UNUSED ResultEntry &result, + UNUSED EngineContext &ectx) const { + printf("PCRE matcher can't support vectored mode.\n"); + abort(); +} + +unique_ptr EnginePCRE::streamOpen(UNUSED EngineContext &ectx, + UNUSED unsigned id) const { + printf("PCRE matcher can't stream.\n"); + abort(); +} + +void EnginePCRE::streamClose(UNUSED unique_ptr stream, + UNUSED ResultEntry &result) const { + printf("PCRE matcher can't stream.\n"); + abort(); +} + +void EnginePCRE::streamScan(UNUSED EngineStream &stream, + UNUSED const char *data, + UNUSED unsigned len, UNUSED unsigned id, + UNUSED ResultEntry &result) const { + printf("PCRE matcher can't stream.\n"); + abort(); +} + +void EnginePCRE::streamCompressExpand(UNUSED EngineStream &stream, + UNUSED vector &temp) const { + printf("PCRE matcher can't stream.\n"); + abort(); +} + +void EnginePCRE::printStats() const { + // Output summary information. 
+ if (!compile_stats.sigs_name.empty()) { + printf("Signature set: %s\n", compile_stats.sigs_name.c_str()); + } + printf("Signatures: %s\n", compile_stats.signatures.c_str()); + printf("PCRE info: %s\n", compile_stats.db_info.c_str()); +#ifndef _WIN32 + printf("Expression count: %'zu\n", compile_stats.expressionCount); + printf("Bytecode size: %'zu bytes\n", compile_stats.compiledSize); + printf("Scratch size: %'zu bytes\n", compile_stats.scratchSize); + printf("Compile time: %'0.3Lf seconds\n", compile_stats.compileSecs); + printf("Peak heap usage: %'u bytes\n", compile_stats.peakMemorySize); +#else + printf("Expression count: %zu\n", compile_stats.expressionCount); + printf("Bytecode size: %zu bytes\n", compile_stats.compiledSize); + printf("Scratch size: %zu bytes\n", compile_stats.scratchSize); + printf("Compile time: %0.3Lf seconds\n", compile_stats.compileSecs); + printf("Peak heap usage: %u bytes\n", compile_stats.peakMemorySize); +#endif +} + +void EnginePCRE::sqlStats(SqlDB &sqldb) const { + ostringstream crc; + + static const string Q = + "INSERT INTO Compile (" + "sigsName, signatures, dbInfo, exprCount, dbSize, crc," + "scratchSize, compileSecs, peakMemory) " + "VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)"; + + sqldb.insert_all(Q, compile_stats.sigs_name, compile_stats.signatures, + compile_stats.db_info, compile_stats.expressionCount, + compile_stats.compiledSize, crc.str(), + compile_stats.scratchSize, compile_stats.compileSecs, + compile_stats.peakMemorySize); +} + +static +bool decodeExprPCRE(string &expr, unsigned *flags, struct PcreDB &db) { + if (expr[0] != '/') { + return false; + } + + size_t end = expr.find_last_of('/'); + if (end == string::npos) { + return false; + } + string strFlags = expr.substr(end + 1, expr.length() - end - 1); + + // strip starting and trailing slashes and the flags + expr.erase(end, expr.length() - end); + expr.erase(0, 1); + + // decode the flags + *flags = 0; + for (size_t i = 0; i != strFlags.length(); ++i) { + 
switch (strFlags[i]) { + case 's': + *flags |= PCRE_DOTALL; + break; + case 'm': + *flags |= PCRE_MULTILINE; + break; + case 'i': + *flags |= PCRE_CASELESS; + break; + case '8': + *flags |= PCRE_UTF8; + db.utf8 = true; + break; + case 'W': + *flags |= PCRE_UCP; + break; + case 'H': + db.highlander = true; + break; + default: + return false; + } + } + + return true; +} + +unique_ptr +buildEnginePcre(const ExpressionMap &expressions, const string &name, + const string &sigs_name) { + if (expressions.empty()) { + assert(0); + return nullptr; + } + + long double compileSecs = 0.0; + size_t compiledSize = 0.0; + unsigned int peakMemorySize = 0; + string db_info("Version: "); + db_info += string(pcre_version()); + + vector> dbs; + int capture_cnt = 0; + + Timer timer; + timer.start(); + + for (const auto &m : expressions) { + string expr(m.second); + unsigned int flags = 0; + auto pcreDB = ue2::make_unique(); + if (!decodeExprPCRE(expr, &flags, *pcreDB)) { + printf("Error parsing PCRE: %s (id %u)\n", m.second.c_str(), + m.first); + return nullptr; + } + + const char *errp; + int erro; + pcre *db = pcre_compile(expr.c_str(), flags, &errp, &erro, NULL); + + if (!db) { + printf("Compile error %s\n", errp); + return nullptr; + } + + pcre_extra *extra = pcre_study(db, PCRE_STUDY_JIT_COMPILE, &errp); + if (errp) { + printf("PCRE could not be studied: %s\n", errp); + return nullptr; + } + if (!extra) { + extra = (pcre_extra *)malloc(sizeof(pcre_extra)); + } + int cap = 0; // PCRE_INFO_CAPTURECOUNT demands an int + if (pcre_fullinfo(db, extra, PCRE_INFO_CAPTURECOUNT, &cap)) { + printf("PCRE fullinfo error\n"); + free(extra); + free(db); + return nullptr; + } + assert(cap >= 0); + capture_cnt = max(capture_cnt, cap); + + size_t db_size = 0; + if (pcre_fullinfo(db, extra, PCRE_INFO_SIZE, &db_size)) { + printf("PCRE fullinfo error\n"); + free(extra); + free(db); + return nullptr; + } + + size_t study_size = 0; + if (pcre_fullinfo(db, extra, PCRE_INFO_STUDYSIZE, + &study_size)) { + 
printf("PCRE fullinfo error\n"); + free(extra); + free(db); + return nullptr; + } + compiledSize += db_size + study_size; + + pcreDB->id = m.first; + pcreDB->db = db; + + extra->flags = + PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION; + extra->match_limit = 10000000; + extra->match_limit_recursion = 1500; + + pcreDB->extra = extra; + dbs.push_back(move(pcreDB)); + } + + timer.complete(); + compileSecs = timer.seconds(); + peakMemorySize = getPeakHeap(); + + // Collect summary information. + CompilePCREStats cs; + cs.sigs_name = sigs_name; + if (!sigs_name.empty()) { + const auto pos = name.find_last_of('/'); + cs.signatures = name.substr(pos + 1); + } else { + cs.signatures = name; + } + cs.db_info = db_info; + cs.expressionCount = expressions.size(); + cs.compiledSize = compiledSize; + cs.scratchSize = (capture_cnt + 1) * sizeof(int) * 3; + cs.compileSecs = compileSecs; + cs.peakMemorySize = peakMemorySize; + + return ue2::make_unique(move(dbs), move(cs), capture_cnt); +} diff --git a/tools/hsbench/engine_pcre.h b/tools/hsbench/engine_pcre.h new file mode 100644 index 000000000..2e7dad9c5 --- /dev/null +++ b/tools/hsbench/engine_pcre.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ENGINEPCRE_H +#define ENGINEPCRE_H + +#include "expressions.h" +#include "engine.h" + +#include + +#include +#include +#include + +/** Infomation about the database compile */ +struct CompilePCREStats { + std::string sigs_name; + std::string signatures; + std::string db_info; + size_t expressionCount = 0; + size_t compiledSize = 0; + size_t scratchSize = 0; + long double compileSecs = 0; + unsigned int peakMemorySize = 0; +}; + +/** Engine context which is allocated on a per-thread basis. */ +class EnginePCREContext : public EngineContext{ +public: + explicit EnginePCREContext(int capture_cnt); + ~EnginePCREContext(); + + int *ovec = nullptr; +}; + +struct PcreDB { + bool highlander = false; + bool utf8 = false; + u32 id; + pcre *db = nullptr; + pcre_extra *extra = nullptr; +}; + +/** PCRE Engine for scanning data. 
*/ +class EnginePCRE : public Engine { +public: + explicit EnginePCRE(std::vector> dbs_in, + CompilePCREStats cs, int capture_cnt_in); + ~EnginePCRE(); + + std::unique_ptr makeContext() const; + + void scan(const char *data, unsigned int len, unsigned int id, + ResultEntry &result, EngineContext &ectx) const; + + void scan_vectored(const char *const *data, const unsigned int *len, + unsigned int count, unsigned int streamId, + ResultEntry &result, EngineContext &ectx) const; + + std::unique_ptr streamOpen(EngineContext &ectx, + unsigned id) const; + + void streamClose(std::unique_ptr stream, + ResultEntry &result) const; + + void streamCompressExpand(EngineStream &stream, + std::vector &temp) const; + + void streamScan(EngineStream &stream, const char *data, unsigned int len, + unsigned int id, ResultEntry &result) const; + + void printStats() const; + + void sqlStats(SqlDB &db) const; + +private: + std::vector> dbs; + + CompilePCREStats compile_stats; + + int capture_cnt; +}; + +std::unique_ptr +buildEnginePcre(const ExpressionMap &expressions, const std::string &name, + const std::string &sigs_name); + +#endif // ENGINEPCRE_H diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp index ae46de774..fecdd3305 100644 --- a/tools/hsbench/main.cpp +++ b/tools/hsbench/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,6 +31,10 @@ #include "common.h" #include "data_corpus.h" #include "engine_hyperscan.h" +#if defined(HS_HYBRID) +#include "engine_chimera.h" +#include "engine_pcre.h" +#endif #include "expressions.h" #include "sqldb.h" #include "thread_barrier.h" @@ -54,7 +58,11 @@ #include #include +#ifndef _WIN32 #include +#else +#include "win_getopt.h" +#endif #ifndef _WIN32 #include #if defined(HAVE_PTHREAD_NP_H) @@ -87,6 +95,8 @@ namespace 
/* anonymous */ { bool display_per_scan = false; ScanMode scan_mode = ScanMode::STREAMING; +bool useHybrid = false; +bool usePcre = false; unsigned repeats = 20; string exprPath(""); string corpusFile(""); @@ -102,7 +112,7 @@ typedef void (*thread_func_t)(void *context); class ThreadContext : boost::noncopyable { public: - ThreadContext(unsigned num_in, const EngineHyperscan &db_in, + ThreadContext(unsigned num_in, const Engine &db_in, thread_barrier &tb_in, thread_func_t function_in, vector corpus_data_in) : num(num_in), results(repeats), engine(db_in), @@ -132,6 +142,16 @@ class ThreadContext : boost::noncopyable { // Apply processor affinity (if available) to this thread. bool affine(UNUSED int cpu) { + +#if defined(_WIN32) + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + assert(cpu >= 0 && (DWORD)cpu < system_info.dwNumberOfProcessors); + DWORD_PTR mask = 1 << cpu; + DWORD_PTR rv = SetThreadAffinityMask(thr.native_handle(), mask); + return rv != 0; +#endif + #ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP #if defined(__FreeBSD__) cpuset_t cpuset; @@ -155,7 +175,7 @@ class ThreadContext : boost::noncopyable { unsigned num; Timer timer; vector results; - const EngineHyperscan &engine; + const Engine &engine; unique_ptr enginectx; vector corpus_data; @@ -181,7 +201,11 @@ void usage(const char *error) { " (default: streaming).\n"); printf(" -V Benchmark in vectored mode" " (default: streaming).\n"); -#ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP +#if defined(HS_HYBRID) + printf(" -H Benchmark using Chimera (if supported).\n"); + printf(" -P Benchmark using PCRE (if supported).\n"); +#endif +#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) || defined(_WIN32) printf(" -T CPU,CPU,... 
Benchmark with threads on these CPUs.\n"); #endif printf(" -i DIR Don't compile, load from files in DIR" @@ -214,8 +238,8 @@ struct BenchmarkSigs { static void processArgs(int argc, char *argv[], vector &sigSets, UNUSED unique_ptr &grey) { - const char options[] = "-b:c:Cd:e:E:G:hi:n:No:p:sS:Vw:z:" -#ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP + const char options[] = "-b:c:Cd:e:E:G:hHi:n:No:p:PsS:Vw:z:" +#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) || defined(_WIN32) "T:" // add the thread flag #endif ; @@ -287,6 +311,14 @@ void processArgs(int argc, char *argv[], vector &sigSets, usage(nullptr); exit(0); break; + case 'H': +#if defined(HS_HYBRID) + useHybrid = true; +#else + usage("Hybrid matcher not enabled in this build"); + exit(1); +#endif + break; case 'n': if (!fromString(optarg, repeats) || repeats == 0) { usage("Couldn't parse argument to -n flag, should be" @@ -294,6 +326,14 @@ void processArgs(int argc, char *argv[], vector &sigSets, exit(1); } break; + case 'P': +#if defined(HS_HYBRID) + usePcre = true; +#else + usage("PCRE matcher not enabled in this build"); + exit(1); +#endif + break; case 's': in_sigfile = 2; break; @@ -306,7 +346,7 @@ void processArgs(int argc, char *argv[], vector &sigSets, case 'S': sigName.assign(optarg); break; -#ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP +#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) || defined(_WIN32) case 'T': if (!strToList(optarg, threadCores)) { usage("Couldn't parse argument to -T flag, should be" @@ -399,6 +439,24 @@ void processArgs(int argc, char *argv[], vector &sigSets, exit(1); } + // Constraints on Chimera and PCRE engines + if (useHybrid || usePcre) { + if (useHybrid && usePcre) { + usage("Can't run both Chimera and PCRE."); + exit(1); + } + if (scan_mode != ScanMode::BLOCK) { + usage("Must specify block mode in Chimera or PCRE with " + "the -N option."); + exit(1); + } + + if (forceEditDistance || loadDatabases || saveDatabases) { + usage("No extended options are supported in Chimera or PCRE."); + 
exit(1); + } + } + // Read in any -s signature sets. for (const auto &file : sigFiles) { SignatureSet sigs; @@ -503,7 +561,7 @@ static void benchStreamingInternal(ThreadContext *ctx, vector &streams, bool do_compress) { assert(ctx); - const EngineHyperscan &e = ctx->engine; + const Engine &e = ctx->engine; const vector &blocks = ctx->corpus_data; vector compress_buf(do_compress ? 1000 : 0); @@ -660,7 +718,11 @@ void displayPerScanResults(const vector> &threads, for (size_t j = 0; j != results.size(); j++) { const auto &r = results[j]; double mbps = calc_mbps(r.seconds, bytesPerRun); +#ifndef _WIN32 printf("T %2u Scan %2zu: %'0.2f Mbit/sec\n", t->num, j, mbps); +#else + printf("T %2u Scan %2zu: %0.2f Mbit/sec\n", t->num, j, mbps); +#endif } } printf("\n"); @@ -705,6 +767,7 @@ void displayResults(const vector> &threads, } } +#ifndef _WIN32 printf("Time spent scanning: %'0.3f seconds\n", totalSecs); printf("Corpus size: %'llu bytes ", bytesPerRun); switch (scan_mode) { @@ -720,22 +783,56 @@ void displayResults(const vector> &threads, printf("(%'zu blocks)\n", corpus_blocks.size()); break; } +#else + printf("Time spent scanning: %0.3f seconds\n", totalSecs); + printf("Corpus size: %llu bytes ", bytesPerRun); + switch (scan_mode) { + case ScanMode::STREAMING: + printf("(%zu blocks in %llu streams)\n", corpus_blocks.size(), + count_streams(corpus_blocks)); + break; + case ScanMode::VECTORED: + printf("(%zu blocks in %llu vectors)\n", corpus_blocks.size(), + count_streams(corpus_blocks)); + break; + case ScanMode::BLOCK: + printf("(%zu blocks)\n", corpus_blocks.size()); + break; + } +#endif u64a totalBytes = bytesPerRun * repeats * threads.size(); u64a totalBlocks = corpus_blocks.size() * repeats * threads.size(); double matchRate = ((double)matchesPerRun * 1024) / bytesPerRun; +#ifndef _WIN32 printf("Matches per iteration: %'llu (%'0.3f matches/kilobyte)\n", matchesPerRun, matchRate); +#else + printf("Matches per iteration: %llu (%0.3f matches/kilobyte)\n", + 
matchesPerRun, matchRate); +#endif double blockRate = (double)totalBlocks / (double)totalSecs; +#ifndef _WIN32 printf("Overall block rate: %'0.2f blocks/sec\n", blockRate); printf("Mean throughput (overall): %'0.2Lf Mbit/sec\n", calc_mbps(totalSecs, totalBytes)); +#else + printf("Overall block rate: %0.2f blocks/sec\n", blockRate); + printf("Mean throughput (overall): %0.2Lf Mbit/sec\n", + calc_mbps(totalSecs, totalBytes)); + +#endif double lowestScanTime = fastestResult(threads); +#ifndef _WIN32 printf("Max throughput (per core): %'0.2Lf Mbit/sec\n", calc_mbps(lowestScanTime, bytesPerRun)); +#else + printf("Max throughput (per core): %0.2Lf Mbit/sec\n", + calc_mbps(lowestScanTime, bytesPerRun)); +#endif printf("\n"); if (display_per_scan) { @@ -812,7 +909,7 @@ void sqlResults(const vector> &threads, * the same copy of the data. */ static -unique_ptr makeThreadContext(const EngineHyperscan &db, +unique_ptr makeThreadContext(const Engine &db, const vector &blocks, unsigned id, thread_barrier &sync_barrier) { @@ -839,7 +936,7 @@ unique_ptr makeThreadContext(const EngineHyperscan &db, /** Run the given benchmark. */ static -void runBenchmark(const EngineHyperscan &db, +void runBenchmark(const Engine &db, const vector &corpus_blocks) { size_t numThreads; bool useAffinity = false; @@ -848,7 +945,7 @@ void runBenchmark(const EngineHyperscan &db, numThreads = 1; } else { numThreads = threadCores.size(); -#ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP +#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) || defined(_WIN32) useAffinity = true; #else useAffinity = false; @@ -888,7 +985,7 @@ void runBenchmark(const EngineHyperscan &db, } // namespace /** Main driver. 
*/ -int main(int argc, char *argv[]) { +int HS_CDECL main(int argc, char *argv[]) { unique_ptr grey; #if !defined(RELEASE_BUILD) grey = make_unique(); @@ -936,8 +1033,18 @@ int main(int argc, char *argv[]) { continue; } - auto engine = buildEngineHyperscan(exprMap, scan_mode, s.name, - sigName, *grey); + unique_ptr engine; + if (useHybrid) { +#if defined(HS_HYBRID) + engine = buildEngineChimera(exprMap, s.name, sigName); + } else if (usePcre) { + engine = buildEnginePcre(exprMap, s.name, sigName); +#endif + } else { + engine = buildEngineHyperscan(exprMap, scan_mode, s.name, + sigName, *grey); + } + if (!engine) { printf("Error: expressions failed to compile.\n"); exit(1); diff --git a/tools/hscheck/CMakeLists.txt b/tools/hscheck/CMakeLists.txt index 065d4c04b..8f45765a8 100644 --- a/tools/hscheck/CMakeLists.txt +++ b/tools/hscheck/CMakeLists.txt @@ -5,6 +5,21 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") SET(hscheck_SOURCES main.cpp ) -add_executable(hscheck ${hscheck_SOURCES}) -target_link_libraries(hscheck hs expressionutil pthread) +if (BUILD_CHIMERA) + include_directories(${PCRE_INCLUDE_DIRS}) + add_definitions(-DHS_HYBRID) + add_executable(hscheck ${hscheck_SOURCES}) + if(NOT WIN32) + target_link_libraries(hscheck hs chimera ${PCRE_LDFLAGS} expressionutil pthread) + else() + target_link_libraries(hscheck hs chimera pcre expressionutil) + endif() +else() + add_executable(hscheck ${hscheck_SOURCES}) + if(NOT WIN32) + target_link_libraries(hscheck hs expressionutil pthread) + else() + target_link_libraries(hscheck hs expressionutil) + endif() +endif() diff --git a/tools/hscheck/main.cpp b/tools/hscheck/main.cpp index 59f802446..595c8b84f 100644 --- a/tools/hscheck/main.cpp +++ b/tools/hscheck/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following 
conditions are met: @@ -59,14 +59,22 @@ #include "hs_internal.h" #include "ue2common.h" +#ifdef HS_HYBRID +#include +#include "chimera/ch.h" +#endif + #include #include #include #include #include #include - +#ifndef _WIN32 #include +#else +#include "win_getopt.h" +#endif #include using namespace std; @@ -77,17 +85,36 @@ namespace /* anonymous */ { // are we in streaming mode? (default: yes) bool g_streaming = true; bool g_vectored = false; +bool g_hybrid = false; string g_exprPath(""); string g_signatureFile(""); bool g_allSignatures = false; bool g_forceEditDistance = false; bool build_sigs = false; +bool check_logical = false; unsigned int g_signature; unsigned int g_editDistance; unsigned int globalFlags = 0; unsigned int num_of_threads = 1; unsigned int countFailures = 0; +class ParsedExpr { +public: + ParsedExpr(string regex_in, unsigned int flags_in, hs_expr_ext ext_in) + : regex(regex_in), flags(flags_in), ext(ext_in) {} + ~ParsedExpr() {} + string regex; + unsigned int flags; + hs_expr_ext ext; +}; + +typedef map ExprExtMap; +ExprExtMap g_combs; +ExprExtMap g_validSubs; + +// Iterator pointing to next logical expression to process. +ExprExtMap::const_iterator comb_read_it; + // Global greybox structure, used in non-release builds. unique_ptr g_grey; @@ -106,6 +133,12 @@ std::mutex lk_read; // Mutex serialising access to output map and stdout. std::mutex lk_output; +// Mutex guarding access to write g_combs. +std::mutex lk_write_comb; + +// Mutex guarding access to write g_validSubs. +std::mutex lk_write_sub; + // Possible values for pattern check results. 
enum ExprStatus {NOT_PROCESSED, SUCCESS, FAILURE}; @@ -126,6 +159,32 @@ bool getNextExpressionId(ExpressionMap::const_iterator &it) { } } +static +bool getNextLogicalExpression(ExprExtMap::const_iterator &it) { + lock_guard lock(lk_read); + if (comb_read_it != g_combs.end()) { + it = comb_read_it; + ++comb_read_it; + return true; + } else { + return false; + } +} + +static +void cacheCombExpr(unsigned id, const string ®ex, unsigned int flags, + const hs_expr_ext &ext) { + lock_guard lock(lk_write_comb); + g_combs.emplace(id, ParsedExpr(regex, flags, ext)); +} + +static +void cacheSubExpr(unsigned id, const string ®ex, unsigned int flags, + const hs_expr_ext &ext) { + lock_guard lock(lk_write_sub); + g_validSubs.emplace(id, ParsedExpr(regex, flags, ext)); +} + // This function prints the Pattern IDs order // It creates the output for build sigs // Caller is required to hold lk_output when calling this function @@ -221,10 +280,146 @@ void checkExpression(UNUSED void *threadarg) { ext.flags |= HS_EXT_FLAG_EDIT_DISTANCE; } + if (flags & HS_FLAG_COMBINATION) { + if (check_logical) { + cacheCombExpr(it->first, regex, flags, ext); + } else { + recordFailure(g_exprMap, it->first, "Unsupported flag used."); + } + continue; + } + // Try and compile a database. const char *regexp = regex.c_str(); - const hs_expr_ext *extp = &ext; + hs_error_t err; + + if (g_hybrid) { +#ifdef HS_HYBRID + ch_compile_error_t *ch_compile_err; + ch_database_t *hybrid_db = nullptr; + err = ch_compile_multi(®exp, &flags, nullptr, 1, CH_MODE_GROUPS, + nullptr, &hybrid_db, &ch_compile_err); + if (err == HS_SUCCESS) { + assert(hybrid_db); + recordSuccess(g_exprMap, it->first); + ch_free_database(hybrid_db); + } else { + assert(!hybrid_db); + assert(ch_compile_err); + recordFailure(g_exprMap, it->first, ch_compile_err->message); + ch_free_compile_error(ch_compile_err); + } +#else + cerr << "Hybrid mode not available in this build." 
<< endl; + exit(1); +#endif // HS_HYBRID + } else { + const hs_expr_ext *extp = &ext; + hs_compile_error_t *compile_err; + hs_database_t *db = nullptr; + +#if !defined(RELEASE_BUILD) + // This variant is available in non-release builds and allows us to + // modify greybox settings. + err = hs_compile_multi_int(®exp, &flags, nullptr, &extp, 1, mode, + nullptr, &db, &compile_err, *g_grey); +#else + err = hs_compile_ext_multi(®exp, &flags, nullptr, &extp, 1, mode, + nullptr, &db, &compile_err); +#endif + + if (err == HS_SUCCESS) { + assert(db); + recordSuccess(g_exprMap, it->first); + hs_free_database(db); + if (check_logical) { + cacheSubExpr(it->first, regex, flags, ext); + } + } else { + assert(!db); + assert(compile_err); + recordFailure(g_exprMap, it->first, compile_err->message); + hs_free_compile_error(compile_err); + } + } + } +} + +static +bool fetchSubIds(const char *logical, vector &ids) { + unsigned mult = 1; + unsigned id = 0; + for (int i = strlen(logical) - 1; i >= 0; i--) { + if (isdigit(logical[i])) { + if (mult > 100000000) { + return false; + } + id += (logical[i] - '0') * mult; + mult *= 10; + } else if (mult > 1) { + ids.push_back(id); + mult = 1; + id = 0; + } + } + if (mult > 1) { + ids.push_back(id); + } + return true; +} + +static +void checkLogicalExpression(UNUSED void *threadarg) { + unsigned int mode = g_streaming ? HS_MODE_STREAM + : g_vectored ? HS_MODE_VECTORED + : HS_MODE_BLOCK; + if (g_streaming) { + // Use SOM mode, for permissiveness' sake. 
+ mode |= HS_MODE_SOM_HORIZON_LARGE; + } + + ExprExtMap::const_iterator it; + while (getNextLogicalExpression(it)) { + const ParsedExpr &comb = it->second; + + vector subIds; + if (!fetchSubIds(comb.regex.c_str(), subIds)) { + recordFailure(g_exprMap, it->first, "Sub-expression id too large."); + continue; + } + + vector regexv; + vector flagsv; + vector idv; + vector extv; + bool valid = true; + + for (const auto i : subIds) { + ExprExtMap::const_iterator jt = g_validSubs.find(i); + if (jt != g_validSubs.end()) { + const ParsedExpr &sub = jt->second; + regexv.push_back(sub.regex.c_str()); + flagsv.push_back(sub.flags); + idv.push_back(i); + extv.push_back(&sub.ext); + } else { + valid = false; + break; + } + } + + if (valid) { + regexv.push_back(comb.regex.c_str()); + flagsv.push_back(comb.flags); + idv.push_back(it->first); + extv.push_back(&comb.ext); + } else { + recordFailure(g_exprMap, it->first, "Sub-expression id not valid."); + continue; + } + + // Try and compile a database. hs_error_t err; hs_compile_error_t *compile_err; hs_database_t *db = nullptr; @@ -232,10 +427,12 @@ void checkExpression(UNUSED void *threadarg) { #if !defined(RELEASE_BUILD) // This variant is available in non-release builds and allows us to // modify greybox settings. - err = hs_compile_multi_int(®exp, &flags, nullptr, &extp, 1, mode, + err = hs_compile_multi_int(regexv.data(), flagsv.data(), idv.data(), + extv.data(), regexv.size(), mode, nullptr, &db, &compile_err, *g_grey); #else - err = hs_compile_ext_multi(®exp, &flags, nullptr, &extp, 1, mode, + err = hs_compile_ext_multi(regexv.data(), flagsv.data(), idv.data(), + extv.data(), regexv.size(), mode, nullptr, &db, &compile_err); #endif @@ -264,17 +461,21 @@ void usage() { #endif << " -V Operate in vectored mode." << endl << " -N Operate in block mode (default: streaming)." << endl +#ifdef HS_HYBRID + << " -H Operate in hybrid mode." << endl +#endif << " -L Pass HS_FLAG_SOM_LEFTMOST for all expressions (default: off)." 
<< endl << " -8 Force UTF8 mode on all patterns." << endl << " -T NUM Run with NUM threads." << endl << " -h Display this help." << endl << " -B Build signature set." << endl + << " -C Check logical combinations (default: off)." << endl << endl; } static void processArgs(int argc, char *argv[], UNUSED unique_ptr &grey) { - const char options[] = "e:E:s:z:hLNV8G:T:B"; + const char options[] = "e:E:s:z:hHLNV8G:T:BC"; bool signatureSet = false; for (;;) { @@ -326,12 +527,18 @@ void processArgs(int argc, char *argv[], UNUSED unique_ptr &grey) { g_streaming = false; g_vectored = true; break; + case 'H': + g_hybrid = true; + break; case 'T': num_of_threads = atoi(optarg); break; case 'B': build_sigs = true; break; + case 'C': + check_logical = true; + break; default: usage(); exit(1); @@ -421,7 +628,7 @@ void loadSignatureBuildSigs(const string &inFile, } } -int main(int argc, char **argv) { +int HS_CDECL main(int argc, char **argv) { num_of_threads = max(1u, std::thread::hardware_concurrency()); #if !defined(RELEASE_BUILD) @@ -468,6 +675,18 @@ int main(int argc, char **argv) { threads[i].join(); } + if (check_logical) { + comb_read_it = g_combs.begin(); + + for (unsigned int i = 0; i < num_of_threads; i++) { + threads[i] = thread(checkLogicalExpression, nullptr); + } + + for (unsigned int i = 0; i < num_of_threads; i++) { + threads[i].join(); + } + } + if (!g_exprMap.empty() && !build_sigs) { cout << "SUMMARY: " << countFailures << " of " << g_exprMap.size() << " failed." 
<< endl; diff --git a/tools/hscollider/CMakeLists.txt b/tools/hscollider/CMakeLists.txt index f05b444fb..4684964fb 100644 --- a/tools/hscollider/CMakeLists.txt +++ b/tools/hscollider/CMakeLists.txt @@ -1,9 +1,3 @@ -# we have a fixed requirement for PCRE -set(PCRE_REQUIRED_MAJOR_VERSION 8) -set(PCRE_REQUIRED_MINOR_VERSION 41) -set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION}) - -include (${CMAKE_MODULE_PATH}/pcre.cmake) if (NOT CORRECT_PCRE_VERSION) message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} not found, not building hscollider") return() @@ -29,6 +23,10 @@ set_source_files_properties( ragelmaker(ColliderCorporaParser.rl) +if (BUILD_CHIMERA) + add_definitions(-DHS_HYBRID) +endif() + # only set these after all tests are done set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") @@ -69,17 +67,29 @@ add_dependencies(hscollider ragel_ColliderCorporaParser) add_dependencies(hscollider pcre) if(NOT WIN32) - target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil - expressionutil corpusomatic crosscompileutil pthread - "${BACKTRACE_LDFLAGS}") + if (BUILD_CHIMERA) + target_link_libraries(hscollider hs chimera ${PCRE_LDFLAGS} databaseutil + expressionutil corpusomatic crosscompileutil pthread + "${BACKTRACE_LDFLAGS}") + else() + target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil + expressionutil corpusomatic crosscompileutil pthread + "${BACKTRACE_LDFLAGS}") + endif() if(HAVE_BACKTRACE) set_source_files_properties(hscollider_SOURCES COMPILE_FLAGS "${BACKTRACE_CFLAGS}") endif() else() # WIN32 - target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil - expressionutil corpusomatic crosscompileutil) + set_target_properties(hscollider PROPERTIES LINK_FLAGS "/STACK:8388608,8388608") + if (BUILD_CHIMERA) + target_link_libraries(hscollider hs chimera pcre databaseutil + expressionutil corpusomatic crosscompileutil) + else() + 
target_link_libraries(hscollider hs pcre databaseutil + expressionutil corpusomatic crosscompileutil) + endif() endif() add_custom_target( diff --git a/tools/hscollider/DatabaseProxy.h b/tools/hscollider/DatabaseProxy.h index 13b6f680f..831ab1484 100644 --- a/tools/hscollider/DatabaseProxy.h +++ b/tools/hscollider/DatabaseProxy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -54,10 +54,10 @@ class DatabaseProxy : boost::noncopyable { explicit DatabaseProxy(const std::set &expr_ids) : ids(expr_ids) {} - explicit DatabaseProxy(std::shared_ptr built_db) + explicit DatabaseProxy(std::shared_ptr built_db) : db(built_db) {} - std::shared_ptr get(const UltimateTruth &ultimate) { + std::shared_ptr get(const UltimateTruth &ultimate) { std::lock_guard lock(mutex); if (failed) { // We have previously failed to compile this database. @@ -80,7 +80,7 @@ class DatabaseProxy : boost::noncopyable { private: std::mutex mutex; - std::shared_ptr db; + std::shared_ptr db; std::set ids; bool failed = false; // Database failed compilation. 
}; diff --git a/tools/hscollider/GraphTruth.cpp b/tools/hscollider/GraphTruth.cpp index 5c4cd8e75..b4b3f809b 100644 --- a/tools/hscollider/GraphTruth.cpp +++ b/tools/hscollider/GraphTruth.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -48,6 +48,7 @@ #include "nfagraph/ng_util.h" #include "parser/Parser.h" #include "parser/unsupported.h" +#include "parser/logical_combination.h" #include "util/compile_context.h" #include "util/make_unique.h" #include "util/report_manager.h" @@ -69,8 +70,11 @@ class CompiledNG : boost::noncopyable { CompiledNG(unique_ptr g_in, unique_ptr rm_in) : g(std::move(g_in)), rm(std::move(rm_in)) {} + CompiledNG(unique_ptr pl_in) + : pl(std::move(pl_in)) {} unique_ptr g; unique_ptr rm; + unique_ptr pl; }; static @@ -126,6 +130,14 @@ void CNGInfo::compile() { } try { + if (combination) { + auto pl = ue2::make_unique(); + pl->parseLogicalCombination(id, re.c_str(), ~0U, 0, ~0ULL); + pl->logicalKeyRenumber(); + cng = make_unique(move(pl)); + return; + } + bool isStreaming = colliderMode == MODE_STREAMING; bool isVectored = colliderMode == MODE_VECTORED; CompileContext cc(isStreaming, isVectored, get_current_target(), @@ -199,6 +211,8 @@ unique_ptr GraphTruth::preprocess(unsigned id, bool highlander = false; bool prefilter = false; bool som = false; + bool combination = false; + bool quiet = false; auto i = m_expr.find(id); if (i == m_expr.end()) { @@ -214,7 +228,8 @@ unique_ptr GraphTruth::preprocess(unsigned id, throw NGCompileFailure("Cannot parse expression flags."); } // read PCRE flags - if (!getPcreFlags(hs_flags, &flags, &highlander, &prefilter, &som)) { + if (!getPcreFlags(hs_flags, &flags, &highlander, &prefilter, &som, + &combination, &quiet)) { throw NGCompileFailure("Cannot get PCRE flags."); } if (force_utf8) { @@ 
-247,6 +262,8 @@ unique_ptr GraphTruth::preprocess(unsigned id, cngi->highlander = highlander; cngi->prefilter = prefilter; cngi->som = som; + cngi->combination = combination; + cngi->quiet = quiet; cngi->min_offset = ext.min_offset; cngi->max_offset = ext.max_offset; cngi->min_length = ext.min_length; @@ -256,8 +273,95 @@ unique_ptr GraphTruth::preprocess(unsigned id, return cngi; } +/** \brief Returns 1 if compliant to all logical combinations. */ +static +char isLogicalCombination(vector &lv, const vector &comb, + size_t lkeyCount, unsigned start, unsigned result) { + assert(start <= result); + for (unsigned i = start; i <= result; i++) { + const LogicalOp &op = comb[i - lkeyCount]; + assert(i == op.id); + switch (op.op) { + case LOGICAL_OP_NOT: + lv[op.id] = !lv[op.ro]; + break; + case LOGICAL_OP_AND: + lv[op.id] = lv[op.lo] & lv[op.ro]; // && + break; + case LOGICAL_OP_OR: + lv[op.id] = lv[op.lo] | lv[op.ro]; // || + break; + default: + assert(0); + break; + } + } + return lv[result]; +} + bool GraphTruth::run(unsigned, const CompiledNG &cng, const CNGInfo &cngi, - const string &buffer, ResultSet &rs, string &) { + const string &buffer, ResultSet &rs, string &error) { + if (cngi.quiet) { + return true; + } + + if (cngi.combination) { + // Compile and run sub-expressions, store match results. + map> offset_to_matches; + map> offset_to_lkeys; + set sub_exps; + const auto &m_lkey = cng.pl->getLkeyMap(); + for (const auto &it_lkey : m_lkey) { + if (sub_exps.find(it_lkey.first) == sub_exps.end()) { + sub_exps.emplace(it_lkey.first); + ResultSet sub_rs(RESULT_FROM_PCRE); + shared_ptr sub_cngi = preprocess(it_lkey.first); + const CompiledNG *sub_cng; + try { + sub_cng = sub_cngi->get(); + } + catch (const NGCompileFailure &err) { + return false; + } + catch (const NGUnsupportedFailure &err) { + return false; + } + sub_cngi->quiet = false; // force not quiet in sub-exp. 
+ if (!run(it_lkey.first, *sub_cng, *sub_cngi, buffer, sub_rs, error)) { + rs.clear(); + return false; + } + for (const auto &it_mr : sub_rs.matches) { + offset_to_matches[it_mr.to].emplace(it_mr); + offset_to_lkeys[it_mr.to].emplace(it_lkey.second); + if (sub_cngi->highlander) { + break; + } + } + } + } + // Calculate rs for combination expression. + vector lv; + const auto &comb = cng.pl->getLogicalTree(); + lv.resize(m_lkey.size() + comb.size()); + const auto &li = cng.pl->getCombInfoById(cngi.id); + for (const auto &it : offset_to_lkeys) { + for (auto report : it.second) { + lv[report] = 1; + } + if (isLogicalCombination(lv, comb, m_lkey.size(), + li.start, li.result)) { + for (const auto &mr : offset_to_matches.at(it.first)) { + if ((mr.to >= cngi.min_offset) && + (mr.to <= cngi.max_offset)) { + rs.addMatch(mr.from, mr.to); + } + } + } + } + return true; + } + set> matches; if (g_streamOffset) { diff --git a/tools/hscollider/GraphTruth.h b/tools/hscollider/GraphTruth.h index 5f53899c8..e9f601db2 100644 --- a/tools/hscollider/GraphTruth.h +++ b/tools/hscollider/GraphTruth.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -106,6 +106,10 @@ class CNGInfo : boost::noncopyable { bool highlander = false; bool prefilter = false; bool som = false; + bool combination = false; + bool quiet = false; + + unsigned id; private: void compile(); // If NFA graph scan failed for some reason, we mark it as bad and skip @@ -116,8 +120,6 @@ class CNGInfo : boost::noncopyable { std::unique_ptr cng; // compiled NFA graph std::mutex cng_mutex; // serialised accesses to NFA graph - unsigned id; - // Our expression map const ExpressionMap &m_expr; }; diff --git a/tools/hscollider/GroundTruth.cpp b/tools/hscollider/GroundTruth.cpp index b0fe384d5..fe038c818 100644 --- 
a/tools/hscollider/GroundTruth.cpp +++ b/tools/hscollider/GroundTruth.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,6 +26,9 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#ifdef _WIN32 +#define PCRE_STATIC +#endif #include "config.h" #include "common.h" @@ -100,7 +103,8 @@ int pcreCallOut(pcre_callout_block *block) { static bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander, - bool *prefilter, bool *som, hs_expr_ext *ext) { + bool *prefilter, bool *som, bool *combination, + bool *quiet, hs_expr_ext *ext) { string regex; unsigned int hs_flags = 0; if (!readExpression(expr, regex, &hs_flags, ext)) { @@ -109,7 +113,8 @@ bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander, expr.swap(regex); - if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som)) { + if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som, + combination, quiet)) { return false; } @@ -185,6 +190,14 @@ string pcreErrStr(int err) { } } +/* that is, a mode provided by native hyperscan */ +static +bool isStandardMode(unsigned int mode) { + return mode == MODE_BLOCK + || mode == MODE_STREAMING + || mode == MODE_VECTORED; +} + GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr, unsigned long int limit, unsigned long int limit_recursion) @@ -192,8 +205,10 @@ GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr, matchLimitRecursion(limit_recursion) {} void GroundTruth::global_prep() { - // We're using pcre callouts - pcre_callout = &pcreCallOut; + if (isStandardMode(colliderMode)) { + // We're using pcre callouts + pcre_callout = &pcreCallOut; + } } static @@ -221,6 +236,8 @@ GroundTruth::compile(unsigned id, bool no_callouts) { bool highlander = false; bool prefilter = false; bool som = false; + bool combination = false; + 
bool quiet = false; // we can still match approximate matching patterns with PCRE if edit // distance 0 is requested @@ -238,7 +255,8 @@ GroundTruth::compile(unsigned id, bool no_callouts) { hs_expr_ext ext; // Decode the flags - if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som, &ext)) { + if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som, + &combination, &quiet, &ext)) { throw PcreCompileFailure("Unable to decode flags."); } @@ -257,11 +275,17 @@ GroundTruth::compile(unsigned id, bool no_callouts) { throw PcreCompileFailure("Unsupported extended flags."); } + // Hybrid mode implies SOM. + if (colliderMode == MODE_HYBRID) { + assert(!use_NFA); + som = true; + } + // SOM flags might be set globally. som |= !!somFlags; // For traditional Hyperscan, add global callout to pattern. - if (!no_callouts) { + if (!combination && !no_callouts && isStandardMode(colliderMode)) { addCallout(re); } @@ -275,12 +299,22 @@ GroundTruth::compile(unsigned id, bool no_callouts) { compiled->highlander = highlander; compiled->prefilter = prefilter; compiled->som = som; + compiled->combination = combination; + compiled->quiet = quiet; compiled->min_offset = ext.min_offset; compiled->max_offset = ext.max_offset; compiled->min_length = ext.min_length; compiled->expression = i->second; // original PCRE flags |= PCRE_NO_AUTO_POSSESS; + if (compiled->combination) { + compiled->pl.parseLogicalCombination(id, re.c_str(), ~0U, 0, ~0ULL); + compiled->pl.logicalKeyRenumber(); + compiled->report = id; + return compiled; + } + + compiled->bytecode = pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr); @@ -388,6 +422,79 @@ int scanBasic(const CompiledPcre &compiled, const string &buffer, return ret; } +static +bool isUtf8(const CompiledPcre &compiled) { + unsigned long int options = 0; + pcre_fullinfo(compiled.bytecode, NULL, PCRE_INFO_OPTIONS, &options); + return options & PCRE_UTF8; +} + +static +CaptureVec makeCaptureVec(const vector &ovector, int ret) { + 
assert(ret > 0); + + CaptureVec cap; + + if (no_groups) { + return cap; // No group info requested. + } + + cap.reserve(ret * 2); + for (int i = 0; i < ret * 2; i += 2) { + int from = ovector[i], to = ovector[i + 1]; + cap.push_back(make_pair(from, to)); + } + return cap; +} + +static +int scanHybrid(const CompiledPcre &compiled, const string &buffer, + const pcre_extra &extra, vector &ovector, + ResultSet &rs, ostream &out) { + int len = (int)buffer.length(); + int startoffset = 0; + bool utf8 = isUtf8(compiled); + + int flags = 0; + int ret; + do { + ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), len, + startoffset, flags, &ovector[0], ovector.size()); + + if (ret <= PCRE_ERROR_NOMATCH) { + return ret; + } + + int from = ovector.at(0); + int to = ovector.at(1); + rs.addMatch(from, to, makeCaptureVec(ovector, ret)); + + if (echo_matches) { + out << "PCRE Match @ (" << from << "," << to << ")" << endl; + } + + // If we only wanted a single match, we're done. + if (compiled.highlander) break; + + // Next scan starts at the first codepoint after the match. It's + // possible that we have a vacuous match, in which case we must step + // past it to ensure that we always progress. + if (from != to) { + startoffset = to; + } else if (utf8) { + startoffset = to + 1; + while (startoffset < len + && ((buffer[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) { + ++startoffset; + } + } else { + startoffset = to + 1; + } + } while (startoffset <= len); + + return ret; +} + static int scanOffset(const CompiledPcre &compiled, const string &buffer, const pcre_extra &extra, vector &ovector, @@ -424,22 +531,117 @@ int scanOffset(const CompiledPcre &compiled, const string &buffer, return ret; } +/** \brief Returns 1 if compliant to all logical combinations. 
*/ +static +char isLogicalCombination(vector &lv, const vector &comb, + size_t lkeyCount, unsigned start, unsigned result) { + assert(start <= result); + for (unsigned i = start; i <= result; i++) { + const LogicalOp &op = comb[i - lkeyCount]; + assert(i == op.id); + switch (op.op) { + case LOGICAL_OP_NOT: + lv[op.id] = !lv[op.ro]; + break; + case LOGICAL_OP_AND: + lv[op.id] = lv[op.lo] & lv[op.ro]; // && + break; + case LOGICAL_OP_OR: + lv[op.id] = lv[op.lo] | lv[op.ro]; // || + break; + default: + assert(0); + break; + } + } + return lv[result]; +} + bool GroundTruth::run(unsigned, const CompiledPcre &compiled, const string &buffer, ResultSet &rs, string &error) { + if (compiled.quiet) { + return true; + } + + if (compiled.combination) { + // Compile and run sub-expressions, store match results. + map> offset_to_matches; + map> offset_to_lkeys; + set sub_exps; + const auto &m_lkey = compiled.pl.getLkeyMap(); + for (const auto &it_lkey : m_lkey) { + if (sub_exps.find(it_lkey.first) == sub_exps.end()) { + sub_exps.emplace(it_lkey.first); + ResultSet sub_rs(RESULT_FROM_PCRE); + shared_ptr sub_pcre; + try { + sub_pcre = compile(it_lkey.first); + } + catch (const SoftPcreCompileFailure &err) { + return false; + } + catch (const PcreCompileFailure &err) { + return false; + } + sub_pcre->quiet = false; // force not quiet in sub-exp. + if (!run(it_lkey.first, *sub_pcre, buffer, sub_rs, error)) { + rs.clear(); + return false; + } + for (const auto &it_mr : sub_rs.matches) { + offset_to_matches[it_mr.to].emplace(it_mr); + offset_to_lkeys[it_mr.to].emplace(it_lkey.second); + if (sub_pcre->highlander) { + break; + } + } + } + } + // Calculate rs for combination expression. 
+ vector lv; + const auto &comb = compiled.pl.getLogicalTree(); + lv.resize(m_lkey.size() + comb.size()); + const auto &li = compiled.pl.getCombInfoById(compiled.report); + for (const auto &it : offset_to_lkeys) { + for (auto report : it.second) { + lv[report] = 1; + } + if (isLogicalCombination(lv, comb, m_lkey.size(), + li.start, li.result)) { + for (const auto &mr : offset_to_matches.at(it.first)) { + if ((mr.to >= compiled.min_offset) && + (mr.to <= compiled.max_offset)) { + rs.addMatch(mr.from, mr.to); + } + } + } + } + return true; + } + CalloutContext ctx(out); pcre_extra extra; extra.flags = 0; - // Switch on callouts. - extra.flags |= PCRE_EXTRA_CALLOUT_DATA; - extra.callout_data = &ctx; + // If running in traditional HyperScan mode, switch on callouts. + bool usingCallouts = isStandardMode(colliderMode); + if (usingCallouts) { + // Switch on callouts. + extra.flags |= PCRE_EXTRA_CALLOUT_DATA; + extra.callout_data = &ctx; + } // Set the match_limit (in order to bound execution time on very complex // patterns) extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION); - extra.match_limit = matchLimit; - extra.match_limit_recursion = matchLimitRecursion; + if (colliderMode == MODE_HYBRID) { + extra.match_limit = 10000000; + extra.match_limit_recursion = 1500; + } else { + extra.match_limit = matchLimit; + extra.match_limit_recursion = matchLimitRecursion; + } #ifdef PCRE_NO_START_OPTIMIZE // Switch off optimizations that may result in callouts not occurring. 
@@ -452,6 +654,7 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled, ovector.resize(ovecsize); int ret; + bool hybrid = false; switch (colliderMode) { case MODE_BLOCK: case MODE_STREAMING: @@ -462,6 +665,10 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled, ret = scanBasic(compiled, buffer, extra, ovector, ctx); } break; + case MODE_HYBRID: + ret = scanHybrid(compiled, buffer, extra, ovector, rs, out); + hybrid = true; + break; default: assert(0); ret = PCRE_ERROR_NULL; @@ -494,7 +701,7 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled, return true; } - if (compiled.som) { + if (compiled.som && !hybrid) { filterLeftmostSom(rs); } diff --git a/tools/hscollider/GroundTruth.h b/tools/hscollider/GroundTruth.h index bcab55992..1607ef1df 100644 --- a/tools/hscollider/GroundTruth.h +++ b/tools/hscollider/GroundTruth.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,6 +31,7 @@ #include "expressions.h" #include "ResultSet.h" +#include "parser/logical_combination.h" #include #include @@ -85,6 +86,14 @@ class CompiledPcre : boost::noncopyable { bool highlander = false; bool prefilter = false; bool som = false; + bool combination = false; + bool quiet = false; + + // Parsed logical combinations. + ue2::ParsedLogical pl; + + // Combination expression report id. 
+ unsigned report; private: // If a PCRE has hit its match recursion limit when scanning a corpus, we diff --git a/tools/hscollider/NfaGeneratedCorpora.cpp b/tools/hscollider/NfaGeneratedCorpora.cpp index 32933be42..b7c77ee15 100644 --- a/tools/hscollider/NfaGeneratedCorpora.cpp +++ b/tools/hscollider/NfaGeneratedCorpora.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -80,6 +80,39 @@ void NfaGeneratedCorpora::generate(unsigned id, vector &data) { throw CorpusFailure("Expression could not be read: " + i->second); } + // Combination's corpus is consist of sub-expressions' corpuses. + if (hs_flags & HS_FLAG_COMBINATION) { + ParsedLogical pl; + pl.parseLogicalCombination(id, re.c_str(), ~0U, 0, ~0ULL); + pl.logicalKeyRenumber(); + const auto &m_lkey = pl.getLkeyMap(); + assert(!m_lkey.empty()); + u32 a_subid; // arbitrary sub id + unordered_map> m_data; + for (const auto &it : m_lkey) { + a_subid = it.first; + vector sub_data; + generate(a_subid, sub_data); + m_data.emplace(a_subid, move(sub_data)); + } + assert(!m_data.empty()); + size_t num_corpus = m_data[a_subid].size(); + data.reserve(data.size() + num_corpus); + while (num_corpus) { + string cc; // 1 combination corpus + for (const auto &it : m_lkey) { + assert(!m_data[it.first].empty()); + cc += m_data[it.first].back().data; + if (m_data[it.first].size() > 1) { + m_data[it.first].pop_back(); + } + } + data.push_back(Corpus(cc)); + num_corpus--; + } + return; + } + if (force_utf8_mode) { hs_flags |= HS_FLAG_UTF8; } diff --git a/tools/hscollider/ResultSet.h b/tools/hscollider/ResultSet.h index 23c628ecb..067055cae 100644 --- a/tools/hscollider/ResultSet.h +++ b/tools/hscollider/ResultSet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel 
Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,25 +35,36 @@ #include #include +// Type for capturing groups: a vector of (from, to) offsets, with both set to +// -1 for inactive groups (like pcre's ovector). Used by hybrid modes. +typedef std::vector > CaptureVec; + // Class representing a single match, encapsulating to/from offsets. class MatchResult { public: MatchResult(unsigned long long start, unsigned long long end) : from(start), to(end) {} + MatchResult(unsigned long long start, unsigned long long end, + const CaptureVec &cap) + : from(start), to(end), captured(cap) {} bool operator<(const MatchResult &a) const { if (from != a.from) { return from < a.from; } - return to < a.to; + if (to != a.to) { + return to < a.to; + } + return captured < a.captured; } bool operator==(const MatchResult &a) const { - return from == a.from && to == a.to; + return from == a.from && to == a.to && captured == a.captured; } unsigned long long from; unsigned long long to; + CaptureVec captured; }; enum ResultSource { @@ -114,6 +125,26 @@ class ResultSet { } } + // Add a match (with capturing vector) + void addMatch(unsigned long long from, unsigned long long to, + const CaptureVec &cap, int block = 0) { + MatchResult m(from, to, cap); + matches.insert(m); + + if (matches_by_block[block].find(m) != matches_by_block[block].end()) { + dupe_matches.insert(m); + } else { + matches_by_block[block].insert(m); + } + } + + // Clear all matches. + void clear() { + matches.clear(); + dupe_matches.clear(); + matches_by_block.clear(); + } + // Unexpected out of order match seen. 
bool uoom = false; diff --git a/tools/hscollider/Thread.cpp b/tools/hscollider/Thread.cpp index 537fa0dd0..5fff82398 100644 --- a/tools/hscollider/Thread.cpp +++ b/tools/hscollider/Thread.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,8 +35,7 @@ #include #include -#include - +#ifndef _WIN32 static const size_t COLLIDER_THREAD_STACK_SIZE = 8192 * 1024; void Thread::start() { @@ -79,6 +78,16 @@ void Thread::start() { } } +void Thread::join() { pthread_join(thread, nullptr); } + +#else // windows + +void Thread::start() { thread = std::thread(&runThread, this); } + +void Thread::join() { thread.join(); } + +#endif + // Dispatch void *Thread::runThread(void *thr) { if (!no_signal_handler) { @@ -88,7 +97,6 @@ void *Thread::runThread(void *thr) { return nullptr; } -void Thread::join() { pthread_join(thread, nullptr); } Thread::Thread(size_t num) : thread_id(num) {} diff --git a/tools/hscollider/Thread.h b/tools/hscollider/Thread.h index 2ca50e38b..c6675dad9 100644 --- a/tools/hscollider/Thread.h +++ b/tools/hscollider/Thread.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,7 +31,11 @@ #include +#ifndef _WIN32 #include +#else +#include +#endif #include @@ -54,7 +58,11 @@ class Thread : boost::noncopyable { const size_t thread_id; private: +#ifndef _WIN32 pthread_t thread; +#else + std::thread thread; +#endif }; #endif // UE2COLLIDER_THREAD_H diff --git a/tools/hscollider/UltimateTruth.cpp b/tools/hscollider/UltimateTruth.cpp index 19c597be5..c37e39ba3 100644 --- a/tools/hscollider/UltimateTruth.cpp +++ b/tools/hscollider/UltimateTruth.cpp @@ 
-1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -90,19 +90,14 @@ hs_error_t open_magic_stream(const hs_database_t *db, unsigned flags, #endif // RELEASE_BUILD -class HyperscanDB : boost::noncopyable { +class BaseDB : boost::noncopyable { public: // Constructor takes iterators over a container of pattern IDs. template - HyperscanDB(hs_database_t *db_in, Iter ids_begin, Iter ids_end) - : db(db_in), ids(ids_begin, ids_end) {} - - ~HyperscanDB() { - hs_free_database(db); - } + BaseDB(Iter ids_begin, Iter ids_end) + : ids(ids_begin, ids_end) {} - // Underlying Hyperscan database pointer. - hs_database_t *db; + virtual ~BaseDB(); // The set of expression IDs that must return their matches in order. unordered_set ordered; @@ -111,15 +106,55 @@ class HyperscanDB : boost::noncopyable { unordered_set ids; }; +BaseDB::~BaseDB() { } + +class HyperscanDB : public BaseDB { +public: + // Constructor takes iterators over a container of pattern IDs. + template + HyperscanDB(hs_database_t *db_in, Iter ids_begin, Iter ids_end) + : BaseDB(ids_begin, ids_end), db(db_in) {} + + ~HyperscanDB(); + + // Underlying Hyperscan database pointer. + hs_database_t *db; +}; + +HyperscanDB::~HyperscanDB() { + hs_free_database(db); +} + +#ifdef HS_HYBRID + +class HybridDB : public BaseDB { +public: + // Constructor takes iterators over a container of pattern IDs. + template + HybridDB(ch_database_t *db_in, Iter ids_begin, Iter ids_end) + : BaseDB(ids_begin, ids_end), db(db_in) {} + + ~HybridDB(); + + // Underlying Hyperscan database pointer. + ch_database_t *db; +}; + +HybridDB::~HybridDB() { + ch_free_database(db); +} + +#endif // HS_HYBRID + // Used to track the ID and result set. 
namespace { struct MultiContext { - MultiContext(unsigned int id_in, const HyperscanDB &db_in, ResultSet *rs_in, + MultiContext(unsigned int id_in, const BaseDB &db_in, ResultSet *rs_in, bool single_in, ostream &os) : id(id_in), db(db_in), rs(rs_in), single(single_in), out(os) {} unsigned int id; int block = 0; - const HyperscanDB &db; + const BaseDB &db; ResultSet *rs; u64a lastRawMatch = 0; /* store last known unadjusted match location */ u64a lastOrderMatch = 0; @@ -134,8 +169,9 @@ struct MultiContext { // Callback used for all (both single and multi-mode) scans. static -int callbackMulti(unsigned int id, unsigned long long from, - unsigned long long to, UNUSED unsigned int flags, void *ctx) { +int HS_CDECL callbackMulti(unsigned int id, unsigned long long from, + unsigned long long to, + UNUSED unsigned int flags, void *ctx) { MultiContext *mctx = static_cast(ctx); assert(mctx); assert(mctx->rs); @@ -230,6 +266,76 @@ int callbackMulti(unsigned int id, unsigned long long from, return 0; } +#ifdef HS_HYBRID + +// Hybrid matcher callback. 
+static +ch_callback_t HS_CDECL callbackHybrid(unsigned id, unsigned long long from, + unsigned long long to, unsigned, unsigned size, + const ch_capture_t *captured, void *ctx) { + MultiContext *mctx = static_cast(ctx); + assert(mctx); + assert(mctx->rs); + assert(mctx->in_scan_call); + + ostream &out = mctx->out; + + to -= g_corpora_prefix.size(); + + if (mctx->terminated) { + out << "UE2 Match @ (" << from << "," << to << ") for " << id + << " after termination" << endl; + mctx->rs->match_after_halt = true; + } + + if (mctx->single || id == mctx->id) { + CaptureVec cap; + for (unsigned int i = 0; i < size; i++) { + if (!(captured[i].flags & CH_CAPTURE_FLAG_ACTIVE)) { + cap.push_back(make_pair(-1, -1)); + } else { + cap.push_back(make_pair(captured[i].from, captured[i].to)); + } + } + mctx->rs->addMatch(from, to, cap); + } + + if (echo_matches) { + out << "Match @ [" << from << "," << to << "] for " << id << endl; + out << " Captured " << size << " groups: "; + for (unsigned int i = 0; i < size; i++) { + if (!(captured[i].flags & CH_CAPTURE_FLAG_ACTIVE)) { + out << "{} "; + } else { + out << "{" << captured[i].from << "," << captured[i].to << "} "; + } + } + out << endl; + } + + if (limit_matches && mctx->rs->matches.size() == limit_matches) { + mctx->terminated = true; + return CH_CALLBACK_TERMINATE; + } + + return CH_CALLBACK_CONTINUE; +} + +// Hybrid matcher error callback. 
+static +ch_callback_t HS_CDECL errorCallback(UNUSED ch_error_event_t errorType, + UNUSED unsigned int id, void *, + void *ctx) { + UNUSED MultiContext *mctx = static_cast(ctx); + assert(mctx); + assert(mctx->rs); + assert(mctx->in_scan_call); + + return CH_CALLBACK_SKIP_PATTERN; +} + +#endif // HS_HYBRID + static void filterLeftmostSom(ResultSet &rs) { if (rs.matches.size() <= 1) { @@ -252,6 +358,9 @@ UltimateTruth::UltimateTruth(ostream &os, const ExpressionMap &expr, const Grey &grey_in, unsigned int streamBlocks) : grey(grey_in), out(os), m_expr(expr), m_xcompile(false), m_streamBlocks(streamBlocks), scratch(nullptr), +#ifdef HS_HYBRID + chimeraScratch(nullptr), +#endif platform(plat) { // Build our mode flags. @@ -265,15 +374,27 @@ UltimateTruth::UltimateTruth(ostream &os, const ExpressionMap &expr, case MODE_VECTORED: m_mode = HS_MODE_VECTORED; break; + case MODE_HYBRID: + m_mode = 0; + break; } // Set desired SOM precision, if we're in streaming mode. if (colliderMode == MODE_STREAMING) { m_mode |= somPrecisionMode; } + +#ifdef HS_HYBRID + if (colliderMode == MODE_HYBRID && !no_groups) { + m_mode |= CH_MODE_GROUPS; + } +#endif } UltimateTruth::~UltimateTruth() { +#ifdef HS_HYBRID + ch_free_scratch(chimeraScratch); +#endif hs_free_scratch(scratch); } @@ -327,13 +448,13 @@ void mangle_scratch(hs_scratch_t *scratch) { scratch->fdr_conf_offset = 0xe4; } -bool UltimateTruth::blockScan(const HyperscanDB &hdb, const string &buffer, +bool UltimateTruth::blockScan(const BaseDB &bdb, const string &buffer, size_t align, match_event_handler callback, void *ctx_in, ResultSet *) { assert(colliderMode == MODE_BLOCK); assert(!m_xcompile); - const hs_database_t *db = hdb.db; + const hs_database_t *db = reinterpret_cast(bdb).db; assert(db); MultiContext *ctx = (MultiContext *)ctx_in; @@ -438,13 +559,13 @@ hs_stream_t *compressAndResetExpandStream(const hs_database_t *db, return out; } -bool UltimateTruth::streamingScan(const HyperscanDB &hdb, const string &buffer, +bool 
UltimateTruth::streamingScan(const BaseDB &bdb, const string &buffer, size_t align, match_event_handler callback, void *ctx_in, ResultSet *rs) { assert(colliderMode == MODE_STREAMING); assert(!m_xcompile); - const hs_database_t *db = hdb.db; + const hs_database_t *db = reinterpret_cast(bdb).db; assert(db); MultiContext *ctx = (MultiContext *)ctx_in; @@ -594,13 +715,13 @@ bool UltimateTruth::streamingScan(const HyperscanDB &hdb, const string &buffer, return ret == HS_SUCCESS; } -bool UltimateTruth::vectoredScan(const HyperscanDB &hdb, const string &buffer, +bool UltimateTruth::vectoredScan(const BaseDB &bdb, const string &buffer, size_t align, match_event_handler callback, void *ctx_in, ResultSet *rs) { assert(colliderMode == MODE_VECTORED); assert(!m_xcompile); - const hs_database_t *db = hdb.db; + const hs_database_t *db = reinterpret_cast(bdb).db; assert(db); MultiContext *ctx = (MultiContext *)ctx_in; @@ -682,19 +803,67 @@ bool UltimateTruth::vectoredScan(const HyperscanDB &hdb, const string &buffer, return true; } -bool UltimateTruth::run(unsigned int id, shared_ptr hdb, +#ifdef HS_HYBRID +bool UltimateTruth::hybridScan(const BaseDB &bdb, const string &buffer, + size_t align, ch_match_event_handler callback, + ch_error_event_handler error_callback, + void *ctx_in, ResultSet *) { + assert(colliderMode == MODE_HYBRID); + assert(!m_xcompile); + + const ch_database_t *db = reinterpret_cast(bdb).db; + assert(db); + MultiContext *ctx = (MultiContext *)ctx_in; + + char *realigned = setupScanBuffer(buffer.c_str(), buffer.size(), align); + if (!realigned) { + return false; + } + + if (use_copy_scratch && !cloneScratch()) { + return false; + } + + ctx->in_scan_call = true; + ch_error_t ret = + ch_scan(db, realigned, buffer.size(), 0, chimeraScratch, callback, + error_callback, ctx); + ctx->in_scan_call = false; + + if (g_verbose) { + out << "Scan call returned " << ret << endl; + } + + if (ctx->terminated) { + if (g_verbose && ret != CH_SCAN_TERMINATED) { + out << "Scan 
should have returned CH_SCAN_TERMINATED, returned " + << ret << " instead." << endl; + } + return ret == CH_SCAN_TERMINATED; + } + + if (g_verbose && ret != CH_SUCCESS) { + out << "Scan should have returned CH_SUCCESS, returned " << ret + << " instead." << endl; + } + + return ret == CH_SUCCESS; +} +#endif + +bool UltimateTruth::run(unsigned int id, shared_ptr bdb, const string &buffer, bool single_pattern, unsigned int align, ResultSet &rs) { assert(!m_xcompile); - assert(hdb); + assert(bdb); // Ensure that scratch is appropriate for this database. - if (!allocScratch(hdb)) { + if (!allocScratch(bdb)) { out << "Scratch alloc failed." << endl; return false; } - MultiContext ctx(id, *hdb, &rs, single_pattern, out); + MultiContext ctx(id, *bdb, &rs, single_pattern, out); if (!g_corpora_suffix.empty()) { ctx.use_max_offset = true; ctx.max_offset = buffer.size() - g_corpora_suffix.size(); @@ -702,11 +871,20 @@ bool UltimateTruth::run(unsigned int id, shared_ptr hdb, switch (colliderMode) { case MODE_BLOCK: - return blockScan(*hdb, buffer, align, callbackMulti, &ctx, &rs); + return blockScan(*bdb, buffer, align, callbackMulti, &ctx, &rs); case MODE_STREAMING: - return streamingScan(*hdb, buffer, align, callbackMulti, &ctx, &rs); + return streamingScan(*bdb, buffer, align, callbackMulti, &ctx, &rs); case MODE_VECTORED: - return vectoredScan(*hdb, buffer, align, callbackMulti, &ctx, &rs); + return vectoredScan(*bdb, buffer, align, callbackMulti, &ctx, &rs); + case MODE_HYBRID: +#ifdef HS_HYBRID + return hybridScan(*bdb, buffer, align, callbackHybrid, errorCallback, + &ctx, &rs); +#else + cerr << "Hybrid mode not available in this build." 
<< endl; + abort(); +#endif + break; } assert(0); @@ -739,7 +917,7 @@ bool isOrdered(const string &expr, unsigned int flags) { return ordered; } -static unique_ptr +static unique_ptr compileHyperscan(vector &patterns, vector &flags, vector &idsvec, ptr_vector &ext, unsigned mode, const hs_platform_info *platform, string &error, @@ -762,7 +940,30 @@ compileHyperscan(vector &patterns, vector &flags, return ue2::make_unique(db, idsvec.begin(), idsvec.end()); } -shared_ptr UltimateTruth::compile(const set &ids, +#ifdef HS_HYBRID +static unique_ptr +compileHybrid(vector &patterns, + vector &flags, vector &idsvec, + unsigned mode, const hs_platform_info *platform, string &error) { + const unsigned count = patterns.size(); + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err; + + ch_error_t err = ch_compile_multi(&patterns[0], &flags[0], + &idsvec[0], count, mode, platform, &db, + &compile_err); + + if (err != HS_SUCCESS) { + error = compile_err->message; + ch_free_compile_error(compile_err); + return nullptr; + } + + return ue2::make_unique(db, idsvec.begin(), idsvec.end()); +} +#endif + +shared_ptr UltimateTruth::compile(const set &ids, string &error) const { // Build our vectors for compilation const size_t count = ids.size(); @@ -811,6 +1012,17 @@ shared_ptr UltimateTruth::compile(const set &ids, ext[n].edit_distance = edit_distance; } + if (colliderMode == MODE_HYBRID) { + if (ext[n].flags) { + error = "Hybrid does not support extended parameters."; + return nullptr; + } + // We can also strip some other flags in the hybrid matcher. 
+ flags[n] &= ~HS_FLAG_PREFILTER; // prefilter always used + flags[n] &= ~HS_FLAG_ALLOWEMPTY; // empty always allowed + flags[n] &= ~HS_FLAG_SOM_LEFTMOST; // SOM always on + } + n++; } @@ -827,8 +1039,18 @@ shared_ptr UltimateTruth::compile(const set &ids, idsvec.push_back(0); } - auto db = compileHyperscan(patterns, flags, idsvec, ext, m_mode, platform, - error, grey); + unique_ptr db; + if (colliderMode == MODE_HYBRID) { +#ifdef HS_HYBRID + db = compileHybrid(patterns, flags, idsvec, m_mode, platform, error); +#else + error = "Hybrid mode not available in this build."; +#endif + } else { + db = compileHyperscan(patterns, flags, idsvec, ext, m_mode, + platform, error, grey); + } + if (!db) { return nullptr; } @@ -850,18 +1072,29 @@ shared_ptr UltimateTruth::compile(const set &ids, return move(db); } -bool UltimateTruth::allocScratch(shared_ptr db) { +bool UltimateTruth::allocScratch(shared_ptr db) { assert(db); - // We explicitly avoid running scratch allocators for the same HyperscanDB + // We explicitly avoid running scratch allocators for the same BaseDB // over and over again by retaining a shared_ptr to the last one we saw. 
if (db == last_db) { return true; } - hs_error_t err = hs_alloc_scratch(db.get()->db, &scratch); - if (err != HS_SUCCESS) { - return false; + if (colliderMode == MODE_HYBRID) { +#ifdef HS_HYBRID + ch_error_t err = ch_alloc_scratch( + reinterpret_cast(db.get())->db, &chimeraScratch); + if (err != HS_SUCCESS) { + return false; + } +#endif // HS_HYBRID + } else { + hs_error_t err = hs_alloc_scratch( + reinterpret_cast(db.get())->db, &scratch); + if (err != HS_SUCCESS) { + return false; + } } last_db = db; @@ -869,20 +1102,40 @@ bool UltimateTruth::allocScratch(shared_ptr db) { } bool UltimateTruth::cloneScratch(void) { - hs_scratch_t *old_scratch = scratch; - hs_scratch_t *new_scratch; - hs_error_t ret = hs_clone_scratch(scratch, &new_scratch); - if (ret != HS_SUCCESS) { - DEBUG_PRINTF("failure to clone %d\n", ret); - return false; - } - scratch = new_scratch; - ret = hs_free_scratch(old_scratch); - if (ret != HS_SUCCESS) { - DEBUG_PRINTF("failure to free %d\n", ret); - return false; + if (colliderMode == MODE_HYBRID) { +#ifdef HS_HYBRID + ch_scratch_t *old_scratch = chimeraScratch; + ch_scratch_t *new_scratch; + ch_error_t ret = ch_clone_scratch(chimeraScratch, &new_scratch); + if (ret != CH_SUCCESS) { + DEBUG_PRINTF("failure to clone %d\n", ret); + return false; + } + chimeraScratch = new_scratch; + ret = ch_free_scratch(old_scratch); + if (ret != CH_SUCCESS) { + DEBUG_PRINTF("failure to free %d\n", ret); + return false; + } + DEBUG_PRINTF("hybrid scratch cloned from %p to %p\n", + old_scratch, chimeraScratch); +#endif // HS_HYBRID + } else { + hs_scratch_t *old_scratch = scratch; + hs_scratch_t *new_scratch; + hs_error_t ret = hs_clone_scratch(scratch, &new_scratch); + if (ret != HS_SUCCESS) { + DEBUG_PRINTF("failure to clone %d\n", ret); + return false; + } + scratch = new_scratch; + ret = hs_free_scratch(old_scratch); + if (ret != HS_SUCCESS) { + DEBUG_PRINTF("failure to free %d\n", ret); + return false; + } + DEBUG_PRINTF("scratch cloned from %p to %p\n", 
old_scratch, scratch); } - DEBUG_PRINTF("scratch cloned from %p to %p\n", old_scratch, scratch); return true; } @@ -947,20 +1200,35 @@ char *UltimateTruth::setupVecScanBuffer(const char *begin, size_t len, return ptr; } -bool UltimateTruth::saveDatabase(const HyperscanDB &hdb, +bool UltimateTruth::saveDatabase(const BaseDB &bdb, const string &filename) const { - return ::saveDatabase(hdb.db, filename.c_str(), g_verbose); + if (colliderMode == MODE_HYBRID) { + cerr << "Hybrid mode doesn't support serialization." << endl; + abort(); + } else { + return ::saveDatabase(reinterpret_cast(&bdb)->db, + filename.c_str(), g_verbose); + } + return false; } -shared_ptr +shared_ptr UltimateTruth::loadDatabase(const string &filename, const std::set &ids) const { - hs_database_t *hs_db = ::loadDatabase(filename.c_str(), g_verbose); - if (!hs_db) { - return nullptr; + shared_ptr db; + + if (colliderMode == MODE_HYBRID) { + cerr << "Hybrid mode doesn't support deserialization." << endl; + abort(); + } else { + hs_database_t *hs_db = ::loadDatabase(filename.c_str(), g_verbose); + if (!hs_db) { + return nullptr; + } + + db = make_shared(hs_db, ids.begin(), ids.end()); } - auto db = make_shared(hs_db, ids.begin(), ids.end()); assert(db); // Fill db::ordered with the expressions that require the ordered flag. 
diff --git a/tools/hscollider/UltimateTruth.h b/tools/hscollider/UltimateTruth.h index c8de86427..645cf2977 100644 --- a/tools/hscollider/UltimateTruth.h +++ b/tools/hscollider/UltimateTruth.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,10 @@ #include "hs.h" +#ifdef HS_HYBRID +#include "chimera/ch.h" +#endif + #include #include #include @@ -47,7 +51,7 @@ struct Grey; } // namespace ue2 -class HyperscanDB; +class BaseDB; class ResultSet; // Wrapper around ue2 to generate results for an expression and corpus. @@ -59,13 +63,13 @@ class UltimateTruth : boost::noncopyable { ~UltimateTruth(); - std::shared_ptr compile(const std::set &ids, + std::shared_ptr compile(const std::set &ids, std::string &error) const; - bool saveDatabase(const HyperscanDB &db, + bool saveDatabase(const BaseDB &db, const std::string &filename) const; - std::shared_ptr + std::shared_ptr loadDatabase(const std::string &filename, const std::set &ids) const; @@ -74,7 +78,7 @@ class UltimateTruth : boost::noncopyable { return !m_xcompile; } - bool run(unsigned id, std::shared_ptr db, + bool run(unsigned id, std::shared_ptr db, const std::string &buffer, bool single_pattern, unsigned align, ResultSet &rs); @@ -84,22 +88,28 @@ class UltimateTruth : boost::noncopyable { std::string dbFilename(const std::set &ids) const; private: - bool blockScan(const HyperscanDB &db, const std::string &buffer, + bool blockScan(const BaseDB &db, const std::string &buffer, size_t align, match_event_handler callback, void *ctx, ResultSet *rs); - bool streamingScan(const HyperscanDB &db, const std::string &buffer, + bool streamingScan(const BaseDB &db, const std::string &buffer, size_t align, match_event_handler callback, void *ctx, ResultSet *rs); - bool vectoredScan(const HyperscanDB &db, const 
std::string &buffer, + bool vectoredScan(const BaseDB &db, const std::string &buffer, size_t align, match_event_handler callback, void *ctx, ResultSet *rs); +#ifdef HS_HYBRID + bool hybridScan(const BaseDB &db, const std::string &buffer, + size_t align, ch_match_event_handler callback, + ch_error_event_handler error_callback, + void *ctx, ResultSet *rs); +#endif // HS_HYBRID char *setupScanBuffer(const char *buf, size_t len, size_t align); char *setupVecScanBuffer(const char *buf, size_t len, size_t align, unsigned int block_id); - bool allocScratch(std::shared_ptr db); + bool allocScratch(std::shared_ptr db); bool cloneScratch(void); @@ -126,6 +136,11 @@ class UltimateTruth : boost::noncopyable { // Scratch space for Hyperscan. hs_scratch_t *scratch; +#ifdef HS_HYBRID + // Scratch space for Chimera. + ch_scratch_t *chimeraScratch; +#endif // HS_HYBRID + // Temporary scan buffer used for realigned scanning std::vector m_scanBuf; @@ -134,7 +149,7 @@ class UltimateTruth : boost::noncopyable { // Last database we successfully allocated scratch for, so that we can // avoid unnecessarily reallocating for it. 
- std::shared_ptr last_db; + std::shared_ptr last_db; const hs_platform_info *platform; }; diff --git a/tools/hscollider/args.cpp b/tools/hscollider/args.cpp index a15977f9a..3b515027f 100644 --- a/tools/hscollider/args.cpp +++ b/tools/hscollider/args.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,7 +46,11 @@ #include #include #include +#ifndef _WIN32 #include +#else +#include "win_getopt.h" +#endif #define xstr(s) str(s) #define str(s) #s @@ -76,6 +80,7 @@ void usage(const char *name, const char *error) { "blocks.\n"); printf(" -V NUM Use vectored mode, split data into ~NUM " "blocks.\n"); + printf(" -H Use hybrid mode.\n"); printf(" -Z {R or 0-%d} Only test one alignment, either as given or " "'R' for random.\n", MAX_MAX_UE2_ALIGN - 1); printf(" -q Quiet; display only match differences, no other " @@ -90,6 +95,7 @@ void usage(const char *name, const char *error) { printf(" -E DISTANCE Match all patterns within edit distance" " DISTANCE.\n"); printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n"); + printf(" --no-groups Disable capturing in Hybrid mode.\n"); printf("\n"); printf("Testing mode options:\n"); printf("\n"); @@ -157,7 +163,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, vector *corpora, UNUSED Grey *grey, unique_ptr *plat_out) { static const char options[] - = "-ab:cC:d:D:e:E:G:hi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8"; + = "-ab:cC:d:D:e:E:G:hHi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8"; s32 in_multi = 0; s32 in_corpora = 0; int pcreFlag = 1; @@ -180,6 +186,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, {"no-signal-handler", 0, &no_signal_handler, 1}, {"compress-expand", 0, &compressFlag, 1}, {"compress-reset-expand", 0, 
&compressResetFlag, 1}, + {"no-groups", 0, &no_groups, 1}, {nullptr, 0, nullptr, 0}}; for (;;) { @@ -271,6 +278,15 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, case 'h': usage(argv[0], nullptr); exit(0); + case 'H': + if (colliderMode != MODE_BLOCK) { + usage(argv[0], "You can only use one mode at a time!"); + exit(1); + } + colliderMode = MODE_HYBRID; + // Disable graph truth in hybrid mode + nfaFlag = 0; + break; case 'i': loadDatabases = true; serializePath = optarg; @@ -455,7 +471,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, exit(1); } break; - case 'Z': + case 'Z': { // Parentheses save VS C2360 static constexpr unsigned ALIGN_LIMIT = MAX_MAX_UE2_ALIGN - 1; if (optarg == string("R")) { // Random min alignment selected. @@ -469,6 +485,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, } max_ue2_align = min_ue2_align + 1; break; + } case '8': force_utf8 = true; break; @@ -542,6 +559,11 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, exit(1); } + if (colliderMode == MODE_HYBRID && !ue2Flag) { + usage(argv[0], "You cannot disable UE2 engine in Hybrid mode."); + exit(1); + } + // need at least two pattern engines active if (nfaFlag + pcreFlag + ue2Flag < 2) { usage(argv[0], "At least two pattern engines should be active."); diff --git a/tools/hscollider/common.h b/tools/hscollider/common.h index da85790ca..d9a0144cc 100644 --- a/tools/hscollider/common.h +++ b/tools/hscollider/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,7 +36,8 @@ enum ColliderMode { MODE_BLOCK, MODE_STREAMING, - MODE_VECTORED + MODE_VECTORED, + MODE_HYBRID }; extern unsigned numThreads; @@ -68,6 +69,7 @@ extern unsigned max_ue2_align; 
extern size_t g_memoryLimit; extern bool force_utf8; extern int force_prefilter; +extern int no_groups; extern unsigned somPrecisionMode; extern unsigned limit_matches; extern unsigned randomSeed; diff --git a/tools/hscollider/main.cpp b/tools/hscollider/main.cpp index e1e543cc1..ec7cd6be5 100644 --- a/tools/hscollider/main.cpp +++ b/tools/hscollider/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -448,6 +448,9 @@ void printMode(void) { case MODE_VECTORED: cout << "Vectored-" << g_streamBlocks; break; + case MODE_HYBRID: + cout << "Hybrid"; + break; } if (use_copy_scratch) { @@ -690,7 +693,7 @@ shared_ptr constructDatabase(const set &ids, if (loadDatabases) { string filename = ultimate.dbFilename(ids); - shared_ptr db = ultimate.loadDatabase(filename, ids); + shared_ptr db = ultimate.loadDatabase(filename, ids); if (!db) { if (!g_quiet) { cout << "FAILED: could not load database " << filename << endl; @@ -706,7 +709,7 @@ shared_ptr constructDatabase(const set &ids, // If we're not runnable (i.e. we're cross-compiling), let's at least // try to build the database. if (!ultimate.runnable()) { - shared_ptr db = ue2->get(ultimate); + shared_ptr db = ue2->get(ultimate); assert(db); // throws otherwise } @@ -872,7 +875,7 @@ void runTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph, assert(use_UE2); Corpus &corpus = unit.corpus; - shared_ptr db; + shared_ptr db; if (use_UE2) { // Acquire UE2 database. debug_stage = STAGE_UE2_COMPILE; @@ -1060,7 +1063,7 @@ void addCorporaToQueue(ostream &out, BoundedQueue &testq, unsigned id, // is undefined. 
if (utf8) { auto is_invalid_utf8 = [](const Corpus &corpus) { - return !isValidUtf8(corpus.data.c_str()); + return !isValidUtf8(corpus.data.c_str(), corpus.data.size()); }; c.erase(remove_if(begin(c), end(c), is_invalid_utf8), end(c)); } @@ -1648,6 +1651,7 @@ void printSettingsV(const vector &corporaFiles, case MODE_BLOCK: cout << "block mode"; break; case MODE_STREAMING: cout << "streaming mode"; break; case MODE_VECTORED: cout << "vectored mode"; break; + case MODE_HYBRID: cout << "hybrid mode"; break; } cout << endl; @@ -1746,6 +1750,7 @@ void printSettingsQ(const vector &corporaFiles, case MODE_BLOCK: cout << "block mode"; break; case MODE_STREAMING: cout << "streaming mode"; break; case MODE_VECTORED: cout << "vectored mode"; break; + case MODE_HYBRID: cout << "hybrid mode"; break; } cout << endl; @@ -1835,13 +1840,17 @@ unique_ptr buildCorpora(const vector &corporaFiles, static bool needsQuotes(const char *s) { size_t len = strlen(s); - // don't confuse the correct isblank for the one in locale - int (*blank)(int) = &std::isblank; if (len == 0) { return true; } +#ifndef _WIN32 + // don't confuse the correct isblank for the one in locale + int (*blank)(int) = &std::isblank; if (find_if(s, s + len, blank) != s + len) { +#else + if (find_if(s, s + len, [](unsigned char c){ return std::isblank(c); }) != s + len) { +#endif return true; } @@ -1905,7 +1914,7 @@ bool runTests(CorporaSource &corpora_source, const ExpressionMap &exprMap, return !summary.hasFailure(); } -int main(int argc, char *argv[]) { +int HS_CDECL main(int argc, char *argv[]) { Grey grey; vector corporaFiles; diff --git a/tools/hscollider/pcre_util.cpp b/tools/hscollider/pcre_util.cpp index 0e1aa0ec6..da8dbd11c 100644 --- a/tools/hscollider/pcre_util.cpp +++ b/tools/hscollider/pcre_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are 
permitted provided that the following conditions are met: @@ -34,7 +34,8 @@ #include /* for pcre flags */ bool getPcreFlags(unsigned int hs_flags, unsigned int *flags, - bool *highlander, bool *prefilter, bool *som) { + bool *highlander, bool *prefilter, bool *som, + bool *combination, bool *quiet) { assert(flags); assert(highlander); assert(prefilter); @@ -76,6 +77,14 @@ bool getPcreFlags(unsigned int hs_flags, unsigned int *flags, *som = true; hs_flags &= ~HS_FLAG_SOM_LEFTMOST; } + if (hs_flags & HS_FLAG_COMBINATION) { + *combination = true; + hs_flags &= ~HS_FLAG_COMBINATION; + } + if (hs_flags & HS_FLAG_QUIET) { + *quiet = true; + hs_flags &= ~HS_FLAG_QUIET; + } // Flags that are irrelevant to PCRE. hs_flags &= ~HS_FLAG_ALLOWEMPTY; diff --git a/tools/hscollider/pcre_util.h b/tools/hscollider/pcre_util.h index 877588735..4355579b3 100644 --- a/tools/hscollider/pcre_util.h +++ b/tools/hscollider/pcre_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,7 +35,8 @@ * Returns false if an unknown hyperscan flag is encountered. 
*/ bool getPcreFlags(unsigned int hs_flags, unsigned int *pcre_flags, - bool *highlander, bool *prefilter, bool *som); + bool *highlander, bool *prefilter, bool *som, + bool *combination = nullptr, bool *quiet = nullptr); #endif /* PCRE_UTIL_H */ diff --git a/tools/hscollider/sig.cpp b/tools/hscollider/sig.cpp index b48be98a9..dc8151400 100644 --- a/tools/hscollider/sig.cpp +++ b/tools/hscollider/sig.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,7 +36,7 @@ #include #include -#ifdef HAVE_SIGACTION +#if defined(HAVE_SIGACTION) || defined(_WIN32) #include #endif @@ -56,8 +56,12 @@ TLS_VARIABLE volatile size_t debug_corpus_len = 0; extern std::string g_cmdline; -#ifdef HAVE_SIGACTION +#if defined(_WIN32) +static void __cdecl sighandler(int signum) { +#elif defined(HAVE_SIGACTION) static void sighandler(int signum) { +#endif +#if defined(HAVE_SIGACTION) || defined(_WIN32) /* NOTE: This signal handler is designed solely to provide more information * when a crash occurs in ue2collider -- it makes calls to signal-unsafe * functions like printf() and backtrace() by design, since we're already @@ -141,7 +145,13 @@ static void sighandler(int signum) { #endif // HAVE_SIGACTION void installSignalHandler(void) { -#ifdef HAVE_SIGACTION + +#ifdef _WIN32 + signal(SIGABRT, sighandler); + signal(SIGFPE, sighandler); + signal(SIGILL, sighandler); + signal(SIGSEGV, sighandler); +#elif defined(HAVE_SIGACTION) struct sigaction act; memset(&act, 0, sizeof(act)); act.sa_handler = sighandler; diff --git a/tools/hscollider/sig.h b/tools/hscollider/sig.h index fc6438269..4b24e95f6 100644 --- a/tools/hscollider/sig.h +++ b/tools/hscollider/sig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution 
and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,7 +40,11 @@ #define STAGE_GRAPH_COMPILE 6 #define STAGE_GRAPH_RUN 7 +#ifndef WIN32 #define TLS_VARIABLE __thread +#else +#define TLS_VARIABLE __declspec(thread) +#endif extern TLS_VARIABLE volatile int debug_stage; extern TLS_VARIABLE volatile int debug_expr; diff --git a/tools/hsdump/CMakeLists.txt b/tools/hsdump/CMakeLists.txt index c3db52353..4350b0f6d 100644 --- a/tools/hsdump/CMakeLists.txt +++ b/tools/hsdump/CMakeLists.txt @@ -3,10 +3,6 @@ if (NOT DUMP_SUPPORT) return() endif () -if (WIN32) - return() -endif () - include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/util) diff --git a/tools/hsdump/main.cpp b/tools/hsdump/main.cpp index 53a72d208..3221d1b69 100644 --- a/tools/hsdump/main.cpp +++ b/tools/hsdump/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,9 +58,19 @@ #include #include +#ifndef _WIN32 #include +#else +#include "win_getopt.h" +#endif #include + +#ifndef _WIN32 #include +#else +#include +#define stat _stat +#endif #include @@ -318,6 +328,7 @@ u32 buildDumpFlags(void) { return flags; } +#ifndef _WIN32 static void clearDir(const string &path) { DIR *dir = opendir(path.c_str()); @@ -341,15 +352,54 @@ void clearDir(const string &path) { } closedir(dir); } +#else // windows +static +void clearDir(const string &path) { + WIN32_FIND_DATA ffd; + HANDLE hFind = INVALID_HANDLE_VALUE; + string glob = path + "/*"; + hFind = FindFirstFile(glob.c_str(), &ffd); + if (hFind == INVALID_HANDLE_VALUE) { + printf("ERROR: couldn't open location %s\n", path.c_str()); + exit(1); + } + do { + string basename(ffd.cFileName); + string fname(path); + fname.push_back('/'); + 
fname.append(basename); + + // Ignore '.' and '..' + if (basename == "." || basename == "..") { + continue; + } + + if (!DeleteFile(fname.c_str())) { + printf("ERROR: couldn't remove file %s\n", fname.c_str()); + } + + } while (FindNextFile(hFind, &ffd) != 0); + FindClose(hFind); +} +#endif + +static +int makeDirectory(const string &dirName) { +#ifndef _WIN32 + mode_t mode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IXGRP | + S_IROTH | S_IXOTH; + return mkdir(dirName.c_str(), mode); +#else + return _mkdir(dirName.c_str()); +#endif +} static void prepareDumpLoc(string parent, string path, u32 flags, Grey &grey) { struct stat st; if (stat(parent.c_str(), &st)) { // Create dump location if not found - mode_t mode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IXGRP | - S_IROTH | S_IXOTH; - if (mkdir(parent.c_str(), mode) < 0) { + if (makeDirectory(parent) < 0) { printf("ERROR: could not create dump location %s: %s\n", parent.c_str(), strerror(errno)); exit(1); @@ -365,9 +415,7 @@ void prepareDumpLoc(string parent, string path, u32 flags, Grey &grey) { path = parent.append(path); if (stat(path.c_str(), &st)) { // Create dump location if not found - mode_t mode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IXGRP | - S_IROTH | S_IXOTH; - if (mkdir(path.c_str(), mode) < 0) { + if (makeDirectory(path) < 0) { printf("ERROR: could not create dump location %s: %s\n", path.c_str(), strerror(errno)); exit(1); @@ -546,7 +594,7 @@ unsigned int dumpData(const ExpressionMap &exprMap, Grey &grey) { return dumpDataMulti(patterns, flags, ids, ext, grey); } -int main(int argc, char *argv[]) { +int HS_CDECL main(int argc, char *argv[]) { Grey grey; grey.dumpFlags = Grey::DUMP_BASICS; diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index 06cddebda..32e014508 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -1,6 +1,13 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") +if(CMAKE_C_FLAGS MATCHES "/Gv" ) + 
string(REPLACE "/Gv" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") +endif() +if(CMAKE_CXX_FLAGS MATCHES "/Gv" ) + string(REPLACE "/Gv" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +endif() + set(gtest_SOURCES gtest/gtest-all.cc gtest/gtest.h) include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}) @@ -31,6 +38,10 @@ endif() add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${PROJECT_SOURCE_DIR}) +if (WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4309 /wd4018") +endif() + set(unit_hyperscan_SOURCES ${gtest_SOURCES} hyperscan/allocators.cpp @@ -42,6 +53,7 @@ set(unit_hyperscan_SOURCES hyperscan/extparam.cpp hyperscan/identical.cpp hyperscan/literals.cpp + hyperscan/logical_combination.cpp hyperscan/main.cpp hyperscan/multi.cpp hyperscan/order.cpp @@ -77,6 +89,7 @@ set(unit_internal_SOURCES internal/flat_set.cpp internal/flat_map.cpp internal/graph.cpp + internal/graph_undirected.cpp internal/insertion_ordered.cpp internal/lbr.cpp internal/limex_nfa.cpp @@ -121,22 +134,58 @@ set_target_properties(unit-internal PROPERTIES COMPILE_FLAGS "${HS_CXX_FLAGS}") target_link_libraries(unit-internal hs corpusomatic) endif(NOT (RELEASE_BUILD OR FAT_RUNTIME)) -# -# build target to run unit tests -# -if (NOT RELEASE_BUILD) -add_custom_target( - unit - COMMAND bin/unit-internal - COMMAND bin/unit-hyperscan - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - DEPENDS unit-internal unit-hyperscan -) -else () -add_custom_target( - unit - COMMAND bin/unit-hyperscan - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - DEPENDS unit-hyperscan -) +if (BUILD_CHIMERA) + # enable Chimera unit tests + set(unit_chimera_SOURCES + ${gtest_SOURCES} + chimera/allocators.cpp + chimera/arg_checks.cpp + chimera/bad_patterns.cpp + chimera/compat.cpp + chimera/main.cpp + chimera/scan.cpp + ) + add_executable(unit-chimera ${unit_chimera_SOURCES}) + target_link_libraries(unit-chimera chimera hs pcre) + # + # build target to run unit tests + # + if (NOT RELEASE_BUILD) + add_custom_target( + unit 
+ COMMAND bin/unit-internal + COMMAND bin/unit-hyperscan + COMMAND bin/unit-chimera + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + DEPENDS unit-internal unit-hyperscan unit-chimera + ) + else () + add_custom_target( + unit + COMMAND bin/unit-hyperscan + COMMAND bin/unit-chimera + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + DEPENDS unit-hyperscan unit-chimera + ) + endif() +else() + # + # build target to run unit tests + # + if (NOT RELEASE_BUILD) + add_custom_target( + unit + COMMAND bin/unit-internal + COMMAND bin/unit-hyperscan + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + DEPENDS unit-internal unit-hyperscan + ) + else () + add_custom_target( + unit + COMMAND bin/unit-hyperscan + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + DEPENDS unit-hyperscan + ) + endif() endif() diff --git a/unit/chimera/allocators.cpp b/unit/chimera/allocators.cpp new file mode 100644 index 000000000..bfceba767 --- /dev/null +++ b/unit/chimera/allocators.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "gtest/gtest.h" +#include "chimera/ch.h" + +#include +#include + +using std::string; + +static void *null_malloc(size_t) { return nullptr; } + +// Helper: correctly construct a simple database. +static +void makeDatabase(ch_database_t **hydb) { + static const char *expr[] = { "foobar" }; + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + err = ch_compile_multi(expr, nullptr, nullptr, 1, 0, nullptr, &db, + &compile_err); + + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + *hydb = db; +} + +TEST(HybridAllocator, DatabaseInfoBadAlloc) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ASSERT_TRUE(db != nullptr); + + ch_set_allocator(null_malloc, nullptr); + + char *info = nullptr; + ch_error_t err = ch_database_info(db, &info); + ASSERT_EQ(CH_NOMEM, err); + + ch_set_allocator(nullptr, nullptr); + ch_free_database(db); +} + +static +void * two_aligned_malloc(size_t len) { + void *mem = malloc(len + 2); + if (!mem) { + return nullptr; + } + return (char *)mem + 2; +} + +static +void two_aligned_free(void *mem) { + if (!mem) { + return; + } + // Allocated with two_aligned_malloc above. 
+ free((char *)mem - 2); +} + +TEST(HybridAllocator, TwoAlignedCompile) { + ch_set_database_allocator(two_aligned_malloc, two_aligned_free); + + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + const hs_platform_info_t *platform = nullptr; + ch_error_t err = + ch_compile("foobar", 0, CH_MODE_GROUPS, platform, &db, &compile_err); + ASSERT_EQ(CH_COMPILER_ERROR, err); + ASSERT_EQ(nullptr, db); + ASSERT_NE(nullptr, compile_err); + ch_free_compile_error(compile_err); + ch_set_database_allocator(nullptr, nullptr); +} + +TEST(HybridAllocator, TwoAlignedCompileError) { + ch_set_misc_allocator(two_aligned_malloc, two_aligned_free); + + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + const hs_platform_info_t *platform = nullptr; + ch_error_t err = + ch_compile("\\1", 0, CH_MODE_GROUPS, platform, &db, &compile_err); + ASSERT_EQ(CH_COMPILER_ERROR, err); + ASSERT_EQ(nullptr, db); + ASSERT_NE(nullptr, compile_err); + EXPECT_STREQ("Allocator returned misaligned memory.", compile_err->message); + ch_free_compile_error(compile_err); + ch_set_database_allocator(nullptr, nullptr); + ch_set_misc_allocator(nullptr, nullptr); +} + +TEST(HybridAllocator, TwoAlignedDatabaseInfo) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + ch_set_misc_allocator(two_aligned_malloc, two_aligned_free); + + char *info = nullptr; + ch_error_t err = ch_database_info(db, &info); + ASSERT_EQ(CH_BAD_ALLOC, err); + + ch_set_misc_allocator(nullptr, nullptr); + ch_free_database(db); +} + +TEST(HybridAllocator, TwoAlignedAllocScratch) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + ch_set_scratch_allocator(two_aligned_malloc, two_aligned_free); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_BAD_ALLOC, err); + + ch_set_scratch_allocator(nullptr, nullptr); + ch_free_database(db); +} diff --git a/unit/chimera/arg_checks.cpp b/unit/chimera/arg_checks.cpp new file mode 100644 index 
000000000..ea1cda153 --- /dev/null +++ b/unit/chimera/arg_checks.cpp @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "gtest/gtest.h" +#include "chimera/ch.h" + +static char garbage[] = "TEST(HybridArgChecks, DatabaseSizeNoDatabase) {" \ + " size_t sz = ch_database_size(0);" \ + " ASSERT_EQ(0, sz);"; + +namespace /* anonymous */ { + +// Dummy callback: does nothing, returns 0 (keep matching) +ch_callback_t dummyHandler(unsigned, unsigned long long, + unsigned long long, unsigned, unsigned, + const ch_capture_t *, void *) { + // empty + return CH_CALLBACK_CONTINUE; +} + +// Helper: correctly construct a simple database. +static +void makeDatabase(ch_database_t **hydb) { + static const char *expr[] = { "foo.*bar" }; + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + err = ch_compile_multi(expr, nullptr, nullptr, 1, 0, nullptr, &db, + &compile_err); + + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + *hydb = db; +} + +// Helper: given a database, build me some scratch. +static +void makeScratch(const ch_database_t *db, + ch_scratch_t **scratch) { + ch_error_t err = ch_alloc_scratch(db, scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(*scratch != nullptr); +} + +// Break the magic number of the given database. +void breakDatabaseMagic(ch_database *db) { + // database magic should be 0xdbdb at the start + ASSERT_TRUE(memcmp("\xde\xde", db, 2) == 0); + *(char *)db = 0xdc; +} + +// Break the version number of the given database. +void breakDatabaseVersion(ch_database *db) { + // database version is the second u32 + *((char *)db + 4) += 1; +} + +// Check that CH_version gives us a reasonable string back +TEST(HybridArgChecks, Version) { + const char *version = ch_version(); + ASSERT_TRUE(version != nullptr); + ASSERT_TRUE(version[0] >= '0' && version[0] <= '9') + << "First byte should be a digit."; + ASSERT_EQ('.', version[1]) << "Second byte should be a dot."; +} + +// ch_compile: Hand the compiler a bogus flag. 
+TEST(HybridArgChecks, SingleBogusFlags) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badflags[] = { + 0xffffffff, + 16, + 128, + 256, + 512, + }; + + for (size_t i = 0; i < sizeof(badflags)/sizeof(badflags[0]); i++) { + const char expr[] = "foobar"; + err = ch_compile(expr, badflags[i], 0, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Unrecognized flag used.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile: Hand the compiler a bogus mode. +TEST(HybridArgChecks, SingleBogusMode) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badModes[] = { + 0xffffffff, + 1, + 2, + CH_MODE_GROUPS << 1, // this was our largest mode flag + }; + + for (size_t i = 0; i < sizeof(badModes)/sizeof(badModes[0]); i++) { + const char expr[] = "foobar"; + err = ch_compile(expr, 0, badModes[i], nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Invalid mode flag supplied.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile: Compile a nullptr pattern +TEST(HybridArgChecks, SingleCompileBlockNoPattern) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + err = ch_compile(nullptr, 0, 0, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile: Compile a pattern to a nullptr database ptr +TEST(HybridArgChecks, SingleCompileBlockNoDatabase) { + ch_compile_error_t *compile_err = nullptr; + const char expr[] = "foobar"; + ch_error_t err; + err = ch_compile(expr, 0, 0, nullptr, nullptr, 
&compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_multi: Hand the compiler a bogus flag. +TEST(HybridArgChecks, MultiBogusFlags) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badflags[] = { + 0xffffffff, + 16, // HS_FLAG_ERROREOD + 128, + 256, + 512, + }; + + for (size_t i = 0; i < sizeof(badflags)/sizeof(badflags[0]); i++) { + const char *expr[] = { "foobar" }; + err = ch_compile_multi(expr, &badflags[i], nullptr, 1, 0, nullptr, &db, + &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Unrecognized flag used.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile_multi: Hand the compiler a bogus mode. +TEST(HybridArgChecks, MultiBogusMode) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badModes[] = { + 0xffffffff, + 1, + 2, + CH_MODE_GROUPS << 1, // this was our largest mode flag + }; + + for (size_t i = 0; i < sizeof(badModes)/sizeof(badModes[0]); i++) { + const char *expr[] = { "foobar" }; + err = ch_compile_multi(expr, nullptr, nullptr, 1, badModes[i], nullptr, + &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Invalid mode flag supplied.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile_multi: Compile a nullptr pattern set (block mode) +TEST(HybridArgChecks, MultiCompileBlockNoPattern) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + err = ch_compile_multi(nullptr, nullptr, nullptr, 1, 0, nullptr, &db, + &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + 
EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_multi: Compile a set of zero patterns +TEST(HybridArgChecks, MultiCompileZeroPatterns) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + const char *expr[] = {"foobar"}; + ch_error_t err; + err = ch_compile_multi(expr, nullptr, nullptr, 0, 0, nullptr, &db, + &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_multi: Compile a pattern to a nullptr database ptr +TEST(HybridArgChecks, MultiCompileBlockNoDatabase) { + ch_compile_error_t *compile_err = nullptr; + const char *expr[] = {"foobar"}; + ch_error_t err; + err = ch_compile_multi(expr, nullptr, nullptr, 1, 0, nullptr, nullptr, + &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_ext_multi: Hand the compiler a bogus flag. +TEST(HybridArgChecks, ExtMultiBogusFlags) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badflags[] = { + 0xffffffff, + 16, // HS_FLAG_ERROREOD + 128, + 256, + 512, + }; + + for (size_t i = 0; i < sizeof(badflags)/sizeof(badflags[0]); i++) { + const char *expr[] = { "foobar" }; + err = ch_compile_ext_multi(expr, &badflags[i], nullptr, 1, 0, + 10000000, 8000, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Unrecognized flag used.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile_ext_multi: Hand the compiler a bogus mode.
+TEST(HybridArgChecks, ExtMultiBogusMode) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badModes[] = { + 0xffffffff, + 1, + 2, + CH_MODE_GROUPS << 1, // this was our largest mode flag + }; + + for (size_t i = 0; i < sizeof(badModes)/sizeof(badModes[0]); i++) { + const char *expr[] = { "foobar" }; + err = ch_compile_ext_multi(expr, nullptr, nullptr, 1, badModes[i], + 10000000, 8000, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Invalid mode flag supplied.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile_ext_multi: Compile a nullptr pattern set (block mode) +TEST(HybridArgChecks, ExtMultiCompileBlockNoPattern) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + err = ch_compile_ext_multi(nullptr, nullptr, nullptr, 1, 0, 10000000, + 8000, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_ext_multi: Compile a set of zero patterns +TEST(HybridArgChecks, ExtMultiCompileZeroPatterns) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + const char *expr[] = {"foobar"}; + ch_error_t err; + err = ch_compile_ext_multi(expr, nullptr, nullptr, 0, 0, 10000000, + 8000, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_ext_multi: Compile a pattern to a nullptr database ptr +TEST(HybridArgChecks, ExtMultiCompileBlockNoDatabase) { + ch_compile_error_t *compile_err = nullptr; + const char *expr[] = {"foobar"}; + ch_error_t err; + err = ch_compile_ext_multi(expr, nullptr, nullptr, 1, 0, 10000000, + 8000, nullptr, 
nullptr, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_scan: Call with no database +TEST(HybridArgChecks, ScanBlockNoDatabase) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ch_scratch_t *scratch = nullptr; + makeScratch(db, &scratch); + + ch_error_t err = ch_scan(nullptr, "data", 4, 0, scratch, + dummyHandler, nullptr, nullptr); + ASSERT_NE(CH_SUCCESS, err); + EXPECT_NE(CH_SCAN_TERMINATED, err); + + // teardown + err = ch_free_scratch(scratch); + ASSERT_EQ(CH_SUCCESS, err); + ch_free_database(db); +} + +// ch_scan: Call with a database with broken magic +TEST(HybridArgChecks, ScanBlockBrokenDatabaseMagic) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ch_scratch_t *scratch = nullptr; + makeScratch(db, &scratch); + + // break the database here, after scratch alloc + breakDatabaseMagic(db); + + ch_error_t err = ch_scan(db, "data", 4, 0, scratch, + dummyHandler, nullptr, nullptr); + ASSERT_EQ(CH_INVALID, err); + + // teardown + err = ch_free_scratch(scratch); + ASSERT_EQ(CH_SUCCESS, err); + free(db); +} + +// ch_scan: Call with a database with broken version +TEST(HybridArgChecks, ScanBlockBrokenDatabaseVersion) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ch_scratch_t *scratch = nullptr; + makeScratch(db, &scratch); + + // break the database here, after scratch alloc + breakDatabaseVersion(db); + + ch_error_t err = ch_scan(db, "data", 4, 0, scratch, + dummyHandler, nullptr, nullptr); + ASSERT_EQ(CH_DB_VERSION_ERROR, err); + + // teardown + err = ch_free_scratch(scratch); + ASSERT_EQ(CH_SUCCESS, err); + ch_free_database(db); +} + +// ch_scan: Call with no data +TEST(HybridArgChecks, ScanBlockNoData) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ch_scratch_t *scratch = nullptr; + makeScratch(db, &scratch); + + ch_error_t err = ch_scan(db, nullptr, 4, 0, scratch, dummyHandler, + nullptr, nullptr); + ASSERT_NE(CH_SUCCESS, err); + 
EXPECT_NE(CH_SCAN_TERMINATED, err); + + // teardown + err = ch_free_scratch(scratch); + ASSERT_EQ(CH_SUCCESS, err); + ch_free_database(db); +} + +// ch_scan: Call with no scratch +TEST(HybridArgChecks, ScanBlockNoScratch) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + ch_error_t err = ch_scan(db, "data", 4, 0, nullptr, dummyHandler, + nullptr, nullptr); + ASSERT_NE(CH_SUCCESS, err); + EXPECT_NE(CH_SCAN_TERMINATED, err); + + // teardown + ch_free_database(db); +} + +// ch_scan: Call with no event handler +TEST(HybridArgChecks, ScanBlockNoHandler) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ch_scratch_t *scratch = nullptr; + makeScratch(db, &scratch); + + ch_error_t err = ch_scan(db, "data", 4, 0, scratch, nullptr, nullptr, + nullptr); + ASSERT_EQ(CH_SUCCESS, err); + EXPECT_NE(CH_SCAN_TERMINATED, err); + + // teardown + err = ch_free_scratch(scratch); + ASSERT_EQ(CH_SUCCESS, err); + ch_free_database(db); +} + +// ch_alloc_scratch: Call with no database +TEST(HybridArgChecks, AllocScratchNoDatabase) { + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(nullptr, &scratch); + EXPECT_NE(CH_SUCCESS, err); + EXPECT_TRUE(scratch == nullptr); +} + +// ch_alloc_scratch: Call with nullptr ptr-to-scratch +TEST(HybridArgChecks, AllocScratchNullScratchPtr) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + ch_error_t err = ch_alloc_scratch(db, nullptr); + ASSERT_EQ(CH_INVALID, err); + + // teardown + ch_free_database(db); +} + +// ch_alloc_scratch: Call with bogus scratch +TEST(HybridArgChecks, AllocScratchBogusScratch) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + ch_scratch_t *blah = (ch_scratch_t *)malloc(100); + memset(blah, 0xf0, 100); + ch_error_t err = ch_alloc_scratch(db, &blah); + ASSERT_EQ(CH_INVALID, err); + + // teardown + free(blah); + ch_free_database(db); +} + +// ch_alloc_scratch: Call with broken database magic +TEST(HybridArgChecks, AllocScratchBadDatabaseMagic) { + ch_database_t *db = nullptr; + 
makeDatabase(&db); + + breakDatabaseMagic(db); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_INVALID, err); + + // teardown + free(db); +} + +// ch_alloc_scratch: Call with broken database version +TEST(HybridArgChecks, AllocScratchBadDatabaseVersion) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + breakDatabaseVersion(db); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_DB_VERSION_ERROR, err); + + // teardown + ch_free_database(db); +} + +// ch_clone_scratch: Call with no source scratch +TEST(HybridArgChecks, CloneScratchNoSource) { + ch_scratch_t *scratch = nullptr, *scratch2 = nullptr; + ch_error_t err = ch_clone_scratch(scratch, &scratch2); + EXPECT_NE(CH_SUCCESS, err); + EXPECT_TRUE(scratch2 == nullptr); +} + +// ch_database_size: Call with no database +TEST(HybridArgChecks, DatabaseSizeNoDatabase) { + size_t sz = 0; + ch_error_t err = ch_database_size(0, &sz); + ASSERT_EQ(CH_INVALID, err); + ASSERT_EQ(0U, sz); +} + +// ch_clone_scratch: bad scratch arg +TEST(HybridArgChecks, CloneBadScratch) { + // Try cloning the scratch + void *local_garbage = malloc(sizeof(garbage)); + memcpy(local_garbage, garbage, sizeof(garbage)); + ch_scratch_t *cloned = nullptr; + ch_scratch_t *scratch = (ch_scratch_t *)local_garbage; + ch_error_t err = ch_clone_scratch(scratch, &cloned); + free(local_garbage); + ASSERT_EQ(CH_INVALID, err); +} + +// ch_scan: bad scratch arg +TEST(HybridArgChecks, ScanBadScratch) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + void *local_garbage = malloc(sizeof(garbage)); + memcpy(local_garbage, garbage, sizeof(garbage)); + + ch_scratch_t *scratch = (ch_scratch_t *)local_garbage; + ch_error_t err = ch_scan(db, "data", 4, 0, scratch, + dummyHandler, nullptr, nullptr); + free(local_garbage); + ASSERT_EQ(CH_INVALID, err); + + // teardown + ch_free_database(db); +} + +TEST(HybridArgChecks, ch_free_database_null) { + 
ch_error_t err = ch_free_database(nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +TEST(HybridArgChecks, ch_free_database_garbage) { + ch_error_t err = ch_free_database((ch_database_t *)garbage); + ASSERT_EQ(CH_INVALID, err); +} + +TEST(HybridArgChecks, ch_free_scratch_null) { + ch_error_t err = ch_free_scratch(nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +TEST(HybridArgChecks, ch_free_scratch_garbage) { + ch_error_t err = ch_free_scratch((ch_scratch_t *)garbage); + ASSERT_EQ(CH_INVALID, err); +} + +TEST(HybridArgChecks, ch_free_compile_error_null) { + ch_error_t err = ch_free_compile_error(nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +} // namespace + diff --git a/unit/chimera/bad_patterns.cpp b/unit/chimera/bad_patterns.cpp new file mode 100644 index 000000000..0e6ce5d90 --- /dev/null +++ b/unit/chimera/bad_patterns.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "gtest/gtest.h" +#include "chimera/ch.h" + +using namespace testing; + +class HybridCompile : public TestWithParam<const char *> { + // no fixture state; GetParam() yields the pattern under test +}; + +TEST_P(HybridCompile, BadPattern) { + ch_error_t err; + ch_compile_error_t *compile_err = nullptr; + const char *pattern = GetParam(); + ch_database_t *db = nullptr; + + err = ch_compile_multi(&pattern, nullptr, nullptr, 1, 0, nullptr, &db, + &compile_err); + ASSERT_NE(CH_SUCCESS, err) << "Compile should have failed for expr: " + << pattern; + ASSERT_TRUE(db == nullptr); + ASSERT_TRUE(compile_err != nullptr); + + ch_free_compile_error(compile_err); +} + +static +const char * BAD_PATTERNS[] = { + // unmatched parens + "(foo", + "foo)", + "((foo)", + "(foo))", + // nothing to repeat + "a+++", + "a+?+", + "a???", + "a??+", + "?qa", + "*abc", + "+abc", + // repeating boundaries is not allowed (UE-1007) + "^?0", + "^*0", + "^+0", + "^{1,3}0", + "0$?", + "0$*", + "0$+", + "0${1,3}", + // char classes + "[]", + "[]foobar", + "[`-\\80", + // bad named classes + "[[:foo:]]", + "[[:1234:]]", + "[[:f\\oo:]]", + "[[: :]]", + "[[:...:]]", + "[[:l\\ower:]]", + "[[:abc\\:]]", + "[abc[:x\\]pqr:]]", + "[[:a\\dz:]]", + "foobar\\", // trailing unescaped backslash +}; + +INSTANTIATE_TEST_CASE_P(Compile, HybridCompile, ValuesIn(BAD_PATTERNS)); diff --git a/unit/chimera/compat.cpp b/unit/chimera/compat.cpp new file mode 100644 index 000000000..2599656ac --- /dev/null +++ b/unit/chimera/compat.cpp @@ -0,0
+1,56 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "gtest/gtest.h" +#include "chimera/ch.h" +#include "hs.h" + +// We currently depend on our common (meaning) hash defines having the same +// values. 
+TEST(HybridCompat, Defines) { + // flags + EXPECT_EQ(HS_FLAG_CASELESS, CH_FLAG_CASELESS); + EXPECT_EQ(HS_FLAG_DOTALL, CH_FLAG_DOTALL); + EXPECT_EQ(HS_FLAG_MULTILINE, CH_FLAG_MULTILINE); + EXPECT_EQ(HS_FLAG_SINGLEMATCH, CH_FLAG_SINGLEMATCH); + EXPECT_EQ(HS_FLAG_UTF8, CH_FLAG_UTF8); + EXPECT_EQ(HS_FLAG_UCP, CH_FLAG_UCP); + + // errors + EXPECT_EQ(HS_SUCCESS, CH_SUCCESS); + EXPECT_EQ(HS_INVALID, CH_INVALID); + EXPECT_EQ(HS_NOMEM, CH_NOMEM); + EXPECT_EQ(HS_SCAN_TERMINATED, CH_SCAN_TERMINATED); + EXPECT_EQ(HS_COMPILER_ERROR, CH_COMPILER_ERROR); + EXPECT_EQ(HS_DB_VERSION_ERROR, CH_DB_VERSION_ERROR); + EXPECT_EQ(HS_DB_PLATFORM_ERROR, CH_DB_PLATFORM_ERROR); + EXPECT_EQ(HS_DB_MODE_ERROR, CH_DB_MODE_ERROR); + EXPECT_EQ(HS_BAD_ALIGN, CH_BAD_ALIGN); + EXPECT_EQ(HS_BAD_ALLOC, CH_BAD_ALLOC); + EXPECT_EQ(HS_SCRATCH_IN_USE, CH_SCRATCH_IN_USE); +} diff --git a/unit/chimera/main.cpp b/unit/chimera/main.cpp new file mode 100644 index 000000000..9ab663c35 --- /dev/null +++ b/unit/chimera/main.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "gtest/gtest.h" + +// Driver: run all the tests (defined in other source files in this directory) +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/unit/chimera/scan.cpp b/unit/chimera/scan.cpp new file mode 100644 index 000000000..b1dd73b25 --- /dev/null +++ b/unit/chimera/scan.cpp @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string> +#include <vector> + +#include "gtest/gtest.h" +#include "chimera/ch.h" + +using namespace std; +using namespace testing; + +namespace { + +class HybridScanParams { +public: + HybridScanParams() {} + HybridScanParams(const char *s, unsigned int f) + : patterns(1, s), flags(1, f) {} + + void add(const char *pattern, unsigned int myflags) { + patterns.push_back(pattern); + flags.push_back(myflags); + } + + size_t size() const { + return patterns.size(); + } + + const char * const * getPatterns() const { + return &patterns[0]; + } + + const unsigned int * getFlags() const { + return &flags[0]; + } + +private: + vector<const char *> patterns; // one entry per pattern, parallel to flags + vector<unsigned int> flags; // compile flags for patterns[i] +}; + +static +vector<HybridScanParams> paramFactory() { + vector<HybridScanParams> hsp; + + // Some simple single-pattern cases. + hsp.push_back(HybridScanParams(".", CH_FLAG_DOTALL)); + hsp.push_back(HybridScanParams("foobar", 0)); + hsp.push_back(HybridScanParams("foo.*bar", 0)); + hsp.push_back(HybridScanParams("fred.*bill", CH_FLAG_DOTALL)); + hsp.push_back(HybridScanParams(".*", 0)); // vacuosity! 
+ hsp.push_back(HybridScanParams("\\A(.?.{7,27}jf[tmqq]l(f|t|hgmr.+.fg|abks)){3,7}", 0)); + hsp.push_back(HybridScanParams("^begin", CH_FLAG_MULTILINE)); + hsp.push_back(HybridScanParams("match", CH_FLAG_SINGLEMATCH)); + + // Single-pattern cases where the pattern isn't supported by hyperscan but + // can be prefiltered. + hsp.push_back(HybridScanParams("foo(?!bar)", 0)); + hsp.push_back(HybridScanParams("(sens|respons)e and \\1ibility", 0)); + + // A case that can't be prefiltered (as of this writing) because it's too + // gosh-darned big. This tests that the hybrid matcher can run without the + // multi-matcher (or with a "fake" one). + hsp.push_back(HybridScanParams("((c(p|p)h{2,}bh.|p|((((cq|j|c|(\\b)|.[^nbgn]|(\\B)[qfh]a)){10,12}|ih|a|mnde[pa].|.g)){5,8})){3}", 0)); + + // Simple multi-pattern literal case. + hsp.push_back(HybridScanParams()); + hsp.back().add("hatstand", 0); + hsp.back().add("teakettle", 0); + hsp.back().add("badgerbrush", 0); + hsp.back().add("mnemosyne", 0); + + // More complex multi-pattern case. + hsp.push_back(HybridScanParams()); + hsp.back().add("foo.{3,7}bar", 0); + hsp.back().add("foo.{30,70}bar", 0); + hsp.back().add("foobar.*foobar", 0); + hsp.back().add("^blingwrapper.*foo", 0); + hsp.back().add("[0-9a-f]{70,}\\n", 0); + + // A couple of trivial Unicode patterns, mostly to make sure we accept + // the flags. + hsp.push_back(HybridScanParams()); + hsp.back().add("foo.*bar", CH_FLAG_UTF8); + hsp.back().add("today", CH_FLAG_UTF8|CH_FLAG_UCP); + + // PCRE exotica. + hsp.push_back(HybridScanParams()); + hsp.back().add("benign literal", 0); + hsp.back().add("(?|(abc)|(def))\\1", 0); + hsp.back().add("(?|(abc)|(def))(?1)", 0); + hsp.back().add("(sens|respons)e and \\1ibility", 0); + hsp.back().add("\\w+(?=;)", 0); + hsp.back().add("foo(?!bar)", 0); + hsp.back().add("(?<=bullock|donkey)", 0); + + return hsp; +} + +// Dummy callback.
+static +ch_callback_t dummyHandler(unsigned, unsigned long long, unsigned long long, + unsigned, unsigned,const ch_capture_t *, void *) { + // empty + return CH_CALLBACK_CONTINUE; +} + +static +void checkGroups(unsigned int num, const ch_capture_t *captured) { + // We should have _some_ group info. + ASSERT_LT(0U, num); + ASSERT_TRUE(captured != nullptr); + + // Group 0 is always active. + ASSERT_TRUE(captured[0].flags & CH_CAPTURE_FLAG_ACTIVE); + + // Sanity-checking: every active group must satisfy from <= to. + for (unsigned int i = 0; i < num; i++) { + if (!(captured[i].flags & CH_CAPTURE_FLAG_ACTIVE)) { + continue; + } + ASSERT_LE(captured[i].from, captured[i].to) << "Group " << i + << " not sane."; + } +} + +// Dummy callback that checks that we had some groups set. +static +ch_callback_t dummyGroupHandler(unsigned, unsigned long long, + unsigned long long, unsigned, unsigned num, + const ch_capture_t *captured, void *) { + checkGroups(num, captured); + return CH_CALLBACK_CONTINUE; +} + +class HybridScan : public TestWithParam<tuple<HybridScanParams, bool>> { +protected: + virtual void SetUp() { + ch_error_t err; + ch_compile_error_t *compile_err = nullptr; + const HybridScanParams &hsp = get<0>(GetParam()); + groups = get<1>(GetParam()); + + err = ch_compile_ext_multi(hsp.getPatterns(), hsp.getFlags(), nullptr, + hsp.size(), groups ? 
CH_MODE_GROUPS : + CH_MODE_NOGROUPS, 10000000, 8000, + nullptr, &db, &compile_err); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + } + + virtual void TearDown() { + ch_free_database(db); + ch_free_scratch(scratch); + } + + ch_database_t *db = nullptr; + ch_scratch_t *scratch = nullptr; + bool groups; // second test parameter; selects CH_MODE_GROUPS in SetUp +}; + +static const string SCAN_DATA( + "Beware the Jabberwock, my son!\n" + "The jaws that bite, the claws that catch!\n" + "Beware the Jubjub bird, and shun\n" + "The frumious Bandersnatch!\n"); + +TEST_P(HybridScan, BuildAndScan) { + ASSERT_TRUE(db != nullptr); + + size_t sz; + ch_error_t err = ch_database_size(db, &sz); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_LT(16U, sz); + + ch_match_event_handler cb = groups ? dummyGroupHandler : dummyHandler; + + err = ch_scan(db, SCAN_DATA.c_str(), SCAN_DATA.length(), 0, + scratch, cb, nullptr, nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +TEST_P(HybridScan, ScanNearly4KData) { + ASSERT_TRUE(db != nullptr); + + string data(4000, '*'); // it's full of stars! + + // Insert some strings that will match a few patterns. + data.insert(278, "foo"); + data.insert(285, "bar"); + data.insert(1178, "foobar"); + data.insert(1894, "bar"); + data.insert(3000, "foobar"); + + ch_match_event_handler cb = groups ? dummyGroupHandler : dummyHandler; + + ch_error_t err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, cb, nullptr, nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +TEST_P(HybridScan, ScanBigData) { + ASSERT_TRUE(db != nullptr); + + // More than 4MB, as that pushes us into using PCRE for non-Pawn cases. + string data(5*1024*1024, '*'); // it's full of stars! + + // Insert some strings that will match a few patterns. 
+ data.insert(278, "foo"); + data.insert(285, "bar"); + data.insert(1178, "foobar"); + data.insert(1894, "bar"); + data.insert(3000, "foobar"); + + ch_match_event_handler cb = groups ? 
dummyGroupHandler : dummyHandler; + + ch_error_t err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, cb, nullptr, nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +TEST_P(HybridScan, ScanClonedScratch) { + ASSERT_TRUE(db != nullptr); + + ch_error_t err; + ch_scratch_t *clonedScratch = nullptr; + err = ch_clone_scratch(scratch, &clonedScratch); + ASSERT_EQ(CH_SUCCESS, err); + + ch_match_event_handler cb = groups ? dummyGroupHandler : dummyHandler; + + err = ch_scan(db, SCAN_DATA.c_str(), SCAN_DATA.length(), 0, + clonedScratch, cb, nullptr, nullptr); + ASSERT_EQ(CH_SUCCESS, err); + + ch_free_scratch(clonedScratch); +} + +TEST_P(HybridScan, DatabaseInfo) { + ASSERT_TRUE(db != nullptr); + + char *info = nullptr; + ch_error_t err = ch_database_info(db, &info); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(info != nullptr); + + const string strinfo(info); + const string prefix("Chimera "); + ASSERT_GE(strinfo.size(), prefix.size()); + ASSERT_EQ(prefix, strinfo.substr(0, prefix.size())); + + free(info); +} + +TEST_P(HybridScan, NonZeroScratchSize) { + ASSERT_TRUE(db != nullptr); + size_t curr_size; + ch_error_t err = ch_scratch_size(scratch, &curr_size); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_LT(0U, curr_size); +} + +INSTANTIATE_TEST_CASE_P(Scan, HybridScan, + Combine(ValuesIn(paramFactory()), Bool())); + +// Counting callback that returns CH_CALLBACK_CONTINUE. +static +ch_callback_t countHandler(unsigned, unsigned long long, unsigned long long, + unsigned, unsigned, const ch_capture_t *, + void *ctx) { + unsigned int *count = (unsigned int *)ctx; + ++(*count); + return CH_CALLBACK_CONTINUE; +} + +// Counting callback that returns CH_CALLBACK_SKIP_PATTERN. 
+static +ch_callback_t skipHandler(unsigned, unsigned long long, unsigned long long, + unsigned, unsigned, const ch_capture_t *, + void *ctx) { + unsigned int *count = (unsigned int *)ctx; + ++(*count); + return CH_CALLBACK_SKIP_PATTERN; +} + +// Counting callback that returns CH_CALLBACK_TERMINATE.
+static +ch_callback_t terminateHandler(unsigned, unsigned long long, unsigned long long, + unsigned, unsigned, const ch_capture_t *, + void *ctx) { + unsigned int *count = (unsigned int *)ctx; + ++(*count); + return CH_CALLBACK_TERMINATE; +} + +static +void makeDatabase(ch_database_t **db, const char * const expr[], size_t num) { + *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err = ch_compile_ext_multi(expr, nullptr, nullptr, num, 0, + 10000000, 8000, nullptr, db, + &compile_err); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(*db != nullptr); +} + +struct RescanContext { + RescanContext(const ch_database_t *db_in, ch_scratch_t *scratch_in) + : db(db_in), scratch(scratch_in) {} + const ch_database_t *db; + ch_scratch_t *scratch; + size_t matches = 0; +}; + +static +ch_callback_t rescan_block_cb(unsigned, unsigned long long, unsigned long long, unsigned, + unsigned, const ch_capture_t *, void *ctx) { + RescanContext *rctx = (RescanContext *)ctx; + rctx->matches++; + + const string data = "___foo___bar_"; + + ch_error_t err = ch_scan(rctx->db, data.c_str(), data.length(), 0, + rctx->scratch, nullptr, nullptr, nullptr); + EXPECT_EQ(CH_SCRATCH_IN_USE, err); // re-entrant scan on the same scratch must be refused + return CH_CALLBACK_CONTINUE; +} + +TEST(Scan, ScratchInUse) { + static const char * const expr[] = { "foo.*bar" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 1); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + RescanContext rc(db, scratch); + + const string data("___foo___bar_"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, rescan_block_cb, 0, &rc); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(1U, rc.matches); + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackSkip1) { + static const char * const expr[] = { "." 
}; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 1); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("qwertyuiop"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, skipHandler, 0, &count); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(1U, count); + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackSkip2) { + static const char * const expr[] = { "[a-z]+", "[0-9]" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("foo 0123 0 bar 39483 n34jfhlqekrcoi3q4"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, skipHandler, 0, &count); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(2U, count); // both patterns should match once + + ch_free_scratch(scratch); + ch_free_database(db); +} + +// This case includes a pattern that we use libpcre for.
+TEST(Scan, CallbackSkip3) { + static const char * const expr[] = { "[a-z]+", "foo(?!bar)" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("foobaz foobing foobar"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, skipHandler, 0, &count); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(2U, count); // both patterns should match once + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackNoSkip1) { + static const char * const expr[] = { "foo|bar", "[0-9]{3}" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("foo 012 bar 345 foobar 678"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, countHandler, 0, &count); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(7U, count); // seven matches in total + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackNoSkip2) { + static const char * const expr[] = { "foo(?!bar)", "[0-9]{3}" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("foo 012 bar 345 foobar 678"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, countHandler, 0, &count); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(4U, count); // four matches in total + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackTerm1) { + static const char * const expr[] = { "." 
}; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 1); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("qwertyuiop"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, terminateHandler, 0, &count); + ASSERT_EQ(CH_SCAN_TERMINATED, err); + ASSERT_EQ(1U, count); + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackTerm2) { + static const char * const expr[] = { "[a-z]+", "[0-9]" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != 0); + + unsigned int count = 0; + const string data("foo 0123 0 bar 39483 n34jfhlqekrcoi3q4"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, terminateHandler, 0, &count); + ASSERT_EQ(CH_SCAN_TERMINATED, err); + ASSERT_EQ(1U, count); + + ch_free_scratch(scratch); + ch_free_database(db); +} + +// This case includes a pattern that we use libpcre for. 
+TEST(Scan, CallbackTerm3) { + static const char * const expr[] = { "[a-z]+", "foo(?!bar)" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("foobaz foobing foobar"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, terminateHandler, 0, &count); + ASSERT_EQ(CH_SCAN_TERMINATED, err); + ASSERT_EQ(1U, count); + + ch_free_scratch(scratch); + ch_free_database(db); +} + +} // namespace diff --git a/unit/hyperscan/arg_checks.cpp b/unit/hyperscan/arg_checks.cpp index 0ff4ce5fd..2cbd08426 100644 --- a/unit/hyperscan/arg_checks.cpp +++ b/unit/hyperscan/arg_checks.cpp @@ -171,7 +171,9 @@ TEST(HyperscanArgChecks, SingleCompileBogusFlags) { nullptr, &db, &compile_err); EXPECT_EQ(HS_COMPILER_ERROR, err); EXPECT_TRUE(compile_err != nullptr); - EXPECT_STREQ("Unrecognised flag.", compile_err->message); + EXPECT_STREQ("only HS_FLAG_QUIET and HS_FLAG_SINGLEMATCH " + "are supported in combination " + "with HS_FLAG_COMBINATION.", compile_err->message); hs_free_compile_error(compile_err); } diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 7cc038340..6d4283dac 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -145,3 +145,21 @@ 148:/\QÀ\Eaaaa/8 #Expression is not valid UTF-8. 149:/[\QÀ\Eaaaa]/8 #Expression is not valid UTF-8. 150:/abcd/{edit_distance=1,hamming_distance=1} #In hs_expr_ext, cannot have both edit distance and Hamming distance. +151:/141 | abc/C #Unknown character at index 6. +152:/141 & | 142/C #Not enough operand at index 6. +153:/141 142 & 143/C #Not enough operator at index 13. +154:/141 !142/C #Not enough operator at index 8. +155:/141 & 142 |/C #Not enough operand at index 11. +156:/)141 & 142 /C #Not enough left parentheses at index 0. 
+157:/(141 & (142|!143) |144/C #Not enough right parentheses at index 22. +158:/141 & (142|!143) )| 144/C #Not enough left parentheses at index 17. +159:/1234567890 & (142|!143 )/C #Expression id too large at index 10. +160:/141 & (142|!143 )|/C #Not enough operand at index 18. +161:/!141/C #Has match from purely negative sub-expressions. +162:/!141 | 142 | 143/C #Has match from purely negative sub-expressions. +163:/!141 & !142 & !143/C #Has match from purely negative sub-expressions. +164:/(141 | !142 & !143)/C #Has match from purely negative sub-expressions. +165:/!(141 | 142 | 143)/C #Has match from purely negative sub-expressions. +166:/141/C #No logical operation. +167:/119 & 121/C #Unknown sub-expression id. +168:/166 & 167/C #Unknown sub-expression id. diff --git a/unit/hyperscan/logical_combination.cpp b/unit/hyperscan/logical_combination.cpp new file mode 100644 index 000000000..169de333b --- /dev/null +++ b/unit/hyperscan/logical_combination.cpp @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "hs.h" +#include "config.h" +#include "test_util.h" + +using namespace std; + +TEST(LogicalCombination, SingleComb1) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abcdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "(101 & 102 & 103) | (104 & !105)"}; + unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION}; + unsigned ids[] = {101, 102, 103, 104, 105, 1001}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(16U, c.matches.size()); + ASSERT_EQ(MatchRecord(3, 101), c.matches[0]); + ASSERT_EQ(MatchRecord(6, 102), c.matches[1]); + 
ASSERT_EQ(MatchRecord(18, 103), c.matches[2]); + ASSERT_EQ(MatchRecord(18, 1001), c.matches[3]); + ASSERT_EQ(MatchRecord(21, 101), c.matches[4]); + ASSERT_EQ(MatchRecord(21, 1001), c.matches[5]); + ASSERT_EQ(MatchRecord(25, 102), c.matches[6]); + ASSERT_EQ(MatchRecord(25, 1001), c.matches[7]); + ASSERT_EQ(MatchRecord(38, 104), c.matches[8]); + ASSERT_EQ(MatchRecord(38, 1001), c.matches[9]); + ASSERT_EQ(MatchRecord(39, 104), c.matches[10]); + ASSERT_EQ(MatchRecord(39, 1001), c.matches[11]); + ASSERT_EQ(MatchRecord(48, 105), c.matches[12]); + ASSERT_EQ(MatchRecord(48, 1001), c.matches[13]); + ASSERT_EQ(MatchRecord(53, 102), c.matches[14]); + ASSERT_EQ(MatchRecord(53, 1001), c.matches[15]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, SingleCombQuietSub1) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abcdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "(101 & 102 & 103) | (104 & !105)"}; + unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET, + HS_FLAG_QUIET, 0, HS_FLAG_COMBINATION}; + unsigned ids[] = {101, 102, 103, 104, 105, 1001}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(8U, c.matches.size()); + ASSERT_EQ(MatchRecord(18, 1001), c.matches[0]); + ASSERT_EQ(MatchRecord(21, 1001), c.matches[1]); + ASSERT_EQ(MatchRecord(25, 1001), c.matches[2]); + ASSERT_EQ(MatchRecord(38, 1001), c.matches[3]); + ASSERT_EQ(MatchRecord(39, 
1001), c.matches[4]); + ASSERT_EQ(MatchRecord(48, 105), c.matches[5]); + ASSERT_EQ(MatchRecord(48, 1001), c.matches[6]); + ASSERT_EQ(MatchRecord(53, 1001), c.matches[7]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, MultiCombQuietSub1) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abcdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "(101 & 102 & 103) | (104 & !105)", + "!101 & 102", "!(!101 | 102)", "101 & !102"}; + unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET, + HS_FLAG_QUIET, 0, HS_FLAG_COMBINATION, + HS_FLAG_COMBINATION, HS_FLAG_COMBINATION, + HS_FLAG_COMBINATION}; + unsigned ids[] = {101, 102, 103, 104, 105, 1001, 1002, 1003, 1004}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 9, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(10U, c.matches.size()); + ASSERT_EQ(MatchRecord(3, 1003), c.matches[0]); + ASSERT_EQ(MatchRecord(3, 1004), c.matches[1]); + ASSERT_EQ(MatchRecord(18, 1001), c.matches[2]); + ASSERT_EQ(MatchRecord(21, 1001), c.matches[3]); + ASSERT_EQ(MatchRecord(25, 1001), c.matches[4]); + ASSERT_EQ(MatchRecord(38, 1001), c.matches[5]); + ASSERT_EQ(MatchRecord(39, 1001), c.matches[6]); + ASSERT_EQ(MatchRecord(48, 105), c.matches[7]); + ASSERT_EQ(MatchRecord(48, 1001), c.matches[8]); + ASSERT_EQ(MatchRecord(53, 1001), c.matches[9]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, 
MultiHighlanderCombQuietSub1) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abcdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "(101 & 102 & 103) | (104 & !105)", + "!101 & 102", "!(!101 | 102)", "101 & !102"}; + unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET, + HS_FLAG_QUIET, 0, + HS_FLAG_COMBINATION | HS_FLAG_SINGLEMATCH, + HS_FLAG_COMBINATION, + HS_FLAG_COMBINATION | HS_FLAG_SINGLEMATCH, + HS_FLAG_COMBINATION | HS_FLAG_SINGLEMATCH}; + unsigned ids[] = {101, 102, 103, 104, 105, 1001, 1002, 1003, 1004}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 9, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(4U, c.matches.size()); + ASSERT_EQ(MatchRecord(3, 1003), c.matches[0]); + ASSERT_EQ(MatchRecord(3, 1004), c.matches[1]); + ASSERT_EQ(MatchRecord(18, 1001), c.matches[2]); + ASSERT_EQ(MatchRecord(48, 105), c.matches[3]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, MultiQuietCombQuietSub1) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abcdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "(101 & 102 & 103) | (104 & !105)", + "!101 & 102", "!(!101 | 102)", "101 & !102"}; + unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET, + HS_FLAG_QUIET, 0, HS_FLAG_COMBINATION | HS_FLAG_QUIET, + HS_FLAG_COMBINATION, 
HS_FLAG_COMBINATION, + HS_FLAG_COMBINATION | HS_FLAG_QUIET}; + unsigned ids[] = {101, 102, 103, 104, 105, 1001, 1002, 1003, 1004}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 9, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(2U, c.matches.size()); + ASSERT_EQ(MatchRecord(3, 1003), c.matches[0]); + ASSERT_EQ(MatchRecord(48, 105), c.matches[1]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, SingleComb2) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abbdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "(201 | 202 & 203) & (!204 | 205)"}; + unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION}; + unsigned ids[] = {201, 202, 203, 204, 205, 1002}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(13U, c.matches.size()); + ASSERT_EQ(MatchRecord(6, 202), c.matches[0]); + ASSERT_EQ(MatchRecord(18, 203), c.matches[1]); + ASSERT_EQ(MatchRecord(18, 1002), c.matches[2]); + ASSERT_EQ(MatchRecord(21, 201), c.matches[3]); + ASSERT_EQ(MatchRecord(21, 1002), c.matches[4]); + ASSERT_EQ(MatchRecord(25, 
202), c.matches[5]); + ASSERT_EQ(MatchRecord(25, 1002), c.matches[6]); + ASSERT_EQ(MatchRecord(38, 204), c.matches[7]); + ASSERT_EQ(MatchRecord(39, 204), c.matches[8]); + ASSERT_EQ(MatchRecord(48, 205), c.matches[9]); + ASSERT_EQ(MatchRecord(48, 1002), c.matches[10]); + ASSERT_EQ(MatchRecord(53, 202), c.matches[11]); + ASSERT_EQ(MatchRecord(53, 1002), c.matches[12]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, SingleCombQuietSub2) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abbdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "(201 | 202 & 203) & (!204 | 205)"}; + unsigned flags[] = {0, HS_FLAG_QUIET, HS_FLAG_QUIET, 0, HS_FLAG_QUIET, + HS_FLAG_COMBINATION}; + unsigned ids[] = {201, 202, 203, 204, 205, 1002}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(8U, c.matches.size()); + ASSERT_EQ(MatchRecord(18, 1002), c.matches[0]); + ASSERT_EQ(MatchRecord(21, 201), c.matches[1]); + ASSERT_EQ(MatchRecord(21, 1002), c.matches[2]); + ASSERT_EQ(MatchRecord(25, 1002), c.matches[3]); + ASSERT_EQ(MatchRecord(38, 204), c.matches[4]); + ASSERT_EQ(MatchRecord(39, 204), c.matches[5]); + ASSERT_EQ(MatchRecord(48, 1002), c.matches[6]); + ASSERT_EQ(MatchRecord(53, 1002), c.matches[7]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, SingleComb3) { + hs_database_t *db = nullptr; + 
hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abcijklndefxxfoobarrrghabcxdefxteakettleeeeexxxxijklnxxdef"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "((301 | 302) & 303) & (304 | 305)"}; + unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION}; + unsigned ids[] = {301, 302, 303, 304, 305, 1003}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(17U, c.matches.size()); + ASSERT_EQ(MatchRecord(3, 301), c.matches[0]); + ASSERT_EQ(MatchRecord(8, 305), c.matches[1]); + ASSERT_EQ(MatchRecord(11, 302), c.matches[2]); + ASSERT_EQ(MatchRecord(23, 303), c.matches[3]); + ASSERT_EQ(MatchRecord(23, 1003), c.matches[4]); + ASSERT_EQ(MatchRecord(26, 301), c.matches[5]); + ASSERT_EQ(MatchRecord(26, 1003), c.matches[6]); + ASSERT_EQ(MatchRecord(30, 302), c.matches[7]); + ASSERT_EQ(MatchRecord(30, 1003), c.matches[8]); + ASSERT_EQ(MatchRecord(43, 304), c.matches[9]); + ASSERT_EQ(MatchRecord(43, 1003), c.matches[10]); + ASSERT_EQ(MatchRecord(44, 304), c.matches[11]); + ASSERT_EQ(MatchRecord(44, 1003), c.matches[12]); + ASSERT_EQ(MatchRecord(53, 305), c.matches[13]); + ASSERT_EQ(MatchRecord(53, 1003), c.matches[14]); + ASSERT_EQ(MatchRecord(58, 302), c.matches[15]); + ASSERT_EQ(MatchRecord(58, 1003), c.matches[16]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, SingleCombQuietSub3) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = 
"abcijklndefxxfoobarrrghabcxdefxteakettleeeeexxxxijklnxxdef"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "((301 | 302) & 303) & (304 | 305)"}; + unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, 0, HS_FLAG_QUIET, + HS_FLAG_QUIET, HS_FLAG_COMBINATION}; + unsigned ids[] = {301, 302, 303, 304, 305, 1003}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(8U, c.matches.size()); + ASSERT_EQ(MatchRecord(23, 303), c.matches[0]); + ASSERT_EQ(MatchRecord(23, 1003), c.matches[1]); + ASSERT_EQ(MatchRecord(26, 1003), c.matches[2]); + ASSERT_EQ(MatchRecord(30, 1003), c.matches[3]); + ASSERT_EQ(MatchRecord(43, 1003), c.matches[4]); + ASSERT_EQ(MatchRecord(44, 1003), c.matches[5]); + ASSERT_EQ(MatchRecord(53, 1003), c.matches[6]); + ASSERT_EQ(MatchRecord(58, 1003), c.matches[7]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, MultiCombDupSub4) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abbdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "(201 & 202 & 203) | (204 & !205)", + "(201 | 202 & 203) & (!204 | 205)", + "((201 | 202) & 203) & (204 | 205)"}; + unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION, + HS_FLAG_COMBINATION, HS_FLAG_COMBINATION}; + unsigned ids[] = {201, 202, 203, 204, 205, 1001, 1002, 1003}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 8, HS_MODE_NOSTREAM, + nullptr, &db, 
&compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(23U, c.matches.size()); + ASSERT_EQ(MatchRecord(6, 202), c.matches[0]); + ASSERT_EQ(MatchRecord(18, 203), c.matches[1]); + ASSERT_EQ(MatchRecord(18, 1002), c.matches[2]); + ASSERT_EQ(MatchRecord(21, 201), c.matches[3]); + ASSERT_EQ(MatchRecord(21, 1001), c.matches[4]); + ASSERT_EQ(MatchRecord(21, 1002), c.matches[5]); + ASSERT_EQ(MatchRecord(25, 202), c.matches[6]); + ASSERT_EQ(MatchRecord(25, 1001), c.matches[7]); + ASSERT_EQ(MatchRecord(25, 1002), c.matches[8]); + ASSERT_EQ(MatchRecord(38, 204), c.matches[9]); + ASSERT_EQ(MatchRecord(38, 1001), c.matches[10]); + ASSERT_EQ(MatchRecord(38, 1003), c.matches[11]); + ASSERT_EQ(MatchRecord(39, 204), c.matches[12]); + ASSERT_EQ(MatchRecord(39, 1001), c.matches[13]); + ASSERT_EQ(MatchRecord(39, 1003), c.matches[14]); + ASSERT_EQ(MatchRecord(48, 205), c.matches[15]); + ASSERT_EQ(MatchRecord(48, 1001), c.matches[16]); + ASSERT_EQ(MatchRecord(48, 1002), c.matches[17]); + ASSERT_EQ(MatchRecord(48, 1003), c.matches[18]); + ASSERT_EQ(MatchRecord(53, 202), c.matches[19]); + ASSERT_EQ(MatchRecord(53, 1001), c.matches[20]); + ASSERT_EQ(MatchRecord(53, 1002), c.matches[21]); + ASSERT_EQ(MatchRecord(53, 1003), c.matches[22]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, MultiCombQuietDupSub4) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abbdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "(201 & 202 & 203) | (204 & !205)", + "(201 | 202 & 203) 
& (!204 | 205)", + "((201 | 202) & 203) & (204 | 205)"}; + unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET, 0, + HS_FLAG_QUIET, HS_FLAG_COMBINATION, + HS_FLAG_COMBINATION, HS_FLAG_COMBINATION}; + unsigned ids[] = {201, 202, 203, 204, 205, 1001, 1002, 1003}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 8, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(17U, c.matches.size()); + ASSERT_EQ(MatchRecord(18, 1002), c.matches[0]); + ASSERT_EQ(MatchRecord(21, 1001), c.matches[1]); + ASSERT_EQ(MatchRecord(21, 1002), c.matches[2]); + ASSERT_EQ(MatchRecord(25, 1001), c.matches[3]); + ASSERT_EQ(MatchRecord(25, 1002), c.matches[4]); + ASSERT_EQ(MatchRecord(38, 204), c.matches[5]); + ASSERT_EQ(MatchRecord(38, 1001), c.matches[6]); + ASSERT_EQ(MatchRecord(38, 1003), c.matches[7]); + ASSERT_EQ(MatchRecord(39, 204), c.matches[8]); + ASSERT_EQ(MatchRecord(39, 1001), c.matches[9]); + ASSERT_EQ(MatchRecord(39, 1003), c.matches[10]); + ASSERT_EQ(MatchRecord(48, 1001), c.matches[11]); + ASSERT_EQ(MatchRecord(48, 1002), c.matches[12]); + ASSERT_EQ(MatchRecord(48, 1003), c.matches[13]); + ASSERT_EQ(MatchRecord(53, 1001), c.matches[14]); + ASSERT_EQ(MatchRecord(53, 1002), c.matches[15]); + ASSERT_EQ(MatchRecord(53, 1003), c.matches[16]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, MultiCombUniSub5) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abcdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef" + "-----------------------------------------------" + 
"cbbfedxxgoogleeecncbaxfedxhaystacksssssxxxxijkloxxfed" + "-----------------------------------------------" + "cabijklRfeexxgoobarrrjpcabxfeexshockwaveeeeexxxxijklsxxfee" + "------------------------------------------"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "cba", "fed", "google.*cn", + "haystacks{4,8}", "ijkl[oOp]", "cab", "fee", + "goobar.*jp", "shockwave{4,6}", "ijkl[rRs]", + "(101 & 102 & 103) | (104 & !105)", + "(201 | 202 & 203) & (!204 | 205)", + "((301 | 302) & 303) & (304 | 305)"}; + unsigned flags[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + HS_FLAG_COMBINATION, HS_FLAG_COMBINATION, + HS_FLAG_COMBINATION}; + unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301, + 302, 303, 304, 305, 1001, 1002, 1003}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 18, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(46U, c.matches.size()); + ASSERT_EQ(MatchRecord(3, 101), c.matches[0]); + ASSERT_EQ(MatchRecord(6, 102), c.matches[1]); + ASSERT_EQ(MatchRecord(18, 103), c.matches[2]); + ASSERT_EQ(MatchRecord(18, 1001), c.matches[3]); + ASSERT_EQ(MatchRecord(21, 101), c.matches[4]); + ASSERT_EQ(MatchRecord(21, 1001), c.matches[5]); + ASSERT_EQ(MatchRecord(25, 102), c.matches[6]); + ASSERT_EQ(MatchRecord(25, 1001), c.matches[7]); + ASSERT_EQ(MatchRecord(38, 104), c.matches[8]); + ASSERT_EQ(MatchRecord(38, 1001), c.matches[9]); + ASSERT_EQ(MatchRecord(39, 104), c.matches[10]); + ASSERT_EQ(MatchRecord(39, 1001), c.matches[11]); + ASSERT_EQ(MatchRecord(48, 105), c.matches[12]); + ASSERT_EQ(MatchRecord(48, 1001), c.matches[13]); + ASSERT_EQ(MatchRecord(53, 102), 
c.matches[14]); + ASSERT_EQ(MatchRecord(53, 1001), c.matches[15]); + ASSERT_EQ(MatchRecord(106, 202), c.matches[16]); + ASSERT_EQ(MatchRecord(118, 203), c.matches[17]); + ASSERT_EQ(MatchRecord(118, 1002), c.matches[18]); + ASSERT_EQ(MatchRecord(121, 201), c.matches[19]); + ASSERT_EQ(MatchRecord(121, 1002), c.matches[20]); + ASSERT_EQ(MatchRecord(125, 202), c.matches[21]); + ASSERT_EQ(MatchRecord(125, 1002), c.matches[22]); + ASSERT_EQ(MatchRecord(138, 204), c.matches[23]); + ASSERT_EQ(MatchRecord(139, 204), c.matches[24]); + ASSERT_EQ(MatchRecord(148, 205), c.matches[25]); + ASSERT_EQ(MatchRecord(148, 1002), c.matches[26]); + ASSERT_EQ(MatchRecord(153, 202), c.matches[27]); + ASSERT_EQ(MatchRecord(153, 1002), c.matches[28]); + ASSERT_EQ(MatchRecord(203, 301), c.matches[29]); + ASSERT_EQ(MatchRecord(208, 305), c.matches[30]); + ASSERT_EQ(MatchRecord(211, 302), c.matches[31]); + ASSERT_EQ(MatchRecord(223, 303), c.matches[32]); + ASSERT_EQ(MatchRecord(223, 1003), c.matches[33]); + ASSERT_EQ(MatchRecord(226, 301), c.matches[34]); + ASSERT_EQ(MatchRecord(226, 1003), c.matches[35]); + ASSERT_EQ(MatchRecord(230, 302), c.matches[36]); + ASSERT_EQ(MatchRecord(230, 1003), c.matches[37]); + ASSERT_EQ(MatchRecord(243, 304), c.matches[38]); + ASSERT_EQ(MatchRecord(243, 1003), c.matches[39]); + ASSERT_EQ(MatchRecord(244, 304), c.matches[40]); + ASSERT_EQ(MatchRecord(244, 1003), c.matches[41]); + ASSERT_EQ(MatchRecord(253, 305), c.matches[42]); + ASSERT_EQ(MatchRecord(253, 1003), c.matches[43]); + ASSERT_EQ(MatchRecord(258, 302), c.matches[44]); + ASSERT_EQ(MatchRecord(258, 1003), c.matches[45]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, MultiCombQuietUniSub5) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "abcdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef" + "-----------------------------------------------" + 
"cbbfedxxgoogleeecncbaxfedxhaystacksssssxxxxijkloxxfed" + "-----------------------------------------------" + "cabijklRfeexxgoobarrrjpcabxfeexshockwaveeeeexxxxijklsxxfee" + "------------------------------------------"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "cba", "fed", "google.*cn", + "haystacks{4,8}", "ijkl[oOp]", "cab", "fee", + "goobar.*jp", "shockwave{4,6}", "ijkl[rRs]", + "(101 & 102 & 103) | (104 & !105)", + "(201 | 202 & 203) & (!204 | 205)", + "((301 | 302) & 303) & (304 | 305)"}; + unsigned flags[] = {0, HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET, 0, + HS_FLAG_QUIET, 0, HS_FLAG_QUIET, 0, HS_FLAG_QUIET, + HS_FLAG_QUIET, HS_FLAG_QUIET, 0, HS_FLAG_QUIET, 0, + HS_FLAG_COMBINATION, HS_FLAG_COMBINATION, + HS_FLAG_COMBINATION}; + unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301, + 302, 303, 304, 305, 1001, 1002, 1003}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 18, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(30U, c.matches.size()); + ASSERT_EQ(MatchRecord(3, 101), c.matches[0]); + ASSERT_EQ(MatchRecord(18, 1001), c.matches[1]); + ASSERT_EQ(MatchRecord(21, 101), c.matches[2]); + ASSERT_EQ(MatchRecord(21, 1001), c.matches[3]); + ASSERT_EQ(MatchRecord(25, 1001), c.matches[4]); + ASSERT_EQ(MatchRecord(38, 1001), c.matches[5]); + ASSERT_EQ(MatchRecord(39, 1001), c.matches[6]); + ASSERT_EQ(MatchRecord(48, 105), c.matches[7]); + ASSERT_EQ(MatchRecord(48, 1001), c.matches[8]); + ASSERT_EQ(MatchRecord(53, 1001), c.matches[9]); + ASSERT_EQ(MatchRecord(106, 202), c.matches[10]); + ASSERT_EQ(MatchRecord(118, 1002), c.matches[11]); + 
ASSERT_EQ(MatchRecord(121, 1002), c.matches[12]); + ASSERT_EQ(MatchRecord(125, 202), c.matches[13]); + ASSERT_EQ(MatchRecord(125, 1002), c.matches[14]); + ASSERT_EQ(MatchRecord(138, 204), c.matches[15]); + ASSERT_EQ(MatchRecord(139, 204), c.matches[16]); + ASSERT_EQ(MatchRecord(148, 1002), c.matches[17]); + ASSERT_EQ(MatchRecord(153, 202), c.matches[18]); + ASSERT_EQ(MatchRecord(153, 1002), c.matches[19]); + ASSERT_EQ(MatchRecord(208, 305), c.matches[20]); + ASSERT_EQ(MatchRecord(223, 303), c.matches[21]); + ASSERT_EQ(MatchRecord(223, 1003), c.matches[22]); + ASSERT_EQ(MatchRecord(226, 1003), c.matches[23]); + ASSERT_EQ(MatchRecord(230, 1003), c.matches[24]); + ASSERT_EQ(MatchRecord(243, 1003), c.matches[25]); + ASSERT_EQ(MatchRecord(244, 1003), c.matches[26]); + ASSERT_EQ(MatchRecord(253, 305), c.matches[27]); + ASSERT_EQ(MatchRecord(253, 1003), c.matches[28]); + ASSERT_EQ(MatchRecord(258, 1003), c.matches[29]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} diff --git a/unit/internal/graph_undirected.cpp b/unit/internal/graph_undirected.cpp new file mode 100644 index 000000000..babc01a6a --- /dev/null +++ b/unit/internal/graph_undirected.cpp @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "gtest/gtest.h" +#include "util/container.h" +#include "util/graph.h" +#include "util/graph_range.h" +#include "util/graph_undirected.h" +#include "util/ue2_graph.h" + +#include + +using namespace std; +using namespace ue2; + +struct SimpleV { + size_t index; + string test_v = "SimpleV"; +}; + +struct SimpleE { + size_t index; + string test_e = "SimpleE"; +}; + +struct SimpleG : public ue2_graph {}; + +using SimpleVertex = SimpleG::vertex_descriptor; + +template +vector to_indices(const Range &range, const Graph &g) { + vector indices; + for (const auto &elem : range) { + indices.push_back(g[elem].index); + } + sort(indices.begin(), indices.end()); + return indices; +} + +template +vector to_indices(const std::initializer_list &range, + const Graph &g) { + vector indices; + for (const auto &elem : range) { + indices.push_back(g[elem].index); + } + sort(indices.begin(), indices.end()); + return indices; +} + +TEST(graph_undirected, simple_ue2_graph) { + SimpleG g; + auto a = add_vertex(g); + ASSERT_NE(SimpleG::null_vertex(), a); + auto b = add_vertex(g); + ASSERT_NE(SimpleG::null_vertex(), b); + auto c = add_vertex(g); + 
ASSERT_NE(SimpleG::null_vertex(), c); + + add_edge(a, b, g); + add_edge(b, a, g); + add_edge(a, c, g); + add_edge(c, b, g); + add_edge(c, c, g); + + auto ug = make_undirected_graph(g); + + ASSERT_EQ(3, num_vertices(ug)); + ASSERT_EQ(4, num_edges(ug)); + + // Check adjacencies + + ASSERT_EQ(2, out_degree(a, ug)); + ASSERT_EQ(to_indices({b, c}, ug), + to_indices(adjacent_vertices_range(a, ug), ug)); + + ASSERT_EQ(2, out_degree(b, ug)); + ASSERT_EQ(to_indices({a, c}, ug), + to_indices(adjacent_vertices_range(b, ug), ug)); + + ASSERT_EQ(3, out_degree(c, ug)); + ASSERT_EQ(to_indices({a, b, c}, ug), + to_indices(adjacent_vertices_range(c, ug), ug)); + + ASSERT_EQ(2, in_degree(b, ug)); + ASSERT_EQ(to_indices({a, c}, ug), + to_indices(inv_adjacent_vertices_range(b, ug), ug)); + + // Test reverse edge existence + + ASSERT_TRUE(edge(a, b, ug).second); + ASSERT_TRUE(edge(b, a, ug).second); + ASSERT_TRUE(edge(a, c, ug).second); + ASSERT_TRUE(edge(c, a, ug).second); // (a,c) actually exists + ASSERT_TRUE(edge(b, c, ug).second); // (c,b) actually exists + ASSERT_FALSE(edge(a, a, ug).second); + + // Vertex properties + + g[c].test_v = "vertex c"; + ASSERT_EQ("vertex c", ug[c].test_v); + ASSERT_EQ("vertex c", get(&SimpleV::test_v, ug, c)); + + ug[c].test_v = "vertex c again"; + ASSERT_EQ("vertex c again", g[c].test_v); + ASSERT_EQ("vertex c again", get(&SimpleV::test_v, g, c)); + + put(&SimpleV::test_v, ug, c, "vertex c once more"); + ASSERT_EQ("vertex c once more", g[c].test_v); + + const auto &vprops1 = ug[b]; + ASSERT_EQ(1, vprops1.index); + + const auto &vprops2 = get(boost::vertex_all, ug, b); + ASSERT_EQ(1, vprops2.index); + + // Edge Properties + + auto edge_undirected = edge(a, b, ug).first; + ug[edge_undirected].test_e = "edge (a,b)"; + ASSERT_EQ("edge (a,b)", ug[edge_undirected].test_e); + ASSERT_EQ("edge (a,b)", get(&SimpleE::test_e, ug, edge_undirected)); + + ug[edge_undirected].test_e = "edge (a,b) again"; + put(&SimpleE::test_e, ug, edge_undirected, "edge (a,b) once 
more"); +} + +TEST(graph_undirected, simple_adjacency_list) { + using AdjListG = + boost::adjacency_list; + + AdjListG g; + auto a = add_vertex(g); + ASSERT_NE(AdjListG::null_vertex(), a); + g[a].index = 0; + auto b = add_vertex(g); + ASSERT_NE(AdjListG::null_vertex(), b); + g[b].index = 1; + auto c = add_vertex(g); + ASSERT_NE(AdjListG::null_vertex(), c); + g[c].index = 2; + + add_edge(a, b, g); + add_edge(b, a, g); + add_edge(a, c, g); + add_edge(c, b, g); + add_edge(c, c, g); + + auto ug = make_undirected_graph(g); + + ASSERT_EQ(3, num_vertices(ug)); + ASSERT_EQ(4, num_edges(ug)); + + // Check adjacencies + + ASSERT_EQ(2, out_degree(a, ug)); + ASSERT_EQ(to_indices({b, c}, ug), + to_indices(adjacent_vertices_range(a, ug), ug)); + + ASSERT_EQ(2, out_degree(b, ug)); + ASSERT_EQ(to_indices({a, c}, ug), + to_indices(adjacent_vertices_range(b, ug), ug)); + + ASSERT_EQ(3, out_degree(c, ug)); + ASSERT_EQ(to_indices({a, b, c}, ug), + to_indices(adjacent_vertices_range(c, ug), ug)); + + ASSERT_EQ(2, in_degree(b, ug)); + ASSERT_EQ(to_indices({a, c}, ug), + to_indices(inv_adjacent_vertices_range(b, ug), ug)); + + // Test reverse edge existence + + ASSERT_TRUE(edge(a, b, ug).second); + ASSERT_TRUE(edge(b, a, ug).second); + ASSERT_TRUE(edge(a, c, ug).second); + ASSERT_TRUE(edge(c, a, ug).second); // (a,c) actually exists + ASSERT_TRUE(edge(b, c, ug).second); // (c,b) actually exists + ASSERT_FALSE(edge(a, a, ug).second); + + // Vertex properties + + g[c].test_v = "vertex c"; + ASSERT_EQ("vertex c", ug[c].test_v); + ASSERT_EQ("vertex c", get(&SimpleV::test_v, ug, c)); + + ug[c].test_v = "vertex c again"; + ASSERT_EQ("vertex c again", g[c].test_v); + ASSERT_EQ("vertex c again", get(&SimpleV::test_v, g, c)); + + put(&SimpleV::test_v, ug, c, "vertex c once more"); + ASSERT_EQ("vertex c once more", g[c].test_v); + + const auto &vprops1 = ug[b]; + ASSERT_EQ(1, vprops1.index); + + const auto &vprops2 = get(boost::vertex_all, ug, b); + ASSERT_EQ(1, vprops2.index); + + // Edge 
Properties + + auto edge_undirected = edge(a, b, ug).first; + ug[edge_undirected].test_e = "edge (a,b)"; + ASSERT_EQ("edge (a,b)", ug[edge_undirected].test_e); + ASSERT_EQ("edge (a,b)", get(&SimpleE::test_e, ug, edge_undirected)); + + ug[edge_undirected].test_e = "edge (a,b) again"; + put(&SimpleE::test_e, ug, edge_undirected, "edge (a,b) once more"); +} diff --git a/unit/internal/utf8_validate.cpp b/unit/internal/utf8_validate.cpp index f570e6b02..033579420 100644 --- a/unit/internal/utf8_validate.cpp +++ b/unit/internal/utf8_validate.cpp @@ -118,5 +118,5 @@ INSTANTIATE_TEST_CASE_P(ValidUtf8, ValidUtf8Test, ValuesIn(valid_utf8_tests)); TEST_P(ValidUtf8Test, check) { const auto &info = GetParam(); SCOPED_TRACE(testing::Message() << "String is: " << printable(info.str)); - ASSERT_EQ(info.is_valid, isValidUtf8(info.str.c_str())); + ASSERT_EQ(info.is_valid, isValidUtf8(info.str.c_str(), info.str.size())); } diff --git a/util/ExpressionParser.rl b/util/ExpressionParser.rl index 94d035080..fec479229 100644 --- a/util/ExpressionParser.rl +++ b/util/ExpressionParser.rl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -76,6 +76,8 @@ enum ParamKey { case '8': *flags |= HS_FLAG_UTF8; break; case 'P': *flags |= HS_FLAG_PREFILTER; break; case 'L': *flags |= HS_FLAG_SOM_LEFTMOST; break; + case 'C': *flags |= HS_FLAG_COMBINATION; break; + case 'Q': *flags |= HS_FLAG_QUIET; break; default: fbreak; } } @@ -159,7 +161,7 @@ bool HS_CDECL readExpression(const std::string &input, std::string &expr, enum ParamKey key = PARAM_NONE; %%{ - single_flag = [ismW8HPLVO]; + single_flag = [ismW8HPLVOCQ]; param = ('min_offset' @{ key = PARAM_MIN_OFFSET; } | 'max_offset' @{ key = PARAM_MAX_OFFSET; } | 'min_length' @{ key = PARAM_MIN_LENGTH; } | diff --git a/util/win_getopt.h 
b/util/win_getopt.h new file mode 100644 index 000000000..7ec9abfbc --- /dev/null +++ b/util/win_getopt.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef WIN_GETOPT_H +#define WIN_GETOPT_H + +#include +#define ILLEGAL (int)'?' 
+#define END -1
+#define SPECIAL_OPT 1
+
+// Index of the next element of the argument vector to examine. A value of 0
+// means parsing has not started yet; getopt_long() bumps it to 1 on entry.
+// NOTE(review): optind and optarg are external (non-static) definitions in a
+// header, so including this file from more than one translation unit of the
+// same binary would define them twice -- confirm it is included only once.
+int optind = 0;
+// Argument of the option returned last (or the non-option argument itself
+// when SPECIAL_OPT is returned); NULL when the option carries no argument.
+char *optarg;
+// Empty string that the scratch cursor is parked on between calls.
+static char EMPT[] = "";
+// Scratch cursor into the argument currently being parsed.
+static char *ptr = EMPT;
+// Values accepted in option::has_arg.
+static int no_argument = 0;
+static int required_argument = 1;
+// Diagnostic format strings passed to warn().
+static const char no_arg[] = "option doesn't take an argument --%.*s";
+static const char non_opt_string[] = "not an option : %s";
+static const char ill_shortopt_char[] = "unknown option -%c";
+static const char ill_longopt_string[] = "unknown option --%s";
+static const char req_arg_string[] = "option requires an argument --%s";
+static const char req_arg_char[] = "option requires an argument -%c";
+
+// Description of one long option. The table handed to getopt_long() is
+// terminated by an entry whose name is NULL.
+struct option {
+    const char *name; // option name without the leading "--"
+    int has_arg;      // no_argument or required_argument
+    int *flag;        // if non-NULL, receives 'value' and getopt_long returns 0
+    int value;        // value returned (or stored through 'flag') on a match
+};
+
+// Print a printf-style diagnostic followed by a newline. Output goes to
+// stdout, not stderr.
+static
+void warn(const char *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    vfprintf(stdout, fmt, args);
+    fprintf(stdout, "\n");
+    va_end(args);
+}
+
+// Minimal getopt_long() replacement. Each call parses the argument at
+// nargv[optind].
+//
+//   nargc, nargv  - argument count and vector, as passed to main()
+//   options       - short option characters; a character followed by ':'
+//                   takes an argument. A leading '-' makes non-option
+//                   arguments come back as SPECIAL_OPT with optarg set.
+//   long_options  - table of long options ending with a NULL name; may be
+//                   NULL, in which case "--name" arguments are rejected
+//   idx           - if non-NULL, receives the index into long_options of the
+//                   long option that was matched successfully
+//
+// Returns the short option character, the long option's 'value' (or 0 when
+// its 'flag' pointer is set), SPECIAL_OPT for a non-option argument in '-'
+// mode, END when the arguments are exhausted or "--"/"-" is met, and ILLEGAL
+// on any error. Only single short options ("-x") are recognised; grouped
+// short options ("-xyz") are reported as errors.
+//
+// optind is NOT advanced when an unrecognised argument is reported, so
+// callers should stop parsing once ILLEGAL is returned instead of looping.
+int getopt_long(int nargc, char *const *nargv, const char *options,
+                const struct option *long_options, int *idx) {
+    char *check, *equal;
+    size_t current_opt_len;
+    bool all_flag = false;
+    int match = -1;
+    // illegal
+    if (options == NULL) {
+        return ILLEGAL;
+    }
+    if (optind == 0) {
+        optind = 1;
+    }
+    if (optind >= nargc) {
+        return END;
+    }
+    // a leading '-' asks for non-option arguments to be handed back
+    if (*options == '-') {
+        all_flag = true;
+        ++options;
+    }
+    optarg = NULL;
+    // argument does not start with '-': not an option at all
+    if (*(ptr = nargv[optind]) != '-') {
+        ptr = EMPT;
+        if (all_flag) {
+            optarg = nargv[optind++];
+            return SPECIAL_OPT;
+        } else {
+            warn(non_opt_string, nargv[optind]);
+            return ILLEGAL;
+        }
+    }
+    // likely a short option ? (exactly one character after the '-')
+    if (ptr[1] != '\0' && *++ptr != '-' && ptr[1] == '\0') {
+        char opt_char = *ptr;
+        ptr = EMPT;
+        // really short option ?
+        if ((check = (char *)strchr(options, opt_char)) != NULL) {
+            if (check[1] == ':') {
+                // the argument is the next element of nargv
+                ++optind;
+                if (optind >= nargc) {
+                    warn(req_arg_char, opt_char);
+                    return ILLEGAL;
+                } else {
+                    optarg = nargv[optind];
+                }
+            }
+            ++optind;
+            return opt_char;
+        } else { // illegal
+            warn(ill_shortopt_char, opt_char);
+            return ILLEGAL;
+        }
+    }
+    // we meet '--' (a lone '-' also ends up here)
+    if (*ptr == '-' && ptr[1] == '\0') {
+        ptr = EMPT;
+        return END;
+    }
+    // we meet '--foo' , long option
+    if (long_options != NULL && *ptr == '-' && ptr[1] != '\0') {
+        ++ptr;
+        if ((equal = strchr(ptr, '=')) != NULL) {
+            // found --option=arg
+            current_opt_len = equal - ptr;
+            ++equal;
+        } else {
+            current_opt_len = strlen(ptr);
+        }
+        // Compare only the name part (the text before any '='); comparing
+        // the whole argument would make "--option=arg" never match.
+        for (int i = 0; long_options[i].name; i++) {
+            const char *name = long_options[i].name;
+            if (strlen(name) == current_opt_len &&
+                !strncmp(ptr, name, current_opt_len)) {
+                match = i;
+                break;
+            }
+        }
+        if (match == -1) { // don't match
+            warn(ill_longopt_string, ptr);
+            ptr = EMPT;
+            return ILLEGAL;
+        } else {
+            ++optind;
+            if (long_options[match].has_arg == required_argument) {
+                if (equal) {
+                    // argument supplied inline as --option=arg
+                    optarg = equal;
+                } else if (optind < nargc) {
+                    // argument is the next element of nargv
+                    optarg = nargv[optind++];
+                } else {
+                    warn(req_arg_string, ptr);
+                    ptr = EMPT;
+                    return ILLEGAL;
+                }
+            }
+            if (long_options[match].has_arg == no_argument && equal) {
+                warn(no_arg, (int)current_opt_len, ptr);
+                ptr = EMPT;
+                return ILLEGAL;
+            }
+            ptr = EMPT;
+            // report which table entry matched, if the caller asked for it
+            if (idx != NULL) {
+                *idx = match;
+            }
+            if (long_options[match].flag) {
+                *long_options[match].flag = long_options[match].value;
+                return 0;
+            } else {
+                return (long_options[match].value);
+            }
+        }
+    }
+    warn(non_opt_string, ptr);
+    ptr = EMPT;
+    return ILLEGAL;
+}
+
+#endif