diff --git a/CHANGELOG.md b/CHANGELOG.md index ea44debe9..19a92b909 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,29 @@ This is a list of notable changes to Hyperscan, in reverse chronological order. +## [5.3.0] 2020-05-15 +- Improvement on literal matcher "Teddy" performance, including support for + Intel(R) AVX-512 Vector Byte Manipulation Instructions (Intel(R) AVX-512 + VBMI). +- Improvement on single-byte/two-byte matching performance, including support + for Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512). +- hsbench: add hyphen support for -T option. +- tools/fuzz: add test scripts for synthetic pattern generation. +- Bugfix for acceleration path analysis in LimEx NFA. +- Bugfix for duplicate matches for Small-write engine. +- Bugfix for UTF8 checking problem for hscollider. +- Bugfix for issue #205: avoid crash of `hs_compile_lit_multi()` with clang and + ASAN. +- Bugfix for issue #211: fix error in `db_check_platform()` function. +- Bugfix for issue #217: fix cmake parsing issue of CPU arch for non-English + locale. +- Bugfix for issue #228: avoid undefined behavior when calling `close()` after + `fdopendir()` in `loadExpressions()`. +- Bugfix for issue #239: fix hyperscan compile issue under gcc-10. +- Add VLAN packets processing capability in pcap analysis script. (#214) +- Avoid extra convert instruction for "Noodle". (#221) +- Add Hyperscan version macro in `hs.h`. (#222) + ## [5.2.1] 2019-10-13 - Bugfix for issue #186: fix compile issue when `BUILD_SHARED_LIBS` is on in release mode. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index 83197af1e..59c6e6e2f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,8 @@ cmake_minimum_required (VERSION 2.8.11) project (hyperscan C CXX) set (HS_MAJOR_VERSION 5) -set (HS_MINOR_VERSION 2) -set (HS_PATCH_VERSION 1) +set (HS_MINOR_VERSION 3) +set (HS_PATCH_VERSION 0) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) @@ -187,9 +187,9 @@ else() set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} OUTPUT_VARIABLE _GCC_OUTPUT) - string(FIND "${_GCC_OUTPUT}" "Known" POS) - string(SUBSTRING "${_GCC_OUTPUT}" 0 ${POS} _GCC_OUTPUT) - string(REGEX REPLACE ".*march=[ \t]*([^ \n]*)[ \n].*" "\\1" + string(FIND "${_GCC_OUTPUT}" "march" POS) + string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) + string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}") # test the parsed flag @@ -326,7 +326,7 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") set (FAT_RUNTIME_REQUISITES TRUE) endif() endif() - CMAKE_DEPENDENT_OPTION(FAT_RUNTIME "Build a library that supports multiple microarchitecures" ${RELEASE_BUILD} "FAT_RUNTIME_REQUISITES" OFF) + CMAKE_DEPENDENT_OPTION(FAT_RUNTIME "Build a library that supports multiple microarchitectures" ${RELEASE_BUILD} "FAT_RUNTIME_REQUISITES" OFF) endif () include (${CMAKE_MODULE_PATH}/arch.cmake) @@ -340,7 +340,7 @@ if (NOT WIN32) set(C_FLAGS_TO_CHECK # Variable length arrays are way bad, most especially at run time "-Wvla" -# Pointer arith on void pointers is doing it wong. +# Pointer arith on void pointers is doing it wrong. "-Wpointer-arith" # Build our C code with -Wstrict-prototypes -Wmissing-prototypes "-Wstrict-prototypes" @@ -383,7 +383,7 @@ if (CC_PAREN_EQUALITY) set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-parentheses-equality") endif() -# clang compains about unused const vars in our Ragel-generated code. 
+# clang complains about unused const vars in our Ragel-generated code. CHECK_CXX_COMPILER_FLAG("-Wunused-const-variable" CXX_UNUSED_CONST_VAR) if (CXX_UNUSED_CONST_VAR) set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-const-variable") @@ -418,6 +418,12 @@ CHECK_CXX_COMPILER_FLAG("-Wunused-local-typedefs" CXX_UNUSED_LOCAL_TYPEDEFS) # gcc5 complains about this CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE) +# gcc 10 complains about this +CHECK_C_COMPILER_FLAG("-Wstringop-overflow" CC_STRINGOP_OVERFLOW) +if(CC_STRINGOP_OVERFLOW) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-stringop-overflow") +endif() + endif() include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 5be258aa9..cced49c69 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -58,6 +58,18 @@ int main(){ (void)_mm512_abs_epi8(z); }" HAVE_AVX512) +# and now for AVX512VBMI +CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +#if !defined(__AVX512VBMI__) +#error no avx512vbmi +#endif + +int main(){ + __m512i a = _mm512_set1_epi8(0xFF); + __m512i idx = _mm512_set_epi64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + (void)_mm512_permutexvar_epi8(idx, a); +}" HAVE_AVX512VBMI) + if (FAT_RUNTIME) if (NOT HAVE_SSSE3) message(FATAL_ERROR "SSSE3 support required to build fat runtime") diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst index 93290467b..205b7348b 100644 --- a/doc/dev-reference/compilation.rst +++ b/doc/dev-reference/compilation.rst @@ -55,7 +55,7 @@ Hyperscan provides support for targeting a database at a particular CPU platform; see :ref:`instr_specialization` for details. ===================== -Compile Pure Literals +Compile Pure Literals ===================== Pure literal is a special case of regular expression. 
A character sequence is @@ -75,12 +75,12 @@ characters exist in regular grammer like ``[``, ``]``, ``(``, ``)``, ``{``, While in pure literal case, all these meta characters lost extra meanings expect for that they are just common ASCII codes. -Hyperscan is initially designed to process common regualr expressions. It is +Hyperscan is initially designed to process common regular expressions. It is hence embedded with a complex parser to do comprehensive regular grammer interpretion. Particularly, the identification of above meta characters is the basic step for the interpretion of far more complex regular grammers. -However in real cases, patterns may not always be regualr expressions. They +However in real cases, patterns may not always be regular expressions. They could just be pure literals. Problem will come if the pure literals contain regular meta characters. Supposing fed directly into traditional Hyperscan compile API, all these meta characters will be interpreted in predefined ways, @@ -98,7 +98,7 @@ In ``v5.2.0``, Hyperscan introduces 2 new compile APIs for pure literal patterns #. :c:func:`hs_compile_lit_multi`: compiles an array of pure literals into a pattern database. All of the supplied patterns will be scanned for concurrently at scan time, with user-supplied identifiers returned when they - match. + match. These 2 APIs are designed for use cases where all patterns contained in the target rule set are pure literals. Users can pass the initial pure literal @@ -110,8 +110,8 @@ Hyperscan needs to locate the end position of the input expression via clearly knowing each literal's length, not by simply identifying character ``\0`` of a string. -Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_MULTILINE`, -:c:member:`HS_FLAG_SINGLEMATCH`, :c:member:`HS_FLAG_SOM_LEFTMOST`. +Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_SINGLEMATCH`, +:c:member:`HS_FLAG_SOM_LEFTMOST`. .. 
note:: We don't support literal compilation API with :ref:`extparam`. And for runtime implementation, traditional runtime APIs can still be diff --git a/doc/dev-reference/getting_started.rst b/doc/dev-reference/getting_started.rst index 45d4fbbb7..b38128733 100644 --- a/doc/dev-reference/getting_started.rst +++ b/doc/dev-reference/getting_started.rst @@ -260,7 +260,7 @@ instead of potentially executing illegal instructions. The API function :c:func:`hs_valid_platform` can be used by application writers to determine if the current platform is supported by Hyperscan. -At of this release, the variants of the runtime that are built, and the CPU +As of this release, the variants of the runtime that are built, and the CPU capability that is required, are the following: +----------+-------------------------------+---------------------------+ diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index 3382ff421..666eefc9c 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -125,7 +125,7 @@ ParsedLitExpression::ParsedLitExpression(unsigned index_in, : expr(index_in, false, flags & HS_FLAG_SINGLEMATCH, false, false, SOM_NONE, report, 0, MAX_OFFSET, 0, 0, 0, false) { // For pure literal expression, below 'HS_FLAG_'s are unuseful: - // DOTALL/ALLOWEMPTY/UTF8/UCP/PREFILTER/COMBINATION/QUIET + // DOTALL/ALLOWEMPTY/UTF8/UCP/PREFILTER/COMBINATION/QUIET/MULTILINE if (flags & ~HS_FLAG_ALL) { DEBUG_PRINTF("Unrecognised flag, flags=%u.\n", flags); @@ -402,19 +402,18 @@ void addLitExpression(NG &ng, unsigned index, const char *expression, } // Ensure that our pattern isn't too long (in characters). 
- if (strlen(expression) > cc.grey.limitPatternLength) { + if (expLength > cc.grey.limitPatternLength) { throw CompileError("Pattern length exceeds limit."); } // filter out flags not supported by pure literal API. u64a not_supported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 | HS_FLAG_UCP | HS_FLAG_PREFILTER | HS_FLAG_COMBINATION | - HS_FLAG_QUIET; + HS_FLAG_QUIET | HS_FLAG_MULTILINE; if (flags & not_supported) { - throw CompileError("Only HS_FLAG_CASELESS, HS_FLAG_MULTILINE, " - "HS_FLAG_SINGLEMATCH and HS_FLAG_SOM_LEFTMOST are " - "supported in literal API."); + throw CompileError("Only HS_FLAG_CASELESS, HS_FLAG_SINGLEMATCH and " + "HS_FLAG_SOM_LEFTMOST are supported in literal API."); } // This expression must be a pure literal, we can build ue2_literal diff --git a/src/database.c b/src/database.c index dc03bf1fb..1a79800e2 100644 --- a/src/database.c +++ b/src/database.c @@ -114,8 +114,8 @@ hs_error_t HS_CDECL hs_serialize_database(const hs_database_t *db, char **bytes, static hs_error_t db_check_platform(const u64a p) { if (p != hs_current_platform - && p != hs_current_platform_no_avx2 - && p != hs_current_platform_no_avx512) { + && p != (hs_current_platform | hs_current_platform_no_avx2) + && p != (hs_current_platform | hs_current_platform_no_avx512)) { return HS_DB_PLATFORM_ERROR; } // passed all checks diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 0b3fe28f0..960e2a415 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -74,6 +74,30 @@ const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} }; +#if defined(HAVE_AVX512VBMI) // VBMI strong teddy + +#define CONF_CHUNK_64(chunk, bucket, off, reason, 
pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#else + #define CONF_CHUNK_64(chunk, bucket, off, reason, conf_fn) \ do { \ if (unlikely(chunk != ones_u64a)) { \ @@ -94,7 +118,284 @@ do { \ } \ } while(0) -#if defined(HAVE_AVX512) // AVX512 reinforced teddy +#endif + +#if defined(HAVE_AVX512VBMI) // VBMI strong teddy + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u64a part1 = movq(p128_0); \ + u64a part2 = movq(rshiftbyte_m128(p128_0, 8)); \ + u64a part3 = movq(p128_1); \ + u64a part4 = movq(rshiftbyte_m128(p128_1, 8)); \ + u64a part5 = movq(p128_2); \ + u64a part6 = movq(rshiftbyte_m128(p128_2, 8)); \ + u64a part7 = movq(p128_3); \ + u64a part8 = movq(rshiftbyte_m128(p128_3, 8)); \ + CONF_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_CHUNK_64(part2, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_CHUNK_64(part3, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_CHUNK_64(part4, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_CHUNK_64(part5, bucket, offset + 32, reason, pt, conf_fn); \ + CONF_CHUNK_64(part6, bucket, offset + 40, reason, pt, conf_fn); \ + CONF_CHUNK_64(part7, bucket, offset + 48, reason, pt, conf_fn); \ + CONF_CHUNK_64(part8, bucket, offset + 56, reason, pt, conf_fn); \ + } \ +} while(0) +#else +#define 
CONFIRM_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u32 part1 = movd(p128_0); \ + u32 part2 = movd(rshiftbyte_m128(p128_0, 4)); \ + u32 part3 = movd(rshiftbyte_m128(p128_0, 8)); \ + u32 part4 = movd(rshiftbyte_m128(p128_0, 12)); \ + u32 part5 = movd(p128_1); \ + u32 part6 = movd(rshiftbyte_m128(p128_1, 4)); \ + u32 part7 = movd(rshiftbyte_m128(p128_1, 8)); \ + u32 part8 = movd(rshiftbyte_m128(p128_1, 12)); \ + u32 part9 = movd(p128_2); \ + u32 part10 = movd(rshiftbyte_m128(p128_2, 4)); \ + u32 part11 = movd(rshiftbyte_m128(p128_2, 8)); \ + u32 part12 = movd(rshiftbyte_m128(p128_2, 12)); \ + u32 part13 = movd(p128_3); \ + u32 part14 = movd(rshiftbyte_m128(p128_3, 4)); \ + u32 part15 = movd(rshiftbyte_m128(p128_3, 8)); \ + u32 part16 = movd(rshiftbyte_m128(p128_3, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_CHUNK_32(part5, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_CHUNK_32(part6, bucket, offset + 20, reason, pt, conf_fn); \ + CONF_CHUNK_32(part7, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_CHUNK_32(part8, bucket, offset + 28, reason, pt, conf_fn); \ + CONF_CHUNK_32(part9, bucket, offset + 32, reason, pt, conf_fn); \ + CONF_CHUNK_32(part10, bucket, offset + 36, reason, pt, conf_fn); \ + CONF_CHUNK_32(part11, bucket, offset + 40, reason, pt, conf_fn); \ + CONF_CHUNK_32(part12, bucket, offset + 44, reason, pt, conf_fn); \ + CONF_CHUNK_32(part13, bucket, offset + 48, reason, pt, conf_fn); \ + CONF_CHUNK_32(part14, bucket, offset + 52, reason, pt, conf_fn); \ + CONF_CHUNK_32(part15, 
bucket, offset + 56, reason, pt, conf_fn); \ + CONF_CHUNK_32(part16, bucket, offset + 60, reason, pt, conf_fn); \ + } \ +} while(0) +#endif + +#define PREP_SHUF_MASK \ + m512 lo = and512(val, *lo_mask); \ + m512 hi = and512(rshift64_m512(val, 4), *lo_mask) + +#define TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo), \ + pshufb_m512(dup_mask[1], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M2 \ + TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M3 \ + TEDDY_VBMI_PSHUFB_OR_M2 \ + m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M4 \ + TEDDY_VBMI_PSHUFB_OR_M3 \ + m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)); + +#define TEDDY_VBMI_SL1_MASK 0xfffffffffffffffeULL +#define TEDDY_VBMI_SL2_MASK 0xfffffffffffffffcULL +#define TEDDY_VBMI_SL3_MASK 0xfffffffffffffff8ULL + +#define TEDDY_VBMI_SHIFT_M1 + +#define TEDDY_VBMI_SHIFT_M2 \ + TEDDY_VBMI_SHIFT_M1 \ + m512 sl1 = maskz_vpermb512(TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1); + +#define TEDDY_VBMI_SHIFT_M3 \ + TEDDY_VBMI_SHIFT_M2 \ + m512 sl2 = maskz_vpermb512(TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2); + +#define TEDDY_VBMI_SHIFT_M4 \ + TEDDY_VBMI_SHIFT_M3 \ + m512 sl3 = maskz_vpermb512(TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3); + +#define SHIFT_OR_M1 \ + shuf_or_b0 + +#define SHIFT_OR_M2 \ + or512(sl1, SHIFT_OR_M1) + +#define SHIFT_OR_M3 \ + or512(sl2, SHIFT_OR_M2) + +#define SHIFT_OR_M4 \ + or512(sl3, SHIFT_OR_M3) + +static really_inline +m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask, + UNUSED const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M1; + TEDDY_VBMI_SHIFT_M1; + return SHIFT_OR_M1; +} + +static really_inline +m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + 
PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M2; + TEDDY_VBMI_SHIFT_M2; + return SHIFT_OR_M2; +} + +static really_inline +m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M3; + TEDDY_VBMI_SHIFT_M3; + return SHIFT_OR_M3; +} + +static really_inline +m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M4; + TEDDY_VBMI_SHIFT_M4; + return SHIFT_OR_M4; +} + +#define PREP_CONF_FN(val, n) \ + prep_conf_teddy_m##n(&lo_mask, dup_mask, sl_msk, val) + +const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f +}; + +#define TEDDY_VBMI_SL1_POS 15 +#define TEDDY_VBMI_SL2_POS 14 +#define TEDDY_VBMI_SL3_POS 13 + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M1 + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M1 \ + sl_msk[0] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL1_POS); + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + sl_msk[1] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL2_POS); + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M4 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS); + +#define PREPARE_MASKS_1 \ + dup_mask[0] = set4x128(maskBase[0]); \ + dup_mask[1] = set4x128(maskBase[1]); + +#define PREPARE_MASKS_2 \ + PREPARE_MASKS_1 \ + dup_mask[2] = set4x128(maskBase[2]); \ + dup_mask[3] = set4x128(maskBase[3]); + +#define PREPARE_MASKS_3 \ + 
PREPARE_MASKS_2 \ + dup_mask[4] = set4x128(maskBase[4]); \ + dup_mask[5] = set4x128(maskBase[5]); + +#define PREPARE_MASKS_4 \ + PREPARE_MASKS_3 \ + dup_mask[6] = set4x128(maskBase[6]); \ + dup_mask[7] = set4x128(maskBase[7]); + +#define PREPARE_MASKS(n) \ + m512 lo_mask = set64x8(0xf); \ + m512 dup_mask[n * 2]; \ + m512 sl_msk[n - 1]; \ + PREPARE_MASKS_##n \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M##n + +#define TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffffffffffULL >> n_sh) +#define TEDDY_VBMI_CONF_MASK_FULL (0xffffffffffffffffULL << n_sh) +#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap) +#define TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffffffffffULL >> (64 - n_sh)) + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 64; \ + u32 n_sh = n_msk - 1; \ + const size_t loopBytes = 64 - n_sh; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + PREPARE_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + u64a k = TEDDY_VBMI_CONF_MASK_FULL; \ + m512 p_mask = set_mask_m512(~k); \ + u32 overlap = 0; \ + u64a patch = 0; \ + if (likely(ptr + loopBytes <= buf_end)) { \ + m512 p_mask0 = set_mask_m512(~TEDDY_VBMI_CONF_MASK_HEAD); \ + m512 r_0 = PREP_CONF_FN(loadu512(ptr), n_msk); \ + r_0 = or512(r_0, p_mask0); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, ptr, conf_fn); \ + ptr += loopBytes; \ + overlap = n_sh; \ + patch = TEDDY_VBMI_LOAD_MASK_PATCH; \ + } \ + \ + for (; ptr + loopBytes <= buf_end; ptr += loopBytes) { \ + __builtin_prefetch(ptr - n_sh + (64 * 2)); \ + CHECK_FLOOD; \ + m512 r_0 = PREP_CONF_FN(loadu512(ptr - n_sh), n_msk); \ + 
r_0 = or512(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn); \ + } \ + \ + assert(ptr + loopBytes > buf_end); \ + if (ptr < buf_end) { \ + u32 left = (u32)(buf_end - ptr); \ + u64a k1 = TEDDY_VBMI_CONF_MASK_VAR(left); \ + m512 p_mask1 = set_mask_m512(~k1); \ + m512 val_0 = loadu_maskz_m512(k1 | patch, ptr - overlap); \ + m512 r_0 = PREP_CONF_FN(val_0, n_msk); \ + r_0 = or512(r_0, p_mask1); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, ptr - overlap, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#elif defined(HAVE_AVX512) // AVX512 reinforced teddy #ifdef ARCH_64_BIT #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 56ec739f1..20ea938cf 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -134,7 +134,7 @@ const m256 *getMaskBase_fat(const struct Teddy *teddy) { return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); } -#if defined(HAVE_AVX512) +#if defined(HAVE_AVX512_REVERT) // revert to AVX2 Fat Teddy static really_inline const u64a *getReinforcedMaskBase_fat(const struct Teddy *teddy, u8 numMask) { diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h index 1dbeb0979..730850cb7 100644 --- a/src/fdr/teddy_runtime_common.h +++ b/src/fdr/teddy_runtime_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -383,12 +383,16 @@ m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset, static really_inline u64a 
getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte, - CautionReason reason) { + UNUSED CautionReason reason) { u64a confVal = 0; const u8 *buf = a->buf; size_t len = a->len; const u8 *confirm_loc = ptr + byte - 7; +#if defined(HAVE_AVX512VBMI) + if (likely(confirm_loc >= buf)) { +#else if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { +#endif confVal = lv_u64a(confirm_loc, buf, buf + len); } else { // r == VECTORING, confirm_loc < buf u64a histBytes = a->histBytes; diff --git a/src/hs.h b/src/hs.h index 38215de6c..105919fb8 100644 --- a/src/hs.h +++ b/src/hs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,6 +39,12 @@ * the individual component headers for documentation. */ +/* The current Hyperscan version information. */ + +#define HS_MAJOR 5 +#define HS_MINOR 3 +#define HS_PATCH 0 + #include "hs_compile.h" #include "hs_runtime.h" diff --git a/src/hs_compile.h b/src/hs_compile.h index 4c372ffe0..081d46387 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -563,7 +563,6 @@ hs_error_t HS_CDECL hs_compile_ext_multi(const char *const *expressions, * be used by ORing them together. Compared to @ref hs_compile(), fewer * valid values are provided: * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. - * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the * expression per stream. 
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset @@ -637,7 +636,6 @@ hs_error_t HS_CDECL hs_compile_lit(const char *expression, unsigned flags, * in place of an array will set the flags value for all patterns to zero. * Compared to @ref hs_compile_multi(), fewer valid values are provided: * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. - * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the * expression per stream. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset @@ -985,8 +983,8 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); * offset when a match is reported for this expression. (By default, no start * of match is returned.) * - * Enabling this behaviour may reduce performance and increase stream state - * requirements in streaming mode. + * For all the 3 modes, enabling this behaviour may reduce performance. And + * particularly, it may increase stream state requirements in streaming mode. */ #define HS_FLAG_SOM_LEFTMOST 256 diff --git a/src/hwlm/noodle_engine_avx2.c b/src/hwlm/noodle_engine_avx2.c index f10e4a7bc..5edc646af 100644 --- a/src/hwlm/noodle_engine_avx2.c +++ b/src/hwlm/noodle_engine_avx2.c @@ -210,7 +210,7 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, const u8 *d = buf + start, *e = buf + end; DEBUG_PRINTF("start %zu end %zu \n", start, end); assert(d < e); - u8 lastz0 = 0; + u32 lastz0 = 0; for (; d < e; d += 32) { m256 v = noCase ? 
and256(load256(d), caseMask) : load256(d); diff --git a/src/nfa/accel_dfa_build_strat.cpp b/src/nfa/accel_dfa_build_strat.cpp index 4508d4f1f..ae71e141a 100644 --- a/src/nfa/accel_dfa_build_strat.cpp +++ b/src/nfa/accel_dfa_build_strat.cpp @@ -214,7 +214,7 @@ static bool double_byte_ok(const AccelScheme &info) { return !info.double_byte.empty() && info.double_cr.count() < info.double_byte.size() && - info.double_cr.count() <= 2 && !info.double_byte.empty(); + info.double_cr.count() <= 2; } static diff --git a/src/nfa/vermicelli.h b/src/nfa/vermicelli.h index 817e681a8..ed797d83f 100644 --- a/src/nfa/vermicelli.h +++ b/src/nfa/vermicelli.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,7 +46,20 @@ const u8 *vermicelliExec(char c, char nocase, const u8 *buf, nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? vermMiniNocase(chars, buf, buf_end, 0) + : vermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf_end; + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (; buf < buf_end; buf++) { char cur = (char)*buf; @@ -59,8 +72,8 @@ const u8 *vermicelliExec(char c, char nocase, const u8 *buf, } return buf; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an @@ -99,7 +112,20 @@ const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, nocase ? 
"nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? vermMiniNocase(chars, buf, buf_end, 1) + : vermMini(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + return buf_end; + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (; buf < buf_end; buf++) { char cur = (char)*buf; @@ -112,8 +138,8 @@ const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, } return buf; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ size_t min = (size_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an @@ -149,12 +175,32 @@ const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); assert(buf < buf_end); - assert((buf_end - buf) >= VERM_BOUNDARY); - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? dvermMiniNocase(chars1, chars2, buf, buf_end) + : dvermMini(chars1, chars2, buf, buf_end); + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + u8 mask = nocase ? CASE_CLEAR : 0xff; + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf forward to the next aligned address. 
@@ -205,14 +251,32 @@ const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) " "over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf)); assert(buf < buf_end); - assert((buf_end - buf) >= VERM_BOUNDARY); - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; VERM_TYPE chars1 = VERM_SET_FN(c1); VERM_TYPE chars2 = VERM_SET_FN(c2); VERM_TYPE mask1 = VERM_SET_FN(m1); VERM_TYPE mask2 = VERM_SET_FN(m2); +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = dvermMiniMasked(chars1, chars2, mask1, mask2, buf, + buf_end); + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf forward to the next aligned address. @@ -244,6 +308,7 @@ const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, /* check for partial match at end */ if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); return buf_end - 1; } @@ -259,7 +324,20 @@ const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? 
rvermMiniNocase(chars, buf, buf_end, 0) + : rvermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf - 1; + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (buf_end--; buf_end >= buf; buf_end--) { char cur = (char)*buf_end; @@ -272,26 +350,22 @@ const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, } return buf_end; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf backward to the next aligned address. // There's some small overlap here, but we don't mind scanning it twice // if we can do it quickly, do we? - if (nocase) { - const u8 *ptr = - rvermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0); - if (ptr) { - return ptr; - } - } else { - const u8 *ptr = rvermUnalign(chars, buf_end - VERM_BOUNDARY, 0); - if (ptr) { - return ptr; - } + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 0) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 0); + + if (ptr) { + return ptr; } buf_end -= min; @@ -322,7 +396,20 @@ const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? 
rvermMiniNocase(chars, buf, buf_end, 1) + : rvermMini(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + return buf - 1; + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (buf_end--; buf_end >= buf; buf_end--) { char cur = (char)*buf_end; @@ -335,26 +422,22 @@ const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, } return buf_end; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf backward to the next aligned address. // There's some small overlap here, but we don't mind scanning it twice // if we can do it quickly, do we? - if (nocase) { - const u8 *ptr = - rvermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1); - if (ptr) { - return ptr; - } - } else { - const u8 *ptr = rvermUnalign(chars, buf_end - VERM_BOUNDARY, 1); - if (ptr) { - return ptr; - } + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 1) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 1); + + if (ptr) { + return ptr; } buf_end -= min; @@ -383,24 +466,36 @@ const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); assert(buf < buf_end); - assert((buf_end - buf) >= VERM_BOUNDARY); - size_t min = (size_t)buf_end % VERM_BOUNDARY; VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rdvermMiniNocase(chars1, chars2, buf, buf_end) + : rdvermMini(chars1, chars2, buf, buf_end); + + if (ptr) { + return ptr; + } + + // check for partial match at end ??? 
+ return buf - 1; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + size_t min = (size_t)buf_end % VERM_BOUNDARY; if (min) { // input not aligned, so we need to run one iteration with an unaligned // load, then skip buf forward to the next aligned address. There's // some small overlap here, but we don't mind scanning it twice if we // can do it quickly, do we? - const u8 *ptr; - if (nocase) { - ptr = rdvermPreconditionNocase(chars1, chars2, - buf_end - VERM_BOUNDARY); - } else { - ptr = rdvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); - } + const u8 *ptr = nocase ? rdvermPreconditionNocase(chars1, chars2, + buf_end - VERM_BOUNDARY) + : rdvermPrecondition(chars1, chars2, + buf_end - VERM_BOUNDARY); if (ptr) { return ptr; diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h index 0749470f5..3307486cf 100644 --- a/src/nfa/vermicelli_sse.h +++ b/src/nfa/vermicelli_sse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,6 +32,8 @@ * (users should include vermicelli.h) */ +#if !defined(HAVE_AVX512) + #define VERM_BOUNDARY 16 #define VERM_TYPE m128 #define VERM_SET_FN set16x8 @@ -391,3 +393,497 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { return NULL; } + +#else // HAVE_AVX512 + +#define VERM_BOUNDARY 64 +#define VERM_TYPE m512 +#define VERM_SET_FN set64x8 + +static really_inline +const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars, data); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *vermMiniNocase(m512 
chars, const u8 *buf, const u8 *buf_end, + char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars, v); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *vermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf % 64 == 0); + for (; buf + 63 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + return NULL; +} + +static really_inline +const u8 *vermSearchAlignedNocase(m512 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 63 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalign(m512 chars, const u8 *buf, char negate) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalignNocase(m512 chars, const u8 *buf, char negate) { + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *dvermMini(m512 chars1, m512 chars2, const u8 *buf, + const u8 
*buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermMiniMasked(m512 chars1, m512 chars2, m512 mask1, m512 mask2, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + if (buf[63] == c1 && buf[64] == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + m512 v = 
and512(casemask, data); + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + if ((buf[63] & CASE_CLEAR) == c1 && (buf[64] & CASE_CLEAR) == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedMasked(m512 chars1, m512 chars2, + m512 mask1, m512 mask2, u8 c1, u8 c2, u8 m1, + u8 m2, const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 64 == 0); + + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + if ((buf[63] & m1) == c1 && (buf[64] & m2) == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + m512 v = and512(casemask, data); + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionMasked(m512 chars1, m512 chars2, + m512 mask1, m512 mask2, const u8 *buf) { + m512 data = loadu512(buf); 
// unaligned + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *lastMatchOffset(const u8 *buf_end, u64a z) { + assert(z); + return buf_end - 64 + 63 - clz64(z); +} + +static really_inline +const u8 *rvermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars, data); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rvermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars, v); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf_end % 64 == 0); + for (; buf + 63 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAlignedNocase(m512 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf_end % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 63 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z 
= eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalign(m512 chars, const u8 *buf, char negate) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalignNocase(m512 chars, const u8 *buf, char negate) { + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermMini(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 64 == 0); + + for (; buf + 64 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = 
eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + if (buf_end[-65] == c1 && buf_end[-64] == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +static really_inline +const u8 *rdvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 64 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + if ((buf_end[-65] & CASE_CLEAR) == c1 + && (buf_end[-64] & CASE_CLEAR) == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { + m512 data = loadu512(buf); + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + + // no fixup of the boundary required - the aligned run will pick it up + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { + // due to laziness, nonalphas and nocase having interesting behaviour + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + // no fixup of the boundary required - the aligned run will pick it up + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + + return NULL; +} + +#endif // HAVE_AVX512 diff --git a/src/nfagraph/ng_cyclic_redundancy.cpp b/src/nfagraph/ng_cyclic_redundancy.cpp index c8d34687e..0b24bf07a 100644 --- a/src/nfagraph/ng_cyclic_redundancy.cpp +++ b/src/nfagraph/ng_cyclic_redundancy.cpp @@ -205,7 +205,7 @@ bool removeCyclicPathRedundancy(Graph 
&g, typename Graph::vertex_descriptor v, DEBUG_PRINTF(" - checking w %zu\n", g[w].index); - if (!searchForward(g, reach, colours, s, w)) { + if (!searchForward(g, reach, colours, succ_v, w)) { continue; } diff --git a/src/nfagraph/ng_limex_accel.cpp b/src/nfagraph/ng_limex_accel.cpp index fa46a42cc..f1f829f2c 100644 --- a/src/nfagraph/ng_limex_accel.cpp +++ b/src/nfagraph/ng_limex_accel.cpp @@ -170,7 +170,7 @@ void findPaths(const NGHolder &g, NFAVertex v, /* path has looped back to one of the active+boring acceleration * states. We can ignore this path if we have sufficient back- * off. */ - paths->push_back({CharReach()}); + paths->push_back({cr}); continue; } diff --git a/src/rose/rose_build_dedupe.cpp b/src/rose/rose_build_dedupe.cpp index 04144f560..d5d002d43 100644 --- a/src/rose/rose_build_dedupe.cpp +++ b/src/rose/rose_build_dedupe.cpp @@ -29,6 +29,7 @@ #include "rose_build_impl.h" #include "nfa/castlecompile.h" #include "nfagraph/ng_repeat.h" +#include "smallwrite/smallwrite_build.h" #include "util/compile_context.h" #include "util/boundary_reports.h" #include "util/make_unique.h" @@ -159,6 +160,10 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &build_in) } } + for (const auto &report_id : build.smwr.all_reports()) { + live_reports.insert(report_id); + } + // Collect live reports from boundary reports. 
insert(&live_reports, build.boundary.report_at_0); insert(&live_reports, build.boundary.report_at_0_eod); diff --git a/src/util/arch.h b/src/util/arch.h index c78ee9ced..985fec6ac 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -57,6 +57,10 @@ #define HAVE_AVX512 #endif +#if defined(__AVX512VBMI__) +#define HAVE_AVX512VBMI +#endif + /* * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros */ diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index c1449711b..42223133d 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -150,6 +150,14 @@ static really_inline u32 movd(const m128 in) { return _mm_cvtsi128_si32(in); } +#if defined(HAVE_AVX512) +static really_inline u32 movd512(const m512 in) { + // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), + // so we use 2-step conversions to work around. 
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); +} +#endif + static really_inline u64a movq(const m128 in) { #if defined(ARCH_X86_64) return _mm_cvtsi128_si64(in); @@ -318,6 +326,12 @@ static really_inline m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { return _mm512_maskz_shuffle_epi8(k, a, b); } + +#if defined(HAVE_AVX512VBMI) +#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) +#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) +#endif + #endif static really_inline diff --git a/tools/fuzz/aristocrats.py b/tools/fuzz/aristocrats.py new file mode 100755 index 000000000..7b6ff2bf3 --- /dev/null +++ b/tools/fuzz/aristocrats.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python + +from random import choice,randint +from optparse import OptionParser + +def generateRandomOptions(): + if options.hybrid: + allflags = "smiH8W" + else: + # Maintain an ordering for consistency. + allflags = "smiHV8WLP" + flags = "" + for f in allflags: + flags += choice(['', f]) + return flags + +parser = OptionParser() +parser.add_option("-d", "--depth", + action="store", type="int", dest="depth", default=200, + help="Depth of generation (akin to maximum length)") +parser.add_option("-c", "--count", + action="store", type="int", dest="count", default=1000, + help="Number of expressions to generate") +parser.add_option("-f", "--full", + action="store_true", dest="full", default=False, + help="Use a full character set including unprintables") +parser.add_option("-H", "--hybrid", + action="store_true", dest="hybrid", + help="Generate random flags for hybrid mode") + +(options, args) = parser.parse_args() +if len(args) != 0: + parser.error("incorrect number of arguments") + +if (options.full): + crange = range(0,256) + crange.remove(ord('\n')) +else: + crange = range(32, 127) + +for i in xrange(0, options.count): + len = randint(1, options.depth) + s = [ chr(choice(crange)) for x in xrange(len) ] + line = str(i) + ":/" + "".join(s) + "/" + 
generateRandomOptions() + print line diff --git a/tools/fuzz/completocrats.py b/tools/fuzz/completocrats.py new file mode 100755 index 000000000..60ac4d7ef --- /dev/null +++ b/tools/fuzz/completocrats.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +from itertools import * +from optparse import OptionParser + +LIMITED_ALPHABET = "abc[](){}*?+^$|:=.\\-" + +parser = OptionParser() +parser.add_option("-d", "--depth", + action="store", type="int", dest="depth", default=200, + help="Depth of generation (akin to maximum length)") + +parser.add_option("-f", "--full", + action="store_true", dest="full", default=False, + help="Use a full character set including unprintables") + +parser.add_option("-l", "--limited", + action="store_true", dest="limited", default=False, + help="Use a very limited character set: just " + LIMITED_ALPHABET) + +(options, args) = parser.parse_args() +if len(args) != 0: + parser.error("incorrect number of arguments") + +if (options.full): + crange = range(0,256) + crange.remove(ord('\n')) +elif (options.limited): + crange = [ ord(c) for c in LIMITED_ALPHABET ] +else: + crange = range(32, 127) + +srange = [ chr(c) for c in crange ] + +i = 0 +for x in product(srange, repeat = options.depth): + line = str(i) + ":/" + "".join(x) + "/" + print line + i += 1 diff --git a/tools/fuzz/heuristocrats.py b/tools/fuzz/heuristocrats.py new file mode 100755 index 000000000..49c7acb43 --- /dev/null +++ b/tools/fuzz/heuristocrats.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python + +from optparse import OptionParser +from random import * +import string +import sys + +# return a random non-degenerate (ie not [10]) partition of nChildren +def chooseLeafWidth(nChildren): + width = randint(1, 5) + width = min(width, nChildren-1) + s = sample(range(1, nChildren), width) + s.sort() + s = [0] + s + [nChildren] + v = [ s[i+1] - s[i] for i in range(0, len(s)-1) if s[i+1] != s[i] ] + return v + +def generateConcat(nChildren, atTopIgnored): + v = [ generateRE(w, atTop = False) for w in 
chooseLeafWidth(nChildren) ] + v = [ r for r in v if r != '' ] + return string.join(v, "") + +def makeGroup(s): + # Parenthesise either in normal parens or a non-capturing group. + if randint(0, 1) == 0: + return "(" + s + ")" + else: + return "(?:" + s + ")" + +def generateAlt(nChildren, atTop): + v = [ generateRE(w, [generateAlt], atTop) for w in chooseLeafWidth(nChildren) ] + v = [ r for r in v if r != '' ] + s = string.join(v, "|") + if len(v) == 1: + return s + else: + return makeGroup(s) + +def generateQuant(nChildren, atTopIgnored): + lo = int(round(expovariate(0.2))) + hi = lo + int(round(expovariate(0.2))) + q = choice(["*", "?", "+", "{%d}"%lo, "{%d,}"%lo, "{%d,%d}"%(lo,hi)]) + r = generateRE(nChildren, [generateQuant], atTop = False) + if (len(r) == 1) or (r[0] != '(' and r[-1] != ")"): + return r + q + else: + return makeGroup(r) + q + +def generateChar(nChildren, atTop = False): + return chr(choice(alphabet)) + +def generateNocaseChar(nChildren, atTop = False): + 'Either generate an uppercase char from the alphabet or a nocase class [Aa]' + c = generateChar(nChildren, atTop) + if random() < 0.5: + return c.upper() + else: + return '[' + c.upper() + c.lower() + ']' + +def generateDot(nChildren, atTop = False): + return "." + +def generateBoundary(nChildren, atTop = False): + # \b, \B in parens so that we can repeat them and still be accepted by + # libpcre + return makeGroup('\\' + choice('bB')) + +def generateCharClass(nChildren, atTop = False): + s = "" + if random() < 0.2: + s = "^" + nChars = randint(1,4) + else: + nChars = randint(2,4) + + for i in xrange(nChars): + s += generateChar(1) + return "[" + s + "]" + +def generateOptionsFlags(nChildren, atTop = False): + allflags = "smix" + pos_flags = sample(allflags, randint(1, len(allflags))) + neg_flags = sample(allflags, randint(1, len(allflags))) + s = '(?' 
+ ''.join(pos_flags) + '-' + ''.join(neg_flags) + ')' + return s + +def generateLogicalId(nChildren, atTop = False): + return str(randint(0, options.count)) + +def makeLogicalGroup(s): + return "(" + s + ")" + +def generateLogicalNot(nChildren, atTop): + r = generateCombination(nChildren, [generateLogicalNot], atTop = False) + return "!" + makeLogicalGroup(r) + +def generateLogicalAnd(nChildren, atTop): + v = [ generateCombination(w, [generateLogicalAnd], atTop = False) for w in chooseLeafWidth(nChildren) ] + v = [ r for r in v if r != '' ] + s = string.join(v, "&") + if len(v) == 1: + return s + else: + return makeLogicalGroup(s) + +def generateLogicalOr(nChildren, atTop): + v = [ generateCombination(w, [generateLogicalOr], atTop = False) for w in chooseLeafWidth(nChildren) ] + v = [ r for r in v if r != '' ] + s = string.join(v, "|") + if len(v) == 1: + return s + else: + return makeLogicalGroup(s) + +weightsTree = [ + (generateConcat, 10), + (generateAlt, 3), + (generateQuant, 2), + ] + +weightsLeaf = [ + (generateChar, 30), + (generateCharClass, 5), + (generateDot, 5), + (generateNocaseChar, 2), + (generateBoundary, 1), + (generateOptionsFlags, 1) + ] + +weightsLogicalTree = [ + (generateLogicalNot, 1), + (generateLogicalAnd, 5), + (generateLogicalOr, 5), + ] + +weightsLogicalLeaf = [ + (generateLogicalId, 1), + ] + +def genChoices(weighted): + r = [] + for (f, w) in weighted: + r = r + [f] * w + return r + +choicesTree = genChoices(weightsTree) +choicesLeaf = genChoices(weightsLeaf) +choicesLogicalTree = genChoices(weightsLogicalTree) +choicesLogicalLeaf = genChoices(weightsLogicalLeaf) + +weightsAnchor = [ + ("\\A%s\\Z", 1), + ("\\A%s\\z", 1), + ("\\A%s", 4), + ("%s\\Z", 2), + ("%s\\z", 2), + ("^%s$", 1), + ("^%s", 4), + ("%s$", 2), + ("%s", 25) + ] +choicesAnchor = genChoices(weightsAnchor) + +def generateRE(nChildren, suppressList = [], atTop = False): + if atTop: + anchorSubstituteString = choice(choicesAnchor) + else: + anchorSubstituteString = "%s" + + 
nChildren -= 1 + if nChildren == 0: + res = choice(choicesLeaf)(nChildren, atTop) + else: + c = [ ch for ch in choicesTree if ch not in suppressList ] + res = choice(c)(nChildren, atTop) + + return anchorSubstituteString % res + +def generateCombination(nChildren, suppressList = [], atTop = False): + nChildren -= 1 + if nChildren == 0: + res = choice(choicesLogicalLeaf)(nChildren, atTop) + else: + c = [ ch for ch in choicesLogicalTree if ch not in suppressList ] + res = choice(c)(nChildren, atTop) + + return res + +def generateRandomOptions(): + if options.hybrid: + allflags = "smiH8W" + else: + # Maintain an ordering for consistency. + allflags = "smiHV8WLP" + flags = "" + for f in allflags: + flags += choice(['', f]) + if options.logical: + flags += choice(['', 'Q']) + return flags + +def generateRandomExtParam(depth, extparam): + if not extparam: + return "" + params = [] + if choice((False, True)): + params.append("min_length=%u" % randint(1, depth)) + if choice((False, True)): + params.append("min_offset=%u" % randint(1, depth)) + if choice((False, True)): + params.append("max_offset=%u" % randint(1, depth*3)) + if choice((False, True)): + dist = randint(1, 3) + if choice((False, True)): + params.append("edit_distance=%u" % dist) + else: + params.append("hamming_distance=%u" % dist) + if params: + return "{" + ",".join(params) + "}" + else: + return "" + +parser = OptionParser() +parser.add_option("-d", "--depth", + action="store", type="int", dest="depth", default=200, + help="Depth of generation (akin to maximum length)") +parser.add_option("-c", "--count", + action="store", type="int", dest="count", default=1000, + help="Number of expressions to generate") +parser.add_option("-a", "--alphabet", + action="store", type="int", dest="alphabet", default=26, + help="Size of alphabet to generate character expressions over (starting with lowercase 'a')") +parser.add_option("-i", "--nocase", + action="store_true", dest="nocase", + help="Use a caseless alphabet for 
character generation") +parser.add_option("-x", "--extparam", + action="store_true", dest="extparam", + help="Generate random extended parameters") +parser.add_option("-l", "--logical", + action="store_true", dest="logical", + help="Generate logical combination expressions") +parser.add_option("-H", "--hybrid", + action="store_true", dest="hybrid", + help="Generate random flags for hybrid mode") + +(options, args) = parser.parse_args() +if len(args) != 0: + parser.error("incorrect number of arguments") + +alphabet = range(ord('a'), ord('a') + options.alphabet) +if options.nocase: + alphabet += range(ord('A'), ord('A') + options.alphabet) + +for i in xrange(0, options.count): + print "%08d:/%s/%s%s" % (i, generateRE(randint(1, options.depth), atTop = True), generateRandomOptions(), generateRandomExtParam(options.depth, options.extparam)) + +if options.logical: + for i in xrange(options.count, options.count + 3000): + print "%08d:/%s/C" % (i, generateCombination(randint(1, options.depth), atTop = True)) diff --git a/tools/fuzz/limited_dict.txt b/tools/fuzz/limited_dict.txt new file mode 100644 index 000000000..7c3daf4ba --- /dev/null +++ b/tools/fuzz/limited_dict.txt @@ -0,0 +1,9 @@ +hatstand +teakettle +badgerbrush +mnemosyne +rapscallion +acerbic +blackhat +rufous +echolalia diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp index 8e85d7aea..4e65c8e0b 100644 --- a/tools/hsbench/main.cpp +++ b/tools/hsbench/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -207,7 +207,9 @@ void usage(const char *error) { printf(" -P Benchmark using PCRE (if supported).\n"); #endif #if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) || defined(_WIN32) - printf(" -T CPU,CPU,... Benchmark with threads on these CPUs.\n"); + printf(" -T CPU,CPU,... 
or -T CPU-CPU\n"); + printf(" Benchmark with threads on specified CPUs or CPU" + " range.\n"); #endif printf(" -i DIR Don't compile, load from files in DIR" " instead.\n"); @@ -354,7 +356,8 @@ void processArgs(int argc, char *argv[], vector &sigSets, case 'T': if (!strToList(optarg, threadCores)) { usage("Couldn't parse argument to -T flag, should be" - " a list of positive integers."); + " a list of positive integers or 2 integers" + " connected with hyphen."); exit(1); } break; diff --git a/tools/hsbench/scripts/pcapCorpus.py b/tools/hsbench/scripts/pcapCorpus.py index c10bfef37..30d6192c6 100755 --- a/tools/hsbench/scripts/pcapCorpus.py +++ b/tools/hsbench/scripts/pcapCorpus.py @@ -216,8 +216,9 @@ def enchunk_pcap(pcapFN, sqliteFN): # # Read in the contents of the pcap file, adding stream segments as found # - pkt_cnt = 0; - ip_pkt_cnt = 0; + pkt_cnt = 0 + ip_pkt_cnt = 0 + ip_pkt_off = 0 unsupported_ip_protocol_cnt = 0 pcap_ref = pcap.pcap(pcapFN) done = False @@ -231,16 +232,24 @@ def enchunk_pcap(pcapFN, sqliteFN): pkt_cnt += 1 linkLayerType = struct.unpack('!H', packet[(pcap_ref.dloff - 2):pcap_ref.dloff])[0] - if linkLayerType != ETHERTYPE_IP: - # - # We're only interested in IP packets - # + # + # We're only interested in IP packets + # + if linkLayerType == ETHERTYPE_VLAN: + linkLayerType = struct.unpack('!H', packet[(pcap_ref.dloff + 2):(pcap_ref.dloff + 4)])[0] + if linkLayerType != ETHERTYPE_IP: + continue + else: + ip_pkt_off = pcap_ref.dloff + 4 + elif linkLayerType == ETHERTYPE_IP: + ip_pkt_off = pcap_ref.dloff + else: continue ip_pkt_cnt += 1 - ip_pkt_total_len = struct.unpack('!H', packet[pcap_ref.dloff + 2: pcap_ref.dloff + 4])[0] - ip_pkt = packet[pcap_ref.dloff:pcap_ref.dloff + ip_pkt_total_len] + ip_pkt_total_len = struct.unpack('!H', packet[ip_pkt_off + 2: ip_pkt_off + 4])[0] + ip_pkt = packet[ip_pkt_off:ip_pkt_off + ip_pkt_total_len] pkt_protocol = struct.unpack('B', ip_pkt[9])[0] if (pkt_protocol != IPPROTO_UDP) and (pkt_protocol != 
IPPROTO_TCP): diff --git a/tools/hscollider/GroundTruth.cpp b/tools/hscollider/GroundTruth.cpp index f30a8f5eb..a2673063c 100644 --- a/tools/hscollider/GroundTruth.cpp +++ b/tools/hscollider/GroundTruth.cpp @@ -241,6 +241,13 @@ void addCallout(string &re) { re.append("\\E)(?C)"); } +static +bool isUtf8(const CompiledPcre &compiled) { + unsigned long int options = 0; + pcre_fullinfo(compiled.bytecode, NULL, PCRE_INFO_OPTIONS, &options); + return options & PCRE_UTF8; +} + unique_ptr GroundTruth::compile(unsigned id, bool no_callouts) { bool highlander = false; @@ -380,6 +387,8 @@ GroundTruth::compile(unsigned id, bool no_callouts) { throw PcreCompileFailure(oss.str()); } + compiled->utf8 |= isUtf8(*compiled); + return compiled; } @@ -451,13 +460,6 @@ int scanBasic(const CompiledPcre &compiled, const string &buffer, return ret; } -static -bool isUtf8(const CompiledPcre &compiled) { - unsigned long int options = 0; - pcre_fullinfo(compiled.bytecode, NULL, PCRE_INFO_OPTIONS, &options); - return options & PCRE_UTF8; -} - static CaptureVec makeCaptureVec(const vector &ovector, int ret) { assert(ret > 0); diff --git a/unit/internal/graph_undirected.cpp b/unit/internal/graph_undirected.cpp index babc01a6a..73d3e3570 100644 --- a/unit/internal/graph_undirected.cpp +++ b/unit/internal/graph_undirected.cpp @@ -40,12 +40,12 @@ using namespace std; using namespace ue2; struct SimpleV { - size_t index; + size_t index = 0; string test_v = "SimpleV"; }; struct SimpleE { - size_t index; + size_t index = 0; string test_e = "SimpleE"; }; diff --git a/util/expressions.cpp b/util/expressions.cpp index b33f89729..d6334bad9 100644 --- a/util/expressions.cpp +++ b/util/expressions.cpp @@ -146,9 +146,8 @@ bool isIgnorable(const std::string &f) { #ifndef _WIN32 void loadExpressions(const string &inPath, ExpressionMap &exprMap) { // Is our input path a file or a directory? 
- int fd = open(inPath.c_str(), O_RDONLY); struct stat st; - if (fstat(fd, &st) != 0) { + if (stat(inPath.c_str(), &st) != 0) { cerr << "Can't stat path: '" << inPath << "'" << endl; exit(1); } @@ -161,7 +160,7 @@ void loadExpressions(const string &inPath, ExpressionMap &exprMap) { exit(1); } } else if (S_ISDIR(st.st_mode)) { - DIR *d = fdopendir(fd); + DIR *d = opendir(inPath.c_str()); if (d == nullptr) { cerr << "Can't open directory: '" << inPath << "'" << endl; exit(1); @@ -192,10 +191,11 @@ void loadExpressions(const string &inPath, ExpressionMap &exprMap) { } (void)closedir(d); } else { - cerr << "Can't stat path: '" << inPath << "'" << endl; + cerr << "Unsupported file type " + << hex << showbase << (st.st_mode & S_IFMT) + << " for path: '" << inPath << "'" << endl; exit(1); } - (void)close(fd); } #else // windows TODO: improve void HS_CDECL loadExpressions(const string &inPath, ExpressionMap &exprMap) { diff --git a/util/string_util.h b/util/string_util.h index b44586ea7..ab3751c1f 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -54,8 +54,8 @@ inline bool fromString(const std::string &s, T& val) return true; } -// read in a comma-separated set of values: very simple impl, not for -// external consumption +// read in a comma-separated or hyphen-connected set of values: very simple +// impl, not for external consumption template inline bool strToList(const std::string &s, std::vector& out) { @@ -68,7 +68,17 @@ inline bool strToList(const std::string &s, std::vector& out) } out.push_back(val); - } while (i.get(c) && c == ','); + + i.get(c); + if (c == '-') { + T val_end; + i >> val_end; + while (val < val_end) { + out.push_back(++val); + } + break; + } + } while (c == ','); return 
!out.empty(); }