From edb3f8c51e0b75d9927868b3f5dec39b64dbc26d Mon Sep 17 00:00:00 2001 From: Frank Wessels Date: Thu, 1 Aug 2019 13:47:54 -0700 Subject: [PATCH 01/25] Fix small typo --- doc/dev-reference/getting_started.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/dev-reference/getting_started.rst b/doc/dev-reference/getting_started.rst index 45d4fbbb7..b38128733 100644 --- a/doc/dev-reference/getting_started.rst +++ b/doc/dev-reference/getting_started.rst @@ -260,7 +260,7 @@ instead of potentially executing illegal instructions. The API function :c:func:`hs_valid_platform` can be used by application writers to determine if the current platform is supported by Hyperscan. -At of this release, the variants of the runtime that are built, and the CPU +As of this release, the variants of the runtime that are built, and the CPU capability that is required, are the following: +----------+-------------------------------+---------------------------+ From 3ca360275552e26905b9edd3464a676506ff8501 Mon Sep 17 00:00:00 2001 From: Pavel Shlyak Date: Mon, 19 Aug 2019 00:50:03 +0300 Subject: [PATCH 02/25] A tiny cleanup --- src/nfa/accel_dfa_build_strat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nfa/accel_dfa_build_strat.cpp b/src/nfa/accel_dfa_build_strat.cpp index 4508d4f1f..ae71e141a 100644 --- a/src/nfa/accel_dfa_build_strat.cpp +++ b/src/nfa/accel_dfa_build_strat.cpp @@ -214,7 +214,7 @@ static bool double_byte_ok(const AccelScheme &info) { return !info.double_byte.empty() && info.double_cr.count() < info.double_byte.size() && - info.double_cr.count() <= 2 && !info.double_byte.empty(); + info.double_cr.count() <= 2; } static From 7b406c647cb6e8f65659539aec145453dd1b9d91 Mon Sep 17 00:00:00 2001 From: flip111 Date: Fri, 29 Nov 2019 15:59:42 +0100 Subject: [PATCH 03/25] Update compilation.rst fixed two spelling mistakes --- doc/dev-reference/compilation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst index 93290467b..a2eee106d 100644 --- a/doc/dev-reference/compilation.rst +++ b/doc/dev-reference/compilation.rst @@ -75,12 +75,12 @@ characters exist in regular grammer like ``[``, ``]``, ``(``, ``)``, ``{``, While in pure literal case, all these meta characters lost extra meanings expect for that they are just common ASCII codes. -Hyperscan is initially designed to process common regualr expressions. It is +Hyperscan is initially designed to process common regular expressions. It is hence embedded with a complex parser to do comprehensive regular grammer interpretion. Particularly, the identification of above meta characters is the basic step for the interpretion of far more complex regular grammers. -However in real cases, patterns may not always be regualr expressions. They +However in real cases, patterns may not always be regular expressions. They could just be pure literals. Problem will come if the pure literals contain regular meta characters. 
Supposing fed directly into traditional Hyperscan compile API, all these meta characters will be interpreted in predefined ways, From 17de350599b984f22f78728ac4d615445472b78c Mon Sep 17 00:00:00 2001 From: Dmitry Yakovenko Date: Sun, 12 Jan 2020 00:35:37 +0300 Subject: [PATCH 04/25] Fix platform compatibility check --- src/database.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/database.c b/src/database.c index dc03bf1fb..1a79800e2 100644 --- a/src/database.c +++ b/src/database.c @@ -114,8 +114,8 @@ hs_error_t HS_CDECL hs_serialize_database(const hs_database_t *db, char **bytes, static hs_error_t db_check_platform(const u64a p) { if (p != hs_current_platform - && p != hs_current_platform_no_avx2 - && p != hs_current_platform_no_avx512) { + && p != (hs_current_platform | hs_current_platform_no_avx2) + && p != (hs_current_platform | hs_current_platform_no_avx512)) { return HS_DB_PLATFORM_ERROR; } // passed all checks From 6f6e2744df560b7c5dda5b30eccf7c8b8a7636e0 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Mon, 6 Jan 2020 15:32:37 +0000 Subject: [PATCH 05/25] Cyclic redundancy: change DFS termination condition into successors of cyclic vertex --- src/nfagraph/ng_cyclic_redundancy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nfagraph/ng_cyclic_redundancy.cpp b/src/nfagraph/ng_cyclic_redundancy.cpp index c8d34687e..0b24bf07a 100644 --- a/src/nfagraph/ng_cyclic_redundancy.cpp +++ b/src/nfagraph/ng_cyclic_redundancy.cpp @@ -205,7 +205,7 @@ bool removeCyclicPathRedundancy(Graph &g, typename Graph::vertex_descriptor v, DEBUG_PRINTF(" - checking w %zu\n", g[w].index); - if (!searchForward(g, reach, colours, s, w)) { + if (!searchForward(g, reach, colours, succ_v, w)) { continue; } From 954719597b960540af21ae650b780ac12dfc6a0c Mon Sep 17 00:00:00 2001 From: Nan Xiao Date: Tue, 7 Jan 2020 09:52:46 +0800 Subject: [PATCH 06/25] Processing VLAN packets in pcapCorpus.py Signed-off-by: Nan Xiao --- tools/hsbench/scripts/pcapCorpus.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tools/hsbench/scripts/pcapCorpus.py b/tools/hsbench/scripts/pcapCorpus.py index c10bfef37..30d6192c6 100755 --- a/tools/hsbench/scripts/pcapCorpus.py +++ b/tools/hsbench/scripts/pcapCorpus.py @@ -216,8 +216,9 @@ def enchunk_pcap(pcapFN, sqliteFN): # # Read in the contents of the pcap file, adding stream segments as found # - pkt_cnt = 0; - ip_pkt_cnt = 0; + pkt_cnt = 0 + ip_pkt_cnt = 0 + ip_pkt_off = 0 unsupported_ip_protocol_cnt = 0 pcap_ref = pcap.pcap(pcapFN) done = False @@ -231,16 +232,24 @@ def enchunk_pcap(pcapFN, sqliteFN): pkt_cnt += 1 linkLayerType = struct.unpack('!H', packet[(pcap_ref.dloff - 2):pcap_ref.dloff])[0] - if linkLayerType != ETHERTYPE_IP: - # - # We're only interested in IP packets - # + # + # We're only interested in IP packets + # + if linkLayerType == ETHERTYPE_VLAN: + linkLayerType = struct.unpack('!H', packet[(pcap_ref.dloff + 2):(pcap_ref.dloff + 4)])[0] + if linkLayerType != ETHERTYPE_IP: + continue + else: + ip_pkt_off = pcap_ref.dloff + 4 + elif linkLayerType == ETHERTYPE_IP: + ip_pkt_off = pcap_ref.dloff + else: continue ip_pkt_cnt += 1 - ip_pkt_total_len = struct.unpack('!H', packet[pcap_ref.dloff + 2: pcap_ref.dloff + 4])[0] - ip_pkt = packet[pcap_ref.dloff:pcap_ref.dloff + ip_pkt_total_len] + ip_pkt_total_len = struct.unpack('!H', packet[ip_pkt_off + 2: ip_pkt_off + 4])[0] + ip_pkt = packet[ip_pkt_off:ip_pkt_off + ip_pkt_total_len] pkt_protocol = struct.unpack('B', ip_pkt[9])[0] if 
(pkt_protocol != IPPROTO_UDP) and (pkt_protocol != IPPROTO_TCP): From cbf04d615d8b1dad5bbaa8deba28ebc5750dc063 Mon Sep 17 00:00:00 2001 From: Wang Xiang W Date: Mon, 17 Feb 2020 04:41:36 -0500 Subject: [PATCH 07/25] gcc-9: fix CMake parsing of CPU architecture for non-English locale Fixes github issue #217 --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 83197af1e..297a3b33a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -187,9 +187,9 @@ else() set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} OUTPUT_VARIABLE _GCC_OUTPUT) - string(FIND "${_GCC_OUTPUT}" "Known" POS) - string(SUBSTRING "${_GCC_OUTPUT}" 0 ${POS} _GCC_OUTPUT) - string(REGEX REPLACE ".*march=[ \t]*([^ \n]*)[ \n].*" "\\1" + string(FIND "${_GCC_OUTPUT}" "march" POS) + string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) + string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}") # test the parsed flag From f658c4e149ae9bfe259b55397e2b294f6bf60687 Mon Sep 17 00:00:00 2001 From: Wang Xiang W Date: Fri, 6 Mar 2020 03:24:07 -0500 Subject: [PATCH 08/25] Noodle: avoid an extra convert instruction fixes github issue #221 --- src/hwlm/noodle_engine_avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hwlm/noodle_engine_avx2.c b/src/hwlm/noodle_engine_avx2.c index f10e4a7bc..5edc646af 100644 --- a/src/hwlm/noodle_engine_avx2.c +++ b/src/hwlm/noodle_engine_avx2.c @@ -210,7 +210,7 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, const u8 *d = buf + start, *e = buf + end; DEBUG_PRINTF("start %zu end %zu \n", start, end); assert(d < e); - u8 lastz0 = 0; + u32 lastz0 = 0; for (; d < e; d += 32) { m256 v = noCase ? and256(load256(d), caseMask) : load256(d); From 411317639baacd39b92649d6b8720b4595f96897 Mon Sep 17 00:00:00 2001 From: Wang Xiang W Date: Wed, 18 Mar 2020 09:36:51 -0400 Subject: [PATCH 09/25] Limex: fix acceleration path analysis --- src/nfagraph/ng_limex_accel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nfagraph/ng_limex_accel.cpp b/src/nfagraph/ng_limex_accel.cpp index fa46a42cc..f1f829f2c 100644 --- a/src/nfagraph/ng_limex_accel.cpp +++ b/src/nfagraph/ng_limex_accel.cpp @@ -170,7 +170,7 @@ void findPaths(const NGHolder &g, NFAVertex v, /* path has looped back to one of the active+boring acceleration * states. We can ignore this path if we have sufficient back- * off. 
*/ - paths->push_back({CharReach()}); + paths->push_back({cr}); continue; } From 7c4490cfc9dbee7e95c9d80c54be57e91f63cd67 Mon Sep 17 00:00:00 2001 From: Wang Xiang W Date: Fri, 3 Apr 2020 12:08:10 -0400 Subject: [PATCH 10/25] smallwrite: add report dedupe check --- src/rose/rose_build_dedupe.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/rose/rose_build_dedupe.cpp b/src/rose/rose_build_dedupe.cpp index 04144f560..d5d002d43 100644 --- a/src/rose/rose_build_dedupe.cpp +++ b/src/rose/rose_build_dedupe.cpp @@ -29,6 +29,7 @@ #include "rose_build_impl.h" #include "nfa/castlecompile.h" #include "nfagraph/ng_repeat.h" +#include "smallwrite/smallwrite_build.h" #include "util/compile_context.h" #include "util/boundary_reports.h" #include "util/make_unique.h" @@ -159,6 +160,10 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &build_in) } } + for (const auto &report_id : build.smwr.all_reports()) { + live_reports.insert(report_id); + } + // Collect live reports from boundary reports. insert(&live_reports, build.boundary.report_at_0); insert(&live_reports, build.boundary.report_at_0_eod); From 934473f4061d99c13ce89feb3fa59fb1d23553b9 Mon Sep 17 00:00:00 2001 From: Wang Xiang W Date: Wed, 8 Apr 2020 11:26:05 -0400 Subject: [PATCH 11/25] hscollider: fix UTF8 check for patterns --- tools/hscollider/GroundTruth.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/hscollider/GroundTruth.cpp b/tools/hscollider/GroundTruth.cpp index f30a8f5eb..a2673063c 100644 --- a/tools/hscollider/GroundTruth.cpp +++ b/tools/hscollider/GroundTruth.cpp @@ -241,6 +241,13 @@ void addCallout(string &re) { re.append("\\E)(?C)"); } +static +bool isUtf8(const CompiledPcre &compiled) { + unsigned long int options = 0; + pcre_fullinfo(compiled.bytecode, NULL, PCRE_INFO_OPTIONS, &options); + return options & PCRE_UTF8; +} + unique_ptr GroundTruth::compile(unsigned id, bool no_callouts) { bool highlander = false; @@ -380,6 +387,8 @@ GroundTruth::compile(unsigned id, bool no_callouts) { throw PcreCompileFailure(oss.str()); } + compiled->utf8 |= isUtf8(*compiled); + return compiled; } @@ -451,13 +460,6 @@ int scanBasic(const CompiledPcre &compiled, const string &buffer, return ret; } -static -bool isUtf8(const CompiledPcre &compiled) { - unsigned long int options = 0; - pcre_fullinfo(compiled.bytecode, NULL, PCRE_INFO_OPTIONS, &options); - return options & PCRE_UTF8; -} - static CaptureVec makeCaptureVec(const vector &ovector, int ret) { assert(ret > 0); From 8344395bfd28a1b479f3a08ddbd0f2c2d43876e5 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Tue, 14 Apr 2020 09:36:46 +0000 Subject: [PATCH 12/25] avoid crash in addLitExpression() fixes github issue #205 --- src/compiler/compiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index 3382ff421..de6909e76 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -402,7 +402,7 @@ void addLitExpression(NG &ng, unsigned index, const char *expression, } // Ensure that our pattern isn't too long (in characters). 
- if (strlen(expression) > cc.grey.limitPatternLength) { + if (expLength > cc.grey.limitPatternLength) { throw CompileError("Pattern length exceeds limit."); } From 22991f2da1bd849cb19ca363e8956f3a151231f2 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 16 Apr 2020 12:16:14 +0000 Subject: [PATCH 13/25] update description of HS_FLAG_SOM_LEFTMOST to eliminate ambiguity --- doc/dev-reference/compilation.rst | 4 ++-- src/hs_compile.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst index a2eee106d..e78552b40 100644 --- a/doc/dev-reference/compilation.rst +++ b/doc/dev-reference/compilation.rst @@ -55,7 +55,7 @@ Hyperscan provides support for targeting a database at a particular CPU platform; see :ref:`instr_specialization` for details. ===================== -Compile Pure Literals +Compile Pure Literals ===================== Pure literal is a special case of regular expression. A character sequence is @@ -98,7 +98,7 @@ In ``v5.2.0``, Hyperscan introduces 2 new compile APIs for pure literal patterns #. :c:func:`hs_compile_lit_multi`: compiles an array of pure literals into a pattern database. All of the supplied patterns will be scanned for concurrently at scan time, with user-supplied identifiers returned when they - match. + match. These 2 APIs are designed for use cases where all patterns contained in the target rule set are pure literals. Users can pass the initial pure literal diff --git a/src/hs_compile.h b/src/hs_compile.h index 4c372ffe0..8f48a7925 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -985,8 +985,8 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); * offset when a match is reported for this expression. (By default, no start * of match is returned.) * - * Enabling this behaviour may reduce performance and increase stream state - * requirements in streaming mode. + * For all the 3 modes, enabling this behaviour may reduce performance. And + * particularly, it may increase stream state requirements in streaming mode. */ #define HS_FLAG_SOM_LEFTMOST 256 From a742a5fb8b6984fa911db07a52f2aa58dd238430 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= Date: Thu, 6 Feb 2020 11:28:34 +0100 Subject: [PATCH 14/25] Fix few typos in CMakeLists.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Petr Vaněk --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 297a3b33a..ff7a3984c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -326,7 +326,7 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") set (FAT_RUNTIME_REQUISITES TRUE) endif() endif() - CMAKE_DEPENDENT_OPTION(FAT_RUNTIME "Build a library that supports multiple microarchitecures" ${RELEASE_BUILD} "FAT_RUNTIME_REQUISITES" OFF) + CMAKE_DEPENDENT_OPTION(FAT_RUNTIME "Build a library that supports multiple microarchitectures" ${RELEASE_BUILD} "FAT_RUNTIME_REQUISITES" OFF) endif () include (${CMAKE_MODULE_PATH}/arch.cmake) @@ -340,7 +340,7 @@ if (NOT WIN32) set(C_FLAGS_TO_CHECK # Variable length arrays are way bad, most especially at run time "-Wvla" -# Pointer arith on void pointers is doing it wong. +# Pointer arith on void pointers is doing it wrong. 
"-Wpointer-arith" # Build our C code with -Wstrict-prototypes -Wmissing-prototypes "-Wstrict-prototypes" @@ -383,7 +383,7 @@ if (CC_PAREN_EQUALITY) set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-parentheses-equality") endif() -# clang compains about unused const vars in our Ragel-generated code. +# clang complains about unused const vars in our Ragel-generated code. CHECK_CXX_COMPILER_FLAG("-Wunused-const-variable" CXX_UNUSED_CONST_VAR) if (CXX_UNUSED_CONST_VAR) set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-const-variable") From 55f336751d5a5925c82f78802291ea3cc03beb6b Mon Sep 17 00:00:00 2001 From: "Federico G. Schwindt" Date: Fri, 20 Mar 2020 11:44:18 +0000 Subject: [PATCH 15/25] Fix undefined behaviour Just use stat and opendir. Fixes #228. While here correct the error message. --- util/expressions.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/util/expressions.cpp b/util/expressions.cpp index b33f89729..d6334bad9 100644 --- a/util/expressions.cpp +++ b/util/expressions.cpp @@ -146,9 +146,8 @@ bool isIgnorable(const std::string &f) { #ifndef _WIN32 void loadExpressions(const string &inPath, ExpressionMap &exprMap) { // Is our input path a file or a directory? - int fd = open(inPath.c_str(), O_RDONLY); struct stat st; - if (fstat(fd, &st) != 0) { + if (stat(inPath.c_str(), &st) != 0) { cerr << "Can't stat path: '" << inPath << "'" << endl; exit(1); } @@ -161,7 +160,7 @@ void loadExpressions(const string &inPath, ExpressionMap &exprMap) { exit(1); } } else if (S_ISDIR(st.st_mode)) { - DIR *d = fdopendir(fd); + DIR *d = opendir(inPath.c_str()); if (d == nullptr) { cerr << "Can't open directory: '" << inPath << "'" << endl; exit(1); @@ -192,10 +191,11 @@ void loadExpressions(const string &inPath, ExpressionMap &exprMap) { } (void)closedir(d); } else { - cerr << "Can't stat path: '" << inPath << "'" << endl; + cerr << "Unsupported file type " + << hex << showbase << (st.st_mode & S_IFMT) + << " for path: '" << inPath << "'" << endl; exit(1); } - (void)close(fd); } #else // windows TODO: improve void HS_CDECL loadExpressions(const string &inPath, ExpressionMap &exprMap) { From 43204dda4885eec3da1e0e10fdf58714a201305b Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Tue, 5 Nov 2019 21:19:38 +0800 Subject: [PATCH 16/25] AVX512VBMI Teddy. 
--- cmake/arch.cmake | 12 ++ src/fdr/teddy.c | 305 ++++++++++++++++++++++++++++++++- src/fdr/teddy_runtime_common.h | 8 +- src/util/arch.h | 6 +- src/util/simd_utils.h | 16 +- 5 files changed, 341 insertions(+), 6 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 5be258aa9..cced49c69 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -58,6 +58,18 @@ int main(){ (void)_mm512_abs_epi8(z); }" HAVE_AVX512) +# and now for AVX512VBMI +CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +#if !defined(__AVX512VBMI__) +#error no avx512vbmi +#endif + +int main(){ + __m512i a = _mm512_set1_epi8(0xFF); + __m512i idx = _mm512_set_epi64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + (void)_mm512_permutexvar_epi8(idx, a); +}" HAVE_AVX512VBMI) + if (FAT_RUNTIME) if (NOT HAVE_SSSE3) message(FATAL_ERROR "SSSE3 support required to build fat runtime") diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 0b3fe28f0..960e2a415 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -74,6 +74,30 @@ const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} }; +#if defined(HAVE_AVX512VBMI) // VBMI strong teddy + +#define CONF_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#else + #define CONF_CHUNK_64(chunk, bucket, off, reason, conf_fn) \ do { \ if (unlikely(chunk != ones_u64a)) { \ @@ -94,7 +118,284 @@ do { \ } \ } while(0) -#if defined(HAVE_AVX512) // AVX512 reinforced teddy +#endif + +#if defined(HAVE_AVX512VBMI) // VBMI strong teddy + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u64a part1 = movq(p128_0); \ + u64a part2 = movq(rshiftbyte_m128(p128_0, 8)); \ + u64a part3 = movq(p128_1); \ + u64a part4 = movq(rshiftbyte_m128(p128_1, 8)); \ + u64a part5 = movq(p128_2); \ + u64a part6 = movq(rshiftbyte_m128(p128_2, 8)); \ + u64a part7 = movq(p128_3); \ + u64a part8 = movq(rshiftbyte_m128(p128_3, 8)); \ + CONF_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_CHUNK_64(part2, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_CHUNK_64(part3, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_CHUNK_64(part4, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_CHUNK_64(part5, bucket, offset + 32, reason, pt, conf_fn); \ + CONF_CHUNK_64(part6, bucket, offset + 40, reason, pt, conf_fn); \ + CONF_CHUNK_64(part7, bucket, offset + 48, reason, pt, conf_fn); \ + CONF_CHUNK_64(part8, bucket, offset + 56, reason, pt, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, 
reason, pt, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u32 part1 = movd(p128_0); \ + u32 part2 = movd(rshiftbyte_m128(p128_0, 4)); \ + u32 part3 = movd(rshiftbyte_m128(p128_0, 8)); \ + u32 part4 = movd(rshiftbyte_m128(p128_0, 12)); \ + u32 part5 = movd(p128_1); \ + u32 part6 = movd(rshiftbyte_m128(p128_1, 4)); \ + u32 part7 = movd(rshiftbyte_m128(p128_1, 8)); \ + u32 part8 = movd(rshiftbyte_m128(p128_1, 12)); \ + u32 part9 = movd(p128_2); \ + u32 part10 = movd(rshiftbyte_m128(p128_2, 4)); \ + u32 part11 = movd(rshiftbyte_m128(p128_2, 8)); \ + u32 part12 = movd(rshiftbyte_m128(p128_2, 12)); \ + u32 part13 = movd(p128_3); \ + u32 part14 = movd(rshiftbyte_m128(p128_3, 4)); \ + u32 part15 = movd(rshiftbyte_m128(p128_3, 8)); \ + u32 part16 = movd(rshiftbyte_m128(p128_3, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_CHUNK_32(part5, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_CHUNK_32(part6, bucket, offset + 20, reason, pt, conf_fn); \ + CONF_CHUNK_32(part7, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_CHUNK_32(part8, bucket, offset + 28, reason, pt, conf_fn); \ + CONF_CHUNK_32(part9, bucket, offset + 32, reason, pt, conf_fn); \ + CONF_CHUNK_32(part10, bucket, offset + 36, reason, pt, conf_fn); \ + CONF_CHUNK_32(part11, bucket, offset + 40, reason, pt, conf_fn); \ + CONF_CHUNK_32(part12, bucket, offset + 44, reason, pt, conf_fn); \ + CONF_CHUNK_32(part13, bucket, offset + 48, reason, pt, conf_fn); \ + CONF_CHUNK_32(part14, bucket, offset + 52, reason, pt, conf_fn); \ + CONF_CHUNK_32(part15, bucket, offset + 56, reason, pt, conf_fn); \ + CONF_CHUNK_32(part16, bucket, offset + 60, reason, pt, conf_fn); \ + } \ +} while(0) +#endif + +#define PREP_SHUF_MASK \ + m512 lo = and512(val, *lo_mask); \ + m512 hi = and512(rshift64_m512(val, 4), *lo_mask) + +#define TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo), \ + pshufb_m512(dup_mask[1], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M2 \ + TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M3 \ + TEDDY_VBMI_PSHUFB_OR_M2 \ + m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M4 \ + TEDDY_VBMI_PSHUFB_OR_M3 \ + m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)); + +#define TEDDY_VBMI_SL1_MASK 0xfffffffffffffffeULL +#define TEDDY_VBMI_SL2_MASK 0xfffffffffffffffcULL +#define TEDDY_VBMI_SL3_MASK 0xfffffffffffffff8ULL + +#define TEDDY_VBMI_SHIFT_M1 + +#define TEDDY_VBMI_SHIFT_M2 \ + TEDDY_VBMI_SHIFT_M1 \ + m512 sl1 = maskz_vpermb512(TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1); + +#define TEDDY_VBMI_SHIFT_M3 \ + TEDDY_VBMI_SHIFT_M2 \ + m512 sl2 = maskz_vpermb512(TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2); + +#define TEDDY_VBMI_SHIFT_M4 \ + TEDDY_VBMI_SHIFT_M3 \ + m512 sl3 = maskz_vpermb512(TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3); + +#define SHIFT_OR_M1 \ + shuf_or_b0 + +#define SHIFT_OR_M2 \ + or512(sl1, SHIFT_OR_M1) + +#define SHIFT_OR_M3 \ + or512(sl2, SHIFT_OR_M2) + +#define 
SHIFT_OR_M4 \ + or512(sl3, SHIFT_OR_M3) + +static really_inline +m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask, + UNUSED const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M1; + TEDDY_VBMI_SHIFT_M1; + return SHIFT_OR_M1; +} + +static really_inline +m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M2; + TEDDY_VBMI_SHIFT_M2; + return SHIFT_OR_M2; +} + +static really_inline +m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M3; + TEDDY_VBMI_SHIFT_M3; + return SHIFT_OR_M3; +} + +static really_inline +m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M4; + TEDDY_VBMI_SHIFT_M4; + return SHIFT_OR_M4; +} + +#define PREP_CONF_FN(val, n) \ + prep_conf_teddy_m##n(&lo_mask, dup_mask, sl_msk, val) + +const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f +}; + +#define TEDDY_VBMI_SL1_POS 15 +#define TEDDY_VBMI_SL2_POS 14 +#define TEDDY_VBMI_SL3_POS 13 + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M1 + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M1 \ + sl_msk[0] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL1_POS); + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + sl_msk[1] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL2_POS); + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M4 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS); + +#define PREPARE_MASKS_1 \ + dup_mask[0] = set4x128(maskBase[0]); \ + dup_mask[1] = set4x128(maskBase[1]); + +#define PREPARE_MASKS_2 \ + PREPARE_MASKS_1 \ + dup_mask[2] = set4x128(maskBase[2]); \ + dup_mask[3] = set4x128(maskBase[3]); + +#define PREPARE_MASKS_3 \ + PREPARE_MASKS_2 \ + dup_mask[4] = set4x128(maskBase[4]); \ + dup_mask[5] = set4x128(maskBase[5]); + +#define PREPARE_MASKS_4 \ + PREPARE_MASKS_3 \ + dup_mask[6] = set4x128(maskBase[6]); \ + dup_mask[7] = set4x128(maskBase[7]); + +#define PREPARE_MASKS(n) \ + m512 lo_mask = set64x8(0xf); \ + m512 dup_mask[n * 2]; \ + m512 sl_msk[n - 1]; \ + PREPARE_MASKS_##n \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M##n + +#define TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffffffffffULL >> n_sh) +#define TEDDY_VBMI_CONF_MASK_FULL (0xffffffffffffffffULL << n_sh) +#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap) +#define TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffffffffffULL >> (64 - n_sh)) + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 64; \ + u32 n_sh = n_msk - 1; \ + const size_t loopBytes = 64 - n_sh; \ + DEBUG_PRINTF("params: 
buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + PREPARE_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + u64a k = TEDDY_VBMI_CONF_MASK_FULL; \ + m512 p_mask = set_mask_m512(~k); \ + u32 overlap = 0; \ + u64a patch = 0; \ + if (likely(ptr + loopBytes <= buf_end)) { \ + m512 p_mask0 = set_mask_m512(~TEDDY_VBMI_CONF_MASK_HEAD); \ + m512 r_0 = PREP_CONF_FN(loadu512(ptr), n_msk); \ + r_0 = or512(r_0, p_mask0); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, ptr, conf_fn); \ + ptr += loopBytes; \ + overlap = n_sh; \ + patch = TEDDY_VBMI_LOAD_MASK_PATCH; \ + } \ + \ + for (; ptr + loopBytes <= buf_end; ptr += loopBytes) { \ + __builtin_prefetch(ptr - n_sh + (64 * 2)); \ + CHECK_FLOOD; \ + m512 r_0 = PREP_CONF_FN(loadu512(ptr - n_sh), n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn); \ + } \ + \ + assert(ptr + loopBytes > buf_end); \ + if (ptr < buf_end) { \ + u32 left = (u32)(buf_end - ptr); \ + u64a k1 = TEDDY_VBMI_CONF_MASK_VAR(left); \ + m512 p_mask1 = set_mask_m512(~k1); \ + m512 val_0 = loadu_maskz_m512(k1 | patch, ptr - overlap); \ + m512 r_0 = PREP_CONF_FN(val_0, n_msk); \ + r_0 = or512(r_0, p_mask1); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, ptr - overlap, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#elif defined(HAVE_AVX512) // AVX512 reinforced teddy #ifdef ARCH_64_BIT #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h index 1dbeb0979..730850cb7 100644 --- a/src/fdr/teddy_runtime_common.h +++ b/src/fdr/teddy_runtime_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -383,12 +383,16 @@ m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset, static really_inline u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte, - CautionReason reason) { + UNUSED CautionReason reason) { u64a confVal = 0; const u8 *buf = a->buf; size_t len = a->len; const u8 *confirm_loc = ptr + byte - 7; +#if defined(HAVE_AVX512VBMI) + if (likely(confirm_loc >= buf)) { +#else if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { +#endif confVal = lv_u64a(confirm_loc, buf, buf + len); } else { // r == VECTORING, confirm_loc < buf u64a histBytes = a->histBytes; diff --git a/src/util/arch.h b/src/util/arch.h index c78ee9ced..985fec6ac 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -57,6 +57,10 @@ #define HAVE_AVX512 #endif +#if defined(__AVX512VBMI__) +#define HAVE_AVX512VBMI +#endif + /* * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros */ diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index c1449711b..42223133d 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following 
conditions are met: @@ -150,6 +150,14 @@ static really_inline u32 movd(const m128 in) { return _mm_cvtsi128_si32(in); } +#if defined(HAVE_AVX512) +static really_inline u32 movd512(const m512 in) { + // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), + // so we use 2-step convertions to work around. + return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); +} +#endif + static really_inline u64a movq(const m128 in) { #if defined(ARCH_X86_64) return _mm_cvtsi128_si64(in); @@ -318,6 +326,12 @@ static really_inline m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { return _mm512_maskz_shuffle_epi8(k, a, b); } + +#if defined(HAVE_AVX512VBMI) +#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) +#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) +#endif + #endif static really_inline From e665e959a0fff4e5a9112b0fb3ec0c05f27edf10 Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Tue, 14 Apr 2020 15:56:31 +0800 Subject: [PATCH 17/25] Revert to AVX2 Fat Teddy instead of AVX512 reinforced Fat Teddy. --- src/fdr/teddy_avx2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 56ec739f1..20ea938cf 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -134,7 +134,7 @@ const m256 *getMaskBase_fat(const struct Teddy *teddy) { return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); } -#if defined(HAVE_AVX512) +#if defined(HAVE_AVX512_REVERT) // revert to AVX2 Fat Teddy static really_inline const u64a *getReinforcedMaskBase_fat(const struct Teddy *teddy, u8 numMask) { From 3ff54f68e4a57c156eadb2d2d351ba64e01e6c13 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Mon, 20 Apr 2020 13:05:37 +0000 Subject: [PATCH 18/25] add Hyperscan version marcro in public header --- src/hs.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/hs.h b/src/hs.h index 38215de6c..105919fb8 100644 --- a/src/hs.h +++ b/src/hs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,6 +39,12 @@ * the individual component headers for documentation. */ +/* The current Hyperscan version information. 
*/ + +#define HS_MAJOR 5 +#define HS_MINOR 3 +#define HS_PATCH 0 + #include "hs_compile.h" #include "hs_runtime.h" From ec68facfaa26c926a2a3a4fef6a04600a40c4a18 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Tue, 28 Apr 2020 10:14:55 +0000 Subject: [PATCH 19/25] hsbench: add hyphen support for -T option --- tools/hsbench/main.cpp | 9 ++++++--- util/string_util.h | 18 ++++++++++++++---- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp index 8e85d7aea..4e65c8e0b 100644 --- a/tools/hsbench/main.cpp +++ b/tools/hsbench/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -207,7 +207,9 @@ void usage(const char *error) { printf(" -P Benchmark using PCRE (if supported).\n"); #endif #if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) || defined(_WIN32) - printf(" -T CPU,CPU,... Benchmark with threads on these CPUs.\n"); + printf(" -T CPU,CPU,... or -T CPU-CPU\n"); + printf(" Benchmark with threads on specified CPUs or CPU" + " range.\n"); #endif printf(" -i DIR Don't compile, load from files in DIR" " instead.\n"); @@ -354,7 +356,8 @@ void processArgs(int argc, char *argv[], vector &sigSets, case 'T': if (!strToList(optarg, threadCores)) { usage("Couldn't parse argument to -T flag, should be" - " a list of positive integers."); + " a list of positive integers or 2 integers" + " connected with hyphen."); exit(1); } break; diff --git a/util/string_util.h b/util/string_util.h index b44586ea7..ab3751c1f 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -54,8 +54,8 @@ inline bool fromString(const std::string &s, T& val) return true; } -// read in a comma-separated set of values: very simple impl, not for -// external consumption +// read in a comma-separated or hyphen-connected set of values: very simple +// impl, not for external consumption template inline bool strToList(const std::string &s, std::vector& out) { @@ -68,7 +68,17 @@ inline bool strToList(const std::string &s, std::vector& out) } out.push_back(val); - } while (i.get(c) && c == ','); + + i.get(c); + if (c == '-') { + T val_end; + i >> val_end; + while (val < val_end) { + out.push_back(++val); + } + break; + } + } while (c == ','); return !out.empty(); } From 88a18dcf980e9c8496041a6010fc25f6548188ae Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Tue, 28 Apr 2020 10:15:40 +0000 Subject: [PATCH 20/25] add AVX512 support for vermicelli model --- src/nfa/vermicelli.h | 179 ++++++++++---- src/nfa/vermicelli_sse.h | 498 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 634 insertions(+), 43 deletions(-) diff --git a/src/nfa/vermicelli.h b/src/nfa/vermicelli.h index 817e681a8..ed797d83f 100644 --- a/src/nfa/vermicelli.h +++ b/src/nfa/vermicelli.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,7 +46,20 @@ const u8 *vermicelliExec(char c, char nocase, const u8 *buf, nocase ? 
"nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? vermMiniNocase(chars, buf, buf_end, 0) + : vermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf_end; + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (; buf < buf_end; buf++) { char cur = (char)*buf; @@ -59,8 +72,8 @@ const u8 *vermicelliExec(char c, char nocase, const u8 *buf, } return buf; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an @@ -99,7 +112,20 @@ const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? vermMiniNocase(chars, buf, buf_end, 1) + : vermMini(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + return buf_end; + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (; buf < buf_end; buf++) { char cur = (char)*buf; @@ -112,8 +138,8 @@ const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, } return buf; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ size_t min = (size_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an @@ -149,12 +175,32 @@ const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); assert(buf < buf_end); - assert((buf_end - buf) >= VERM_BOUNDARY); - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? dvermMiniNocase(chars1, chars2, buf, buf_end) + : dvermMini(chars1, chars2, buf, buf_end); + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + u8 mask = nocase ? CASE_CLEAR : 0xff; + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf forward to the next aligned address. 
@@ -205,14 +251,32 @@ const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) " "over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf)); assert(buf < buf_end); - assert((buf_end - buf) >= VERM_BOUNDARY); - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; VERM_TYPE chars1 = VERM_SET_FN(c1); VERM_TYPE chars2 = VERM_SET_FN(c2); VERM_TYPE mask1 = VERM_SET_FN(m1); VERM_TYPE mask2 = VERM_SET_FN(m2); +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = dvermMiniMasked(chars1, chars2, mask1, mask2, buf, + buf_end); + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf forward to the next aligned address. @@ -244,6 +308,7 @@ const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, /* check for partial match at end */ if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); return buf_end - 1; } @@ -259,7 +324,20 @@ const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rvermMiniNocase(chars, buf, buf_end, 0) + : rvermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf - 1; + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (buf_end--; buf_end >= buf; buf_end--) { char cur = (char)*buf_end; @@ -272,26 +350,22 @@ const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, } return buf_end; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf backward to the next aligned address. // There's some small overlap here, but we don't mind scanning it twice // if we can do it quickly, do we? - if (nocase) { - const u8 *ptr = - rvermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0); - if (ptr) { - return ptr; - } - } else { - const u8 *ptr = rvermUnalign(chars, buf_end - VERM_BOUNDARY, 0); - if (ptr) { - return ptr; - } + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 0) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 0); + + if (ptr) { + return ptr; } buf_end -= min; @@ -322,7 +396,20 @@ const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? 
rvermMiniNocase(chars, buf, buf_end, 1) + : rvermMini(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + return buf - 1; + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (buf_end--; buf_end >= buf; buf_end--) { char cur = (char)*buf_end; @@ -335,26 +422,22 @@ const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, } return buf_end; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf backward to the next aligned address. // There's some small overlap here, but we don't mind scanning it twice // if we can do it quickly, do we? - if (nocase) { - const u8 *ptr = - rvermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1); - if (ptr) { - return ptr; - } - } else { - const u8 *ptr = rvermUnalign(chars, buf_end - VERM_BOUNDARY, 1); - if (ptr) { - return ptr; - } + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 1) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 1); + + if (ptr) { + return ptr; } buf_end -= min; @@ -383,24 +466,36 @@ const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); assert(buf < buf_end); - assert((buf_end - buf) >= VERM_BOUNDARY); - size_t min = (size_t)buf_end % VERM_BOUNDARY; VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rdvermMiniNocase(chars1, chars2, buf, buf_end) + : rdvermMini(chars1, chars2, buf, buf_end); + + if (ptr) { + return ptr; + } + + // check for partial match at end ??? + return buf - 1; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + size_t min = (size_t)buf_end % VERM_BOUNDARY; if (min) { // input not aligned, so we need to run one iteration with an unaligned // load, then skip buf forward to the next aligned address. There's // some small overlap here, but we don't mind scanning it twice if we // can do it quickly, do we? - const u8 *ptr; - if (nocase) { - ptr = rdvermPreconditionNocase(chars1, chars2, - buf_end - VERM_BOUNDARY); - } else { - ptr = rdvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); - } + const u8 *ptr = nocase ? 
rdvermPreconditionNocase(chars1, chars2, + buf_end - VERM_BOUNDARY) + : rdvermPrecondition(chars1, chars2, + buf_end - VERM_BOUNDARY); if (ptr) { return ptr; diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h index 0749470f5..3307486cf 100644 --- a/src/nfa/vermicelli_sse.h +++ b/src/nfa/vermicelli_sse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,6 +32,8 @@ * (users should include vermicelli.h) */ +#if !defined(HAVE_AVX512) + #define VERM_BOUNDARY 16 #define VERM_TYPE m128 #define VERM_SET_FN set16x8 @@ -391,3 +393,497 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { return NULL; } + +#else // HAVE_AVX512 + +#define VERM_BOUNDARY 64 +#define VERM_TYPE m512 +#define VERM_SET_FN set64x8 + +static really_inline +const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars, data); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *vermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars, v); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *vermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf % 64 == 0); + for (; buf + 63 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + return NULL; +} + +static really_inline +const u8 *vermSearchAlignedNocase(m512 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 63 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalign(m512 chars, const u8 *buf, char negate) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalignNocase(m512 chars, const u8 *buf, char negate) { + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *dvermMini(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = 
eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermMiniMasked(m512 chars1, m512 chars2, m512 mask1, m512 mask2, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + if (buf[63] == c1 && buf[64] == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + if ((buf[63] & CASE_CLEAR) == c1 && (buf[64] & CASE_CLEAR) == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedMasked(m512 chars1, m512 chars2, + m512 mask1, m512 mask2, u8 c1, u8 c2, u8 m1, + u8 m2, const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 64 == 0); + + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + if ((buf[63] & m1) == c1 && (buf[64] & m2) == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + m512 v = and512(casemask, data); + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + + /* no fixup of the boundary required - the aligned run 
will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionMasked(m512 chars1, m512 chars2, + m512 mask1, m512 mask2, const u8 *buf) { + m512 data = loadu512(buf); // unaligned + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *lastMatchOffset(const u8 *buf_end, u64a z) { + assert(z); + return buf_end - 64 + 63 - clz64(z); +} + +static really_inline +const u8 *rvermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars, data); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rvermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars, v); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf_end % 64 == 0); + for (; buf + 63 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAlignedNocase(m512 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf_end % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 63 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalign(m512 chars, const u8 *buf, char negate) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalignNocase(m512 chars, const u8 *buf, char negate) { + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermMini(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + 
+static really_inline +const u8 *rdvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 64 == 0); + + for (; buf + 64 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + if (buf_end[-65] == c1 && buf_end[-64] == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +static really_inline +const u8 *rdvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 64 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + if ((buf_end[-65] & CASE_CLEAR) == c1 + && (buf_end[-64] & CASE_CLEAR) == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { + m512 data = loadu512(buf); + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + + // no fixup of the boundary required - the aligned run will pick it up + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { + // due to laziness, nonalphas and nocase having interesting behaviour + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + // no fixup of the boundary required - the aligned run will pick it up + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + + return NULL; +} + +#endif // HAVE_AVX512 From 765b8f9fb94dc831f3190222594ce04a486ded80 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Wed, 13 May 2020 16:37:57 +0000 Subject: [PATCH 21/25] literal API: remove HS_FLAG_MULTILINE flag support fixes github issue #237 --- doc/dev-reference/compilation.rst | 4 ++-- src/compiler/compiler.cpp | 11 +++++------ src/hs_compile.h | 4 +--- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst index e78552b40..205b7348b 100644 --- a/doc/dev-reference/compilation.rst +++ b/doc/dev-reference/compilation.rst @@ -110,8 +110,8 @@ Hyperscan needs to locate the end position of the input expression via clearly knowing each literal's length, not by simply identifying character ``\0`` of a string. -Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_MULTILINE`, -:c:member:`HS_FLAG_SINGLEMATCH`, :c:member:`HS_FLAG_SOM_LEFTMOST`. +Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_SINGLEMATCH`, +:c:member:`HS_FLAG_SOM_LEFTMOST`. .. 
note:: We don't support literal compilation API with :ref:`extparam`. And for runtime implementation, traditional runtime APIs can still be diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index de6909e76..666eefc9c 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -125,7 +125,7 @@ ParsedLitExpression::ParsedLitExpression(unsigned index_in, : expr(index_in, false, flags & HS_FLAG_SINGLEMATCH, false, false, SOM_NONE, report, 0, MAX_OFFSET, 0, 0, 0, false) { // For pure literal expression, below 'HS_FLAG_'s are unuseful: - // DOTALL/ALLOWEMPTY/UTF8/UCP/PREFILTER/COMBINATION/QUIET + // DOTALL/ALLOWEMPTY/UTF8/UCP/PREFILTER/COMBINATION/QUIET/MULTILINE if (flags & ~HS_FLAG_ALL) { DEBUG_PRINTF("Unrecognised flag, flags=%u.\n", flags); @@ -409,12 +409,11 @@ void addLitExpression(NG &ng, unsigned index, const char *expression, // filter out flags not supported by pure literal API. u64a not_supported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 | HS_FLAG_UCP | HS_FLAG_PREFILTER | HS_FLAG_COMBINATION | - HS_FLAG_QUIET; + HS_FLAG_QUIET | HS_FLAG_MULTILINE; if (flags & not_supported) { - throw CompileError("Only HS_FLAG_CASELESS, HS_FLAG_MULTILINE, " - "HS_FLAG_SINGLEMATCH and HS_FLAG_SOM_LEFTMOST are " - "supported in literal API."); + throw CompileError("Only HS_FLAG_CASELESS, HS_FLAG_SINGLEMATCH and " + "HS_FLAG_SOM_LEFTMOST are supported in literal API."); } // This expression must be a pure literal, we can build ue2_literal diff --git a/src/hs_compile.h b/src/hs_compile.h index 8f48a7925..081d46387 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -563,7 +563,6 @@ hs_error_t HS_CDECL hs_compile_ext_multi(const char *const *expressions, * be used by ORing them together. Compared to @ref hs_compile(), fewer * valid values are provided: * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. - * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the * expression per stream. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset @@ -637,7 +636,6 @@ hs_error_t HS_CDECL hs_compile_lit(const char *expression, unsigned flags, * in place of an array will set the flags value for all patterns to zero. * Compared to @ref hs_compile_multi(), fewer valid values are provided: * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. - * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the * expression per stream. 
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset From 4d33736a5cdffb3f2c760d4c77e319b4fdf0f3fb Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Mon, 18 May 2020 20:08:20 +0000 Subject: [PATCH 22/25] gcc-10: fix hyperscan compile issue Fixes github issue #239 --- CMakeLists.txt | 6 ++++++ unit/internal/graph_undirected.cpp | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff7a3984c..ef27bc800 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -418,6 +418,12 @@ CHECK_CXX_COMPILER_FLAG("-Wunused-local-typedefs" CXX_UNUSED_LOCAL_TYPEDEFS) # gcc5 complains about this CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE) +# gcc 10 complains about this +CHECK_C_COMPILER_FLAG("-Wstringop-overflow" CC_STRINGOP_OVERFLOW) +if(CC_STRINGOP_OVERFLOW) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-stringop-overflow") +endif() + endif() include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) diff --git a/unit/internal/graph_undirected.cpp b/unit/internal/graph_undirected.cpp index babc01a6a..73d3e3570 100644 --- a/unit/internal/graph_undirected.cpp +++ b/unit/internal/graph_undirected.cpp @@ -40,12 +40,12 @@ using namespace std; using namespace ue2; struct SimpleV { - size_t index; + size_t index = 0; string test_v = "SimpleV"; }; struct SimpleE { - size_t index; + size_t index = 0; string test_e = "SimpleE"; }; From f1db5b71d87f1478abba452c4e3908730bbf830c Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Mon, 25 May 2020 14:22:18 +0000 Subject: [PATCH 23/25] tools/fuzz: add test scripts for synthetic pattern generation. --- tools/fuzz/aristocrats.py | 45 +++++++ tools/fuzz/completocrats.py | 39 ++++++ tools/fuzz/heuristocrats.py | 259 ++++++++++++++++++++++++++++++++++++ tools/fuzz/limited_dict.txt | 9 ++ 4 files changed, 352 insertions(+) create mode 100755 tools/fuzz/aristocrats.py create mode 100755 tools/fuzz/completocrats.py create mode 100755 tools/fuzz/heuristocrats.py create mode 100644 tools/fuzz/limited_dict.txt diff --git a/tools/fuzz/aristocrats.py b/tools/fuzz/aristocrats.py new file mode 100755 index 000000000..7b6ff2bf3 --- /dev/null +++ b/tools/fuzz/aristocrats.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python + +from random import choice,randint +from optparse import OptionParser + +def generateRandomOptions(): + if options.hybrid: + allflags = "smiH8W" + else: + # Maintain an ordering for consistency. 
+ allflags = "smiHV8WLP" + flags = "" + for f in allflags: + flags += choice(['', f]) + return flags + +parser = OptionParser() +parser.add_option("-d", "--depth", + action="store", type="int", dest="depth", default=200, + help="Depth of generation (akin to maximum length)") +parser.add_option("-c", "--count", + action="store", type="int", dest="count", default=1000, + help="Number of expressions to generate") +parser.add_option("-f", "--full", + action="store_true", dest="full", default=False, + help="Use a full character set including unprintables") +parser.add_option("-H", "--hybrid", + action="store_true", dest="hybrid", + help="Generate random flags for hybrid mode") + +(options, args) = parser.parse_args() +if len(args) != 0: + parser.error("incorrect number of arguments") + +if (options.full): + crange = range(0,256) + crange.remove(ord('\n')) +else: + crange = range(32, 127) + +for i in xrange(0, options.count): + len = randint(1, options.depth) + s = [ chr(choice(crange)) for x in xrange(len) ] + line = str(i) + ":/" + "".join(s) + "/" + generateRandomOptions() + print line diff --git a/tools/fuzz/completocrats.py b/tools/fuzz/completocrats.py new file mode 100755 index 000000000..60ac4d7ef --- /dev/null +++ b/tools/fuzz/completocrats.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +from itertools import * +from optparse import OptionParser + +LIMITED_ALPHABET = "abc[](){}*?+^$|:=.\\-" + +parser = OptionParser() +parser.add_option("-d", "--depth", + action="store", type="int", dest="depth", default=200, + help="Depth of generation (akin to maximum length)") + +parser.add_option("-f", "--full", + action="store_true", dest="full", default=False, + help="Use a full character set including unprintables") + +parser.add_option("-l", "--limited", + action="store_true", dest="limited", default=False, + help="Use a very limited character set: just " + LIMITED_ALPHABET) + +(options, args) = parser.parse_args() +if len(args) != 0: + parser.error("incorrect number of arguments") + +if (options.full): + crange = range(0,256) + crange.remove(ord('\n')) +elif (options.limited): + crange = [ ord(c) for c in LIMITED_ALPHABET ] +else: + crange = range(32, 127) + +srange = [ chr(c) for c in crange ] + +i = 0 +for x in product(srange, repeat = options.depth): + line = str(i) + ":/" + "".join(x) + "/" + print line + i += 1 diff --git a/tools/fuzz/heuristocrats.py b/tools/fuzz/heuristocrats.py new file mode 100755 index 000000000..49c7acb43 --- /dev/null +++ b/tools/fuzz/heuristocrats.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python + +from optparse import OptionParser +from random import * +import string +import sys + +# return a random non-degenerate (ie not [10]) partition of nChildren +def chooseLeafWidth(nChildren): + width = randint(1, 5) + width = min(width, nChildren-1) + s = sample(range(1, nChildren), width) + s.sort() + s = [0] + s + [nChildren] + v = [ s[i+1] - s[i] for i in range(0, len(s)-1) if s[i+1] != s[i] ] + return v + +def generateConcat(nChildren, atTopIgnored): + v = [ generateRE(w, atTop = False) for w in chooseLeafWidth(nChildren) ] + v = [ r for r in v if r != '' ] + return string.join(v, "") + +def makeGroup(s): + # Parenthesise either in normal parens or a non-capturing group. 
+ if randint(0, 1) == 0: + return "(" + s + ")" + else: + return "(?:" + s + ")" + +def generateAlt(nChildren, atTop): + v = [ generateRE(w, [generateAlt], atTop) for w in chooseLeafWidth(nChildren) ] + v = [ r for r in v if r != '' ] + s = string.join(v, "|") + if len(v) == 1: + return s + else: + return makeGroup(s) + +def generateQuant(nChildren, atTopIgnored): + lo = int(round(expovariate(0.2))) + hi = lo + int(round(expovariate(0.2))) + q = choice(["*", "?", "+", "{%d}"%lo, "{%d,}"%lo, "{%d,%d}"%(lo,hi)]) + r = generateRE(nChildren, [generateQuant], atTop = False) + if (len(r) == 1) or (r[0] != '(' and r[-1] != ")"): + return r + q + else: + return makeGroup(r) + q + +def generateChar(nChildren, atTop = False): + return chr(choice(alphabet)) + +def generateNocaseChar(nChildren, atTop = False): + 'Either generate an uppercase char from the alphabet or a nocase class [Aa]' + c = generateChar(nChildren, atTop) + if random() < 0.5: + return c.upper() + else: + return '[' + c.upper() + c.lower() + ']' + +def generateDot(nChildren, atTop = False): + return "." + +def generateBoundary(nChildren, atTop = False): + # \b, \B in parens so that we can repeat them and still be accepted by + # libpcre + return makeGroup('\\' + choice('bB')) + +def generateCharClass(nChildren, atTop = False): + s = "" + if random() < 0.2: + s = "^" + nChars = randint(1,4) + else: + nChars = randint(2,4) + + for i in xrange(nChars): + s += generateChar(1) + return "[" + s + "]" + +def generateOptionsFlags(nChildren, atTop = False): + allflags = "smix" + pos_flags = sample(allflags, randint(1, len(allflags))) + neg_flags = sample(allflags, randint(1, len(allflags))) + s = '(?' + ''.join(pos_flags) + '-' + ''.join(neg_flags) + ')' + return s + +def generateLogicalId(nChildren, atTop = False): + return str(randint(0, options.count)) + +def makeLogicalGroup(s): + return "(" + s + ")" + +def generateLogicalNot(nChildren, atTop): + r = generateCombination(nChildren, [generateLogicalNot], atTop = False) + return "!" 
+ makeLogicalGroup(r) + +def generateLogicalAnd(nChildren, atTop): + v = [ generateCombination(w, [generateLogicalAnd], atTop = False) for w in chooseLeafWidth(nChildren) ] + v = [ r for r in v if r != '' ] + s = string.join(v, "&") + if len(v) == 1: + return s + else: + return makeLogicalGroup(s) + +def generateLogicalOr(nChildren, atTop): + v = [ generateCombination(w, [generateLogicalOr], atTop = False) for w in chooseLeafWidth(nChildren) ] + v = [ r for r in v if r != '' ] + s = string.join(v, "|") + if len(v) == 1: + return s + else: + return makeLogicalGroup(s) + +weightsTree = [ + (generateConcat, 10), + (generateAlt, 3), + (generateQuant, 2), + ] + +weightsLeaf = [ + (generateChar, 30), + (generateCharClass, 5), + (generateDot, 5), + (generateNocaseChar, 2), + (generateBoundary, 1), + (generateOptionsFlags, 1) + ] + +weightsLogicalTree = [ + (generateLogicalNot, 1), + (generateLogicalAnd, 5), + (generateLogicalOr, 5), + ] + +weightsLogicalLeaf = [ + (generateLogicalId, 1), + ] + +def genChoices(weighted): + r = [] + for (f, w) in weighted: + r = r + [f] * w + return r + +choicesTree = genChoices(weightsTree) +choicesLeaf = genChoices(weightsLeaf) +choicesLogicalTree = genChoices(weightsLogicalTree) +choicesLogicalLeaf = genChoices(weightsLogicalLeaf) + +weightsAnchor = [ + ("\\A%s\\Z", 1), + ("\\A%s\\z", 1), + ("\\A%s", 4), + ("%s\\Z", 2), + ("%s\\z", 2), + ("^%s$", 1), + ("^%s", 4), + ("%s$", 2), + ("%s", 25) + ] +choicesAnchor = genChoices(weightsAnchor) + +def generateRE(nChildren, suppressList = [], atTop = False): + if atTop: + anchorSubstituteString = choice(choicesAnchor) + else: + anchorSubstituteString = "%s" + + nChildren -= 1 + if nChildren == 0: + res = choice(choicesLeaf)(nChildren, atTop) + else: + c = [ ch for ch in choicesTree if ch not in suppressList ] + res = choice(c)(nChildren, atTop) + + return anchorSubstituteString % res + +def generateCombination(nChildren, suppressList = [], atTop = False): + nChildren -= 1 + if nChildren == 0: + res = choice(choicesLogicalLeaf)(nChildren, atTop) + else: + c = [ ch for ch in choicesLogicalTree if ch not in suppressList ] + res = choice(c)(nChildren, atTop) + + return res + +def generateRandomOptions(): + if options.hybrid: + allflags = "smiH8W" + else: + # Maintain an ordering for consistency. 
+ allflags = "smiHV8WLP" + flags = "" + for f in allflags: + flags += choice(['', f]) + if options.logical: + flags += choice(['', 'Q']) + return flags + +def generateRandomExtParam(depth, extparam): + if not extparam: + return "" + params = [] + if choice((False, True)): + params.append("min_length=%u" % randint(1, depth)) + if choice((False, True)): + params.append("min_offset=%u" % randint(1, depth)) + if choice((False, True)): + params.append("max_offset=%u" % randint(1, depth*3)) + if choice((False, True)): + dist = randint(1, 3) + if choice((False, True)): + params.append("edit_distance=%u" % dist) + else: + params.append("hamming_distance=%u" % dist) + if params: + return "{" + ",".join(params) + "}" + else: + return "" + +parser = OptionParser() +parser.add_option("-d", "--depth", + action="store", type="int", dest="depth", default=200, + help="Depth of generation (akin to maximum length)") +parser.add_option("-c", "--count", + action="store", type="int", dest="count", default=1000, + help="Number of expressions to generate") +parser.add_option("-a", "--alphabet", + action="store", type="int", dest="alphabet", default=26, + help="Size of alphabet to generate character expressions over (starting with lowercase 'a')") +parser.add_option("-i", "--nocase", + action="store_true", dest="nocase", + help="Use a caseless alphabet for character generation") +parser.add_option("-x", "--extparam", + action="store_true", dest="extparam", + help="Generate random extended parameters") +parser.add_option("-l", "--logical", + action="store_true", dest="logical", + help="Generate logical combination expressions") +parser.add_option("-H", "--hybrid", + action="store_true", dest="hybrid", + help="Generate random flags for hybrid mode") + +(options, args) = parser.parse_args() +if len(args) != 0: + parser.error("incorrect number of arguments") + +alphabet = range(ord('a'), ord('a') + options.alphabet) +if options.nocase: + alphabet += range(ord('A'), ord('A') + options.alphabet) + +for i in xrange(0, options.count): + print "%08d:/%s/%s%s" % (i, generateRE(randint(1, options.depth), atTop = True), generateRandomOptions(), generateRandomExtParam(options.depth, options.extparam)) + +if options.logical: + for i in xrange(options.count, options.count + 3000): + print "%08d:/%s/C" % (i, generateCombination(randint(1, options.depth), atTop = True)) diff --git a/tools/fuzz/limited_dict.txt b/tools/fuzz/limited_dict.txt new file mode 100644 index 000000000..7c3daf4ba --- /dev/null +++ b/tools/fuzz/limited_dict.txt @@ -0,0 +1,9 @@ +hatstand +teakettle +badgerbrush +mnemosyne +rapscallion +acerbic +blackhat +rufous +echolalia From 016457d5552b30a6b72541b61addc3c186742097 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Tue, 19 May 2020 09:49:19 +0000 Subject: [PATCH 24/25] changelog: updates for 5.3.0 release --- CHANGELOG.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea44debe9..19a92b909 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,29 @@ This is a list of notable changes to Hyperscan, in reverse chronological order. +## [5.3.0] 2020-05-15 +- Improvement on literal matcher "Teddy" performance, including support for + Intel(R) AVX-512 Vector Byte Manipulation Instructions (Intel(R) AVX-512 + VBMI). +- Improvement on single-byte/two-byte matching performance, including support + for Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512). +- hsbench: add hyphen support for -T option. 
+- tools/fuzz: add test scripts for synthetic pattern generation.
+- Bugfix for acceleration path analysis in LimEx NFA.
+- Bugfix for duplicate matches for Small-write engine.
+- Bugfix for UTF8 checking problem for hscollider.
+- Bugfix for issue #205: avoid crash of `hs_compile_lit_multi()` with clang and
+  ASAN.
+- Bugfix for issue #211: fix error in `db_check_platform()` function.
+- Bugfix for issue #217: fix cmake parsing issue of CPU arch for non-English
+  locale.
+- Bugfix for issue #228: avoid undefined behavior when calling `close()` after
+  `fdopendir()` in `loadExpressions()`.
+- Bugfix for issue #239: fix hyperscan compile issue under gcc-10.
+- Add VLAN packets processing capability in pcap analysis script. (#214)
+- Avoid extra convert instruction for "Noodle". (#221)
+- Add Hyperscan version macro in `hs.h`. (#222)
+
 ## [5.2.1] 2019-10-13
 - Bugfix for issue #186: fix compile issue when `BUILD_SHARED_LIBS` is on in
   release mode.

From c758cdfb07b3569020bc91a7579e7e5e31f00c21 Mon Sep 17 00:00:00 2001
From: "Hong, Yang A"
Date: Fri, 8 May 2020 14:34:28 +0000
Subject: [PATCH 25/25] Bump version number for release

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ef27bc800..59c6e6e2f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,8 +2,8 @@ cmake_minimum_required (VERSION 2.8.11)
 project (hyperscan C CXX)
 set (HS_MAJOR_VERSION 5)
-set (HS_MINOR_VERSION 2)
-set (HS_PATCH_VERSION 1)
+set (HS_MINOR_VERSION 3)
+set (HS_PATCH_VERSION 0)
 set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
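
For reference, the pure literal API adjusted by patch 21 can be exercised with a
minimal block-mode sketch like the one below. This is an illustrative sketch
only, not part of the patch series: it assumes a Hyperscan build of 5.2 or
later that provides hs_compile_lit() and is linked as libhs, and the pattern
and corpus here are hypothetical. The literal's length is passed explicitly, so
it may contain an embedded NUL byte, and only HS_FLAG_CASELESS,
HS_FLAG_SINGLEMATCH and HS_FLAG_SOM_LEFTMOST are accepted after this change.

#include <stdio.h>
#include <hs.h>

/* Match callback: print the offsets reported for the single literal. */
static int on_match(unsigned int id, unsigned long long from,
                    unsigned long long to, unsigned int flags, void *ctx) {
    (void)flags;
    (void)ctx;
    printf("literal %u matched at [%llu, %llu)\n", id, from, to);
    return 0; /* keep scanning */
}

int main(void) {
    /* Pure literal containing an embedded NUL: the explicit length argument,
     * not a terminating '\0', delimits the pattern. */
    static const char lit[] = "foo\0bar";
    hs_database_t *db = NULL;
    hs_compile_error_t *compile_err = NULL;

    if (hs_compile_lit(lit, HS_FLAG_CASELESS | HS_FLAG_SOM_LEFTMOST,
                       sizeof(lit) - 1, HS_MODE_BLOCK, NULL, &db,
                       &compile_err) != HS_SUCCESS) {
        fprintf(stderr, "compile failed: %s\n", compile_err->message);
        hs_free_compile_error(compile_err);
        return 1;
    }

    hs_scratch_t *scratch = NULL;
    if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS) {
        hs_free_database(db);
        return 1;
    }

    /* Case-insensitive scan over a buffer that also contains a NUL byte. */
    static const char data[] = "xxFOO\0BARxx";
    hs_scan(db, data, sizeof(data) - 1, 0, scratch, on_match, NULL);

    hs_free_scratch(scratch);
    hs_free_database(db);
    return 0;
}

With patch 21 applied, passing HS_FLAG_MULTILINE to hs_compile_lit() or
hs_compile_lit_multi() is rejected with a compile error instead of being
accepted, matching the updated documentation and header comments above.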